Remove all assembler and intrinsics from decoder
This commit drops all use of assembler and intrinsics from the libFLAC decoder. This is because they are only for 32-bit x86, hard to debug, maintain and fuzz properly, and because the decoder has much greater security risks than the encoder.
This commit is contained in:
parent
a67102694d
commit
febff86af0
@ -38,9 +38,6 @@
|
||||
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
|
||||
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
|
||||
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
|
||||
cglobal FLAC__lpc_restore_signal_asm_ia32
|
||||
cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
|
||||
cglobal FLAC__lpc_restore_signal_wide_asm_ia32
|
||||
|
||||
code_section
|
||||
|
||||
@ -446,377 +443,6 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
;	unsigned i, j;
;	FLAC__int32 sum;
;
;	FLAC__ASSERT(order > 0);
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * data[i-j-1];
;		data[i] = residual[i] + (sum >> lp_quantization);
;	}
; }
;
; IA-32 cdecl.  Callee-saved ebp/ebx/esi/edi are preserved via push/pop below;
; the ";[esp + N]" offsets in the argument map are valid AFTER those four pushes.
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp				; preserve callee-saved registers
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0

.begin:
	cmp	eax, byte 1
	jg	short .x87_1more

	; ---- order == 1: data[i] = residual[i] + ((qlp_coeff[0]*data[i-1]) >> lp_q)
	mov	ecx, [esp + 28]
	mov	edx, [ecx]			; edx = qlp_coeff[0]
	mov	eax, [edi - 4]			; eax = data[i-1] (carried across iterations)
	mov	ecx, [esp + 36]			; cl = lp_quantization

	ALIGN 16
.x87_1_loop_i:
	imul	eax, edx			; eax = qlp_coeff[0] * data[i-1]
	sar	eax, cl				; eax = sum >> lp_quantization
	add	eax, [esi]			; eax += residual[i]
	mov	[edi], eax			; data[i] = eax; eax is next iteration's data[i-1]
	add	esi, byte 4
	add	edi, byte 4
	dec	ebx
	jnz	.x87_1_loop_i

	jmp	.end

.x87_1more:
	cmp	eax, byte 32			; for order <= 32 there is a faster routine
	jbe	short .x87_32

	; ---- order > 32: plain two-level loop.
	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.x87_32more_loop_i:
	xor	ebp, ebp			; ebp = sum = 0
	mov	ecx, [esp + 32]			; ecx = order
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]			; edx = &qlp_coeff[order], walked downward
	neg	ecx				; ecx = -order, counts up to 0
	ALIGN 16
.x87_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]			; eax = qlp_coeff[j]
	imul	eax, [edi + 4 * ecx]		; eax = qlp_coeff[j] * data[i-j-1]
	add	ebp, eax			; sum += qlp_coeff[j] * data[i-j-1]
	inc	ecx
	jnz	short .x87_32more_loop_j

	mov	ecx, [esp + 36]			; cl = lp_quantization
	sar	ebp, cl				; ebp = sum >> lp_quantization
	add	ebp, [esi]			; ebp += residual[i]
	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_q)
	add	edi, byte 4
	add	esi, byte 4

	dec	ebx
	jnz	.x87_32more_loop_i

	jmp	.end

.mov_eip_to_eax:
	; Helper: returns its own return address in eax, used below to turn a
	; label-relative offset into an absolute jump target (no direct way to
	; read eip on ia32).
	mov	eax, [esp]
	ret

.x87_32:
	; ---- 2 <= order <= 32: fully unrolled sum.  We jump INTO the middle of the
	; unrolled mov/imul/add run so that exactly `order` terms execute.  Each term
	; is 9 bytes of code (hence eax + eax*8 below); do not change the instruction
	; encodings of the unrolled terms.
	sub	esi, edi			; esi = residual[] - data[], so [esi+edi] = residual[i]
	neg	eax				; eax = -order
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]	; edx = -(9*order) + (.jumper_0 - .get_eip0)
	call	.mov_eip_to_eax
.get_eip0:
	add	edx, eax			; edx = absolute address of entry point in unrolled run
	inc	edx				; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]			; eax = qlp_coeff[]
	xor	ebp, ebp			; ebp = sum = 0
	jmp	edx

	mov	ecx, [eax + 124]		; ecx = qlp_coeff[31]
	imul	ecx, [edi - 128]		; ecx = qlp_coeff[31] * data[i-32]
	add	ebp, ecx			; sum += qlp_coeff[31] * data[i-32]
	mov	ecx, [eax + 120]		; ecx = qlp_coeff[30]
	imul	ecx, [edi - 124]		; ecx = qlp_coeff[30] * data[i-31]
	add	ebp, ecx			; sum += qlp_coeff[30] * data[i-31]
	mov	ecx, [eax + 116]		; ecx = qlp_coeff[29]
	imul	ecx, [edi - 120]		; ecx = qlp_coeff[29] * data[i-30]
	add	ebp, ecx			; sum += qlp_coeff[29] * data[i-30]
	mov	ecx, [eax + 112]		; ecx = qlp_coeff[28]
	imul	ecx, [edi - 116]		; ecx = qlp_coeff[28] * data[i-29]
	add	ebp, ecx			; sum += qlp_coeff[28] * data[i-29]
	mov	ecx, [eax + 108]		; ecx = qlp_coeff[27]
	imul	ecx, [edi - 112]		; ecx = qlp_coeff[27] * data[i-28]
	add	ebp, ecx			; sum += qlp_coeff[27] * data[i-28]
	mov	ecx, [eax + 104]		; ecx = qlp_coeff[26]
	imul	ecx, [edi - 108]		; ecx = qlp_coeff[26] * data[i-27]
	add	ebp, ecx			; sum += qlp_coeff[26] * data[i-27]
	mov	ecx, [eax + 100]		; ecx = qlp_coeff[25]
	imul	ecx, [edi - 104]		; ecx = qlp_coeff[25] * data[i-26]
	add	ebp, ecx			; sum += qlp_coeff[25] * data[i-26]
	mov	ecx, [eax + 96]			; ecx = qlp_coeff[24]
	imul	ecx, [edi - 100]		; ecx = qlp_coeff[24] * data[i-25]
	add	ebp, ecx			; sum += qlp_coeff[24] * data[i-25]
	mov	ecx, [eax + 92]			; ecx = qlp_coeff[23]
	imul	ecx, [edi - 96]			; ecx = qlp_coeff[23] * data[i-24]
	add	ebp, ecx			; sum += qlp_coeff[23] * data[i-24]
	mov	ecx, [eax + 88]			; ecx = qlp_coeff[22]
	imul	ecx, [edi - 92]			; ecx = qlp_coeff[22] * data[i-23]
	add	ebp, ecx			; sum += qlp_coeff[22] * data[i-23]
	mov	ecx, [eax + 84]			; ecx = qlp_coeff[21]
	imul	ecx, [edi - 88]			; ecx = qlp_coeff[21] * data[i-22]
	add	ebp, ecx			; sum += qlp_coeff[21] * data[i-22]
	mov	ecx, [eax + 80]			; ecx = qlp_coeff[20]
	imul	ecx, [edi - 84]			; ecx = qlp_coeff[20] * data[i-21]
	add	ebp, ecx			; sum += qlp_coeff[20] * data[i-21]
	mov	ecx, [eax + 76]			; ecx = qlp_coeff[19]
	imul	ecx, [edi - 80]			; ecx = qlp_coeff[19] * data[i-20]
	add	ebp, ecx			; sum += qlp_coeff[19] * data[i-20]
	mov	ecx, [eax + 72]			; ecx = qlp_coeff[18]
	imul	ecx, [edi - 76]			; ecx = qlp_coeff[18] * data[i-19]
	add	ebp, ecx			; sum += qlp_coeff[18] * data[i-19]
	mov	ecx, [eax + 68]			; ecx = qlp_coeff[17]
	imul	ecx, [edi - 72]			; ecx = qlp_coeff[17] * data[i-18]
	add	ebp, ecx			; sum += qlp_coeff[17] * data[i-18]
	mov	ecx, [eax + 64]			; ecx = qlp_coeff[16]
	imul	ecx, [edi - 68]			; ecx = qlp_coeff[16] * data[i-17]
	add	ebp, ecx			; sum += qlp_coeff[16] * data[i-17]
	mov	ecx, [eax + 60]			; ecx = qlp_coeff[15]
	imul	ecx, [edi - 64]			; ecx = qlp_coeff[15] * data[i-16]
	add	ebp, ecx			; sum += qlp_coeff[15] * data[i-16]
	mov	ecx, [eax + 56]			; ecx = qlp_coeff[14]
	imul	ecx, [edi - 60]			; ecx = qlp_coeff[14] * data[i-15]
	add	ebp, ecx			; sum += qlp_coeff[14] * data[i-15]
	mov	ecx, [eax + 52]			; ecx = qlp_coeff[13]
	imul	ecx, [edi - 56]			; ecx = qlp_coeff[13] * data[i-14]
	add	ebp, ecx			; sum += qlp_coeff[13] * data[i-14]
	mov	ecx, [eax + 48]			; ecx = qlp_coeff[12]
	imul	ecx, [edi - 52]			; ecx = qlp_coeff[12] * data[i-13]
	add	ebp, ecx			; sum += qlp_coeff[12] * data[i-13]
	mov	ecx, [eax + 44]			; ecx = qlp_coeff[11]
	imul	ecx, [edi - 48]			; ecx = qlp_coeff[11] * data[i-12]
	add	ebp, ecx			; sum += qlp_coeff[11] * data[i-12]
	mov	ecx, [eax + 40]			; ecx = qlp_coeff[10]
	imul	ecx, [edi - 44]			; ecx = qlp_coeff[10] * data[i-11]
	add	ebp, ecx			; sum += qlp_coeff[10] * data[i-11]
	mov	ecx, [eax + 36]			; ecx = qlp_coeff[ 9]
	imul	ecx, [edi - 40]			; ecx = qlp_coeff[ 9] * data[i-10]
	add	ebp, ecx			; sum += qlp_coeff[ 9] * data[i-10]
	mov	ecx, [eax + 32]			; ecx = qlp_coeff[ 8]
	imul	ecx, [edi - 36]			; ecx = qlp_coeff[ 8] * data[i- 9]
	add	ebp, ecx			; sum += qlp_coeff[ 8] * data[i- 9]
	mov	ecx, [eax + 28]			; ecx = qlp_coeff[ 7]
	imul	ecx, [edi - 32]			; ecx = qlp_coeff[ 7] * data[i- 8]
	add	ebp, ecx			; sum += qlp_coeff[ 7] * data[i- 8]
	mov	ecx, [eax + 24]			; ecx = qlp_coeff[ 6]
	imul	ecx, [edi - 28]			; ecx = qlp_coeff[ 6] * data[i- 7]
	add	ebp, ecx			; sum += qlp_coeff[ 6] * data[i- 7]
	mov	ecx, [eax + 20]			; ecx = qlp_coeff[ 5]
	imul	ecx, [edi - 24]			; ecx = qlp_coeff[ 5] * data[i- 6]
	add	ebp, ecx			; sum += qlp_coeff[ 5] * data[i- 6]
	mov	ecx, [eax + 16]			; ecx = qlp_coeff[ 4]
	imul	ecx, [edi - 20]			; ecx = qlp_coeff[ 4] * data[i- 5]
	add	ebp, ecx			; sum += qlp_coeff[ 4] * data[i- 5]
	mov	ecx, [eax + 12]			; ecx = qlp_coeff[ 3]
	imul	ecx, [edi - 16]			; ecx = qlp_coeff[ 3] * data[i- 4]
	add	ebp, ecx			; sum += qlp_coeff[ 3] * data[i- 4]
	mov	ecx, [eax + 8]			; ecx = qlp_coeff[ 2]
	imul	ecx, [edi - 12]			; ecx = qlp_coeff[ 2] * data[i- 3]
	add	ebp, ecx			; sum += qlp_coeff[ 2] * data[i- 3]
	mov	ecx, [eax + 4]			; ecx = qlp_coeff[ 1]
	imul	ecx, [edi - 8]			; ecx = qlp_coeff[ 1] * data[i- 2]
	add	ebp, ecx			; sum += qlp_coeff[ 1] * data[i- 2]
	mov	ecx, [eax]			; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	ecx, [edi - 4]			; ecx = qlp_coeff[ 0] * data[i- 1]
	add	ebp, ecx			; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:

	mov	ecx, [esp + 36]
	sar	ebp, cl				; ebp = (sum >> lp_quantization)
	add	ebp, [esi + edi]		; ebp = residual[i] + (sum >> lp_quantization)
	mov	[edi], ebp			; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp			; sum = 0 for the next sample
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
||||
|
||||
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16.  Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
;
; Same contract as FLAC__lpc_restore_signal_asm_ia32; processes 4 taps at a
; time with pmaddwd on 16-bit-packed coefficients and samples.
	ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
	cmp	eax, byte 4
	jb	near FLAC__lpc_restore_signal_asm_ia32.begin	; order < 4: use the plain x87 routine

	mov	edx, [esp + 28]			; edx = qlp_coeff[]
	movd	mm6, [esp + 36]			; mm6 = lp_quantization (shift count for psrad)
	mov	ebp, esp			; save esp; restored at .mmx_end

	and	esp, 0xfffffff8			; 8-align esp for the qword coefficient reads below

	; Repack the 32-bit coefficients as 16-bit words on the stack by pushing
	; them in order; pushes grow downward, so the words end up reversed in
	; memory relative to the array, which is what pmaddwd needs to pair
	; qlp_coeff[j] with data[i-j-1].
	xor	ecx, ecx
.copy_qlp_loop:
	push	word [edx + 4 * ecx]		; low 16 bits of qlp_coeff[ecx]
	inc	ecx
	cmp	ecx, eax
	jnz	short .copy_qlp_loop

	; Zero-pad the coefficient count up to a multiple of 4, bumping eax
	; (= padded order) accordingly; the zero taps pair with the zeroed
	; samples before the data window (see WATCHOUT above).
	and	ecx, 0x3
	test	ecx, ecx
	je	short .za_end
	sub	ecx, byte 4
.za_loop:
	push	word 0
	inc	eax
	inc	ecx
	jnz	short .za_loop
.za_end:

	movq	mm5, [esp + 2 * eax - 8]	; mm5 = first 4 packed coefficients
	movd	mm4, [edi - 16]			; pack data[i-4..i-1] into mm4 as 4 words
	punpckldq	mm4, [edi - 12]
	movd	mm0, [edi - 8]
	punpckldq	mm0, [edi - 4]
	packssdw	mm4, mm0

	cmp	eax, byte 4			; (padded) order == 4?
	jnbe	short .mmx_4more

	; ---- order == 4: single pmaddwd per sample
	ALIGN 16
.mmx_4_loop_i:
	movq	mm7, mm4
	pmaddwd	mm7, mm5			; two partial dword sums of coeff*sample pairs
	movq	mm0, mm7
	punpckhdq	mm7, mm7
	paddd	mm7, mm0			; fold the two partial sums: low dword = sum
	psrad	mm7, mm6			; sum >>= lp_quantization
	movd	mm1, [esi]
	paddd	mm7, mm1			; += residual[i]
	movd	[edi], mm7			; data[i] = result
	psllq	mm7, 48				; shift the new sample into the history window:
	psrlq	mm4, 16				; drop oldest word, insert new data[i] as newest
	por	mm4, mm7

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.mmx_4_loop_i
	jmp	.mmx_end

.mmx_4more:
	; ---- order > 4: mm4/mm5 hold the newest 4 taps; the remaining taps are
	; accumulated by an inner loop over the stack copy of the coefficients.
	shl	eax, 2
	neg	eax
	add	eax, byte 16			; eax = 16 - 4*order = offset from data[i] to the
						; start of the older part of the window
	ALIGN 16
.mmx_4more_loop_i:
	mov	ecx, edi
	add	ecx, eax			; ecx walks the older samples up toward data[i]
	mov	edx, esp			; edx walks the packed coefficients

	movq	mm7, mm4
	pmaddwd	mm7, mm5			; start with the newest 4 taps

	ALIGN 16
.mmx_4more_loop_j:
	movd	mm0, [ecx - 16]			; pack the next 4 older samples into words
	punpckldq	mm0, [ecx - 12]
	movd	mm1, [ecx - 8]
	punpckldq	mm1, [ecx - 4]
	packssdw	mm0, mm1
	pmaddwd	mm0, [edx]			; multiply-accumulate with 4 packed coefficients
	paddd	mm7, mm0

	add	edx, byte 8
	add	ecx, byte 16
	cmp	ecx, edi
	jnz	.mmx_4more_loop_j

	movq	mm0, mm7			; fold partial sums, shift, add residual (as above)
	punpckhdq	mm7, mm7
	paddd	mm7, mm0
	psrad	mm7, mm6
	movd	mm1, [esi]
	paddd	mm7, mm1
	movd	[edi], mm7
	psllq	mm7, 48
	psrlq	mm4, 16
	por	mm4, mm7			; rotate new data[i] into the 4-newest window

	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	short .mmx_4more_loop_i
.mmx_end:
	emms					; leave MMX state; x87 usable again
	mov	esp, ebp			; pop the coefficient scratch area

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
||||
|
||||
|
||||
; **********************************************************************
|
||||
;
|
||||
;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
|
||||
@ -1098,285 +724,4 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
; **********************************************************************
;
; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
;	unsigned i, j;
;	FLAC__int64 sum;
;
;	FLAC__ASSERT(order > 0);
;
;	for(i = 0; i < data_len; i++) {
;		sum = 0;
;		for(j = 0; j < order; j++)
;			sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
;		data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
;	}
; }
;
; 64-bit-accumulator variant: each product is widened with one-operand imul
; (edx:eax) and summed into esi:ecx with add/adc.
	ALIGN 16
cident FLAC__lpc_restore_signal_wide_asm_ia32
	;[esp + 40]	data[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	residual[]

	;ASSERT(order > 0)
	;ASSERT(order <= 32)
	;ASSERT(lp_quantization <= 31)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	ebx, [esp + 24]			; ebx = data_len
	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0

.begin:
	mov	eax, [esp + 32]			; eax = order
	cmp	eax, 1
	jg	short .x87_32

	; ---- order == 1
	mov	esi, [esp + 20]			; esi = residual[]
	mov	edi, [esp + 40]			; edi = data[]
	mov	ecx, [esp + 28]			; ecx = qlp_coeff[]
	mov	ebp, [ecx]			; ebp = qlp_coeff[0]
	mov	eax, [edi - 4]			; eax = data[-1]
	mov	ecx, [esp + 36]			; cl = lp_quantization
	ALIGN 16
.x87_1_loop_i:
	imul	ebp				; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
	shrd	eax, edx, cl			; 0 <= lp_quantization <= 15
	;
	add	eax, [esi]			; eax += residual[i]
	mov	[edi], eax			; data[i] = eax (next iteration's data[i-1])
	;
	add	esi, 4
	add	edi, 4
	dec	ebx
	jnz	.x87_1_loop_i
	jmp	.end

.mov_eip_to_eax:
	; Helper: returns its own return address in eax (ia32 has no direct way to
	; read eip); used below to compute an absolute jump target.
	mov	eax, [esp]
	ret

.x87_32:	; eax = order
	; ---- 2 <= order <= 32: jump into the middle of the unrolled run below so
	; exactly `order` terms execute.  Each unrolled term is 10 bytes of code
	; (hence eax + eax*4 with eax = -2*order); do not change their encodings.
	neg	eax
	add	eax, eax			; eax = -2*order
	lea	ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]	; ebp = -(10*order) + (.jumper_0 - .get_eip0)
	call	.mov_eip_to_eax
.get_eip0:
	add	ebp, eax			; ebp = absolute entry point in the unrolled run
	inc	ebp				; compensate for the shorter opcode on the last iteration

	mov	ebx, [esp + 28]			; ebx = qlp_coeff[]
	mov	edi, [esp + 40]			; edi = data[]
	sub	[esp + 20], edi			; residual[] -= data[] (so [edi+ofs] = residual[i])

	xor	ecx, ecx			; esi:ecx = sum = 0
	xor	esi, esi
	jmp	ebp

	;eax = --
	;edx = --
	;ecx = 0
	;esi = 0
	;
	;ebx = qlp_coeff[]
	;edi = data[]
	;ebp = @address

	mov	eax, [ebx + 124]		; eax = qlp_coeff[31]
	imul	dword [edi - 128]		; edx:eax = qlp_coeff[31] * data[i-32]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[31] * data[i-32]

	mov	eax, [ebx + 120]		; eax = qlp_coeff[30]
	imul	dword [edi - 124]		; edx:eax = qlp_coeff[30] * data[i-31]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[30] * data[i-31]

	mov	eax, [ebx + 116]		; (same pattern for each remaining coefficient)
	imul	dword [edi - 120]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 112]
	imul	dword [edi - 116]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 108]
	imul	dword [edi - 112]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 104]
	imul	dword [edi - 108]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 100]
	imul	dword [edi - 104]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 96]
	imul	dword [edi - 100]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 92]
	imul	dword [edi - 96]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 88]
	imul	dword [edi - 92]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 84]
	imul	dword [edi - 88]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 80]
	imul	dword [edi - 84]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 76]
	imul	dword [edi - 80]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 72]
	imul	dword [edi - 76]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 68]
	imul	dword [edi - 72]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 64]
	imul	dword [edi - 68]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 60]
	imul	dword [edi - 64]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 56]
	imul	dword [edi - 60]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 52]
	imul	dword [edi - 56]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 48]
	imul	dword [edi - 52]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 44]
	imul	dword [edi - 48]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 40]
	imul	dword [edi - 44]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 36]
	imul	dword [edi - 40]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 32]
	imul	dword [edi - 36]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 28]
	imul	dword [edi - 32]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 24]
	imul	dword [edi - 28]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 20]
	imul	dword [edi - 24]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 16]
	imul	dword [edi - 20]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 12]
	imul	dword [edi - 16]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 8]
	imul	dword [edi - 12]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx + 4]
	imul	dword [edi - 8]
	add	ecx, eax
	adc	esi, edx

	mov	eax, [ebx]			; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
	imul	dword [edi - 4]			; edx:eax = qlp_coeff[ 0] * data[i- 1]
	add	ecx, eax
	adc	esi, edx			; sum += qlp_coeff[ 0] * data[i- 1]

.jumper_0:
	mov	edx, ecx
	;esi:edx = sum
	mov	ecx, [esp + 36]			; cl = lp_quantization
	shrd	edx, esi, cl			; edx = (sum >> lp_quantization)
	;eax = --
	;ecx = --
	;edx = sum >> lp_q
	;esi = --
	;
	mov	eax, [esp + 20]			; residual[] - data[]
	add	edx, [edi + eax]		; edx = residual[i] + (sum >> lp_quantization)
	mov	[edi], edx			; data[i] = residual[i] + (sum >> lp_quantization)
	add	edi, 4

	dec	dword [esp + 24]		; --data_len
	jz	short .end
	xor	ecx, ecx			; sum = 0 for the next sample
	xor	esi, esi
	jmp	ebp

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
|
||||
|
||||
; end
|
||||
|
@ -206,22 +206,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
*/
|
||||
void FLAC__lpc_restore_signal(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
#ifndef FLAC__NO_ASM
|
||||
# ifdef FLAC__CPU_IA32
|
||||
# ifdef FLAC__HAS_NASM
|
||||
void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
# endif /* FLAC__HAS_NASM */
|
||||
# endif /* FLAC__CPU_IA32 */
|
||||
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
|
||||
# ifdef FLAC__SSE4_1_SUPPORTED
|
||||
void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
# endif
|
||||
# endif
|
||||
#endif /* FLAC__NO_ASM */
|
||||
|
||||
#ifndef FLAC__INTEGER_ONLY_LIBRARY
|
||||
|
||||
|
@ -588,550 +588,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
}
|
||||
}
|
||||
|
||||
FLAC__SSE_TARGET("sse4.1")
|
||||
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
|
||||
{
|
||||
int i;
|
||||
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
|
||||
|
||||
if (!data_len)
|
||||
return;
|
||||
|
||||
FLAC__ASSERT(order > 0);
|
||||
FLAC__ASSERT(order <= 32);
|
||||
FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
|
||||
|
||||
if(order <= 12) {
|
||||
if(order > 8) { /* order == 9, 10, 11, 12 */
|
||||
if(order > 10) { /* order == 11, 12 */
|
||||
__m128i qlp[6], dat[6];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); // 0 q[1] 0 q[0]
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); // 0 q[3] 0 q[2]
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); // 0 q[5] 0 q[4]
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); // 0 q[7] 0 q[6]
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8))); // 0 q[9] 0 q[8]
|
||||
if (order == 12)
|
||||
qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10))); // 0 q[11] 0 q[10]
|
||||
else
|
||||
qlp[5] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[10])); // 0 0 0 q[10]
|
||||
|
||||
dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
|
||||
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9]
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7]
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5]
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3]
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1]
|
||||
|
||||
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
|
||||
summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); // ? ? ? d[i]
|
||||
data[0] = _mm_cvtsi128_si32(temp);
|
||||
|
||||
for(i = 1; i < (int)data_len; i++) {
|
||||
temp = _mm_slli_si128(temp, 8);
|
||||
dat[5] = _mm_alignr_epi8(dat[5], dat[4], 8); // ? d[i-11] ? d[i-10]
|
||||
dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8); // ? d[i-9] ? d[i-8]
|
||||
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8); // ? d[i-7] ? d[i-6]
|
||||
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8); // ? d[i-5] ? d[i-4]
|
||||
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); // ? d[i-3] ? d[i-2]
|
||||
dat[0] = _mm_alignr_epi8(dat[0], temp, 8); // ? d[i-1] ? d[i ]
|
||||
|
||||
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
|
||||
summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); // ? ? ? d[i]
|
||||
data[i] = _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
}
|
||||
else { /* order == 9, 10 */
|
||||
__m128i qlp[5], dat[5];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
|
||||
if (order == 10)
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)));
|
||||
else
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[8]));
|
||||
|
||||
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
|
||||
data[0] = _mm_cvtsi128_si32(temp);
|
||||
|
||||
for(i = 1; i < (int)data_len; i++) {
|
||||
temp = _mm_slli_si128(temp, 8);
|
||||
dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8);
|
||||
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
|
||||
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
|
||||
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
|
||||
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
|
||||
|
||||
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
|
||||
data[i] = _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(order > 4) { /* order == 5, 6, 7, 8 */
|
||||
if(order > 6) { /* order == 7, 8 */
|
||||
__m128i qlp[4], dat[4];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
|
||||
if (order == 8)
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
|
||||
else
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[6]));
|
||||
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
|
||||
data[0] = _mm_cvtsi128_si32(temp);
|
||||
|
||||
for(i = 1; i < (int)data_len; i++) {
|
||||
temp = _mm_slli_si128(temp, 8);
|
||||
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
|
||||
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
|
||||
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
|
||||
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
|
||||
|
||||
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
|
||||
data[i] = _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
}
|
||||
else { /* order == 5, 6 */
|
||||
__m128i qlp[3], dat[3];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
if (order == 6)
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
|
||||
else
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[4]));
|
||||
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
|
||||
data[0] = _mm_cvtsi128_si32(temp);
|
||||
|
||||
for(i = 1; i < (int)data_len; i++) {
|
||||
temp = _mm_slli_si128(temp, 8);
|
||||
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
|
||||
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
|
||||
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
|
||||
|
||||
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
|
||||
data[i] = _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
else { /* order == 1, 2, 3, 4 */
|
||||
if(order > 2) { /* order == 3, 4 */
|
||||
__m128i qlp[2], dat[2];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
if (order == 4)
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
else
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[2]));
|
||||
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
|
||||
data[0] = _mm_cvtsi128_si32(temp);
|
||||
|
||||
for(i = 1; i < (int)data_len; i++) {
|
||||
temp = _mm_slli_si128(temp, 8);
|
||||
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
|
||||
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
|
||||
|
||||
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
|
||||
data[i] = _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
}
|
||||
else { /* order == 1, 2 */
|
||||
if(order == 2) {
|
||||
__m128i qlp0, dat0;
|
||||
__m128i summ, temp;
|
||||
qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff)));
|
||||
|
||||
dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat0, qlp0);
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
|
||||
data[0] = _mm_cvtsi128_si32(temp);
|
||||
|
||||
for(i = 1; i < (int)data_len; i++) {
|
||||
dat0 = _mm_alignr_epi8(dat0, _mm_slli_si128(temp, 8), 8);
|
||||
|
||||
summ = _mm_mul_epi32(dat0, qlp0);
|
||||
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
|
||||
data[i] = _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
}
|
||||
else { /* order == 1 */
|
||||
__m128i qlp0;
|
||||
__m128i summ, temp;
|
||||
qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
|
||||
temp = _mm_cvtsi32_si128(data[-1]);
|
||||
|
||||
summ = _mm_mul_epi32(temp, qlp0);
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
|
||||
data[0] = _mm_cvtsi128_si32(temp);
|
||||
|
||||
for(i = 1; i < (int)data_len; i++) {
|
||||
summ = _mm_mul_epi32(temp, qlp0);
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
|
||||
data[i] = _mm_cvtsi128_si32(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else { /* order > 12 */
|
||||
__m128i qlp[16];
|
||||
|
||||
for(i = 0; i < (int)order/2; i++)
|
||||
qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1]
|
||||
if(order & 1)
|
||||
qlp[i] = _mm_shuffle_epi32(_mm_cvtsi32_si128(qlp_coeff[i*2]), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
for(i = 0; i < (int)data_len; i++) {
|
||||
__m128i summ = _mm_setzero_si128(), dat;
|
||||
FLAC__int32 * const datai = &data[i];
|
||||
|
||||
switch((order+1) / 2) {
|
||||
case 16: /* order == 31, 32 */
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-32)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[15])); /* Falls through. */
|
||||
case 15:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-30)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[14])); /* Falls through. */
|
||||
case 14:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-28)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[13])); /* Falls through. */
|
||||
case 13:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-26)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[12])); /* Falls through. */
|
||||
case 12:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-24)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[11])); /* Falls through. */
|
||||
case 11:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-22)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[10])); /* Falls through. */
|
||||
case 10:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-20)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[9])); /* Falls through. */
|
||||
case 9:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-18)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[8])); /* Falls through. */
|
||||
case 8:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-16)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[7])); /* Falls through. */
|
||||
case 7: /* order == 13, 14 */
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-14)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[6]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-12)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[5]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-10)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[4]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-8)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[3]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-6)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[2]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-4)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[1]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-2)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[0]));
|
||||
}
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
summ = _mm_srl_epi64(summ, cnt);
|
||||
summ = _mm_add_epi32(summ, _mm_cvtsi32_si128(residual[i]));
|
||||
data[i] = _mm_cvtsi128_si32(summ);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
	/* Reconstructs the signal from the LPC residual using 32-bit SIMD math:
	 *   data[i] = residual[i] + ((sum_j qlp_coeff[j] * data[i-j-1]) >> lp_quantization)
	 * SSE4.1 path (needs _mm_mullo_epi32). Only orders 8..12 are vectorized here;
	 * smaller orders fall back to the scalar routine and larger orders to the
	 * asm/scalar routine below.
	 * NOTE(review): the SIMD paths read up to 12 samples of history (data[-12..-1])
	 * even for order 9..11 — the excess lanes are multiplied by zeroed coefficients,
	 * but the memory must still be readable; presumably guaranteed by the warm-up
	 * samples the decoder copies in before calling — TODO confirm at call site. */
	if(order < 8) {
		/* scalar fallback for short predictors */
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		int i;
		const __m128i cnt = _mm_cvtsi32_si128(lp_quantization); /* shift count for >> lp_quantization */

		if(order > 8) /* order == 9, 10, 11, 12 */
		{
			__m128i qlp[3], dat[3];
			__m128i summ, temp;

			qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0]
			qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4]
			qlp[2] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8]
			/* zero the coefficient lanes beyond `order` so the fixed 12-tap loop
			 * below computes the correct shorter sum */
			switch (order)
			{
			case 9:
				qlp[2] = _mm_slli_si128(qlp[2], 12); qlp[2] = _mm_srli_si128(qlp[2], 12); break; // 0 0 0 q[8]
			case 10:
				qlp[2] = _mm_slli_si128(qlp[2], 8); qlp[2] = _mm_srli_si128(qlp[2], 8); break; // 0 0 q[9] q[8]
			case 11:
				qlp[2] = _mm_slli_si128(qlp[2], 4); qlp[2] = _mm_srli_si128(qlp[2], 4); break; // 0 q[10] q[9] q[8]
			}

			/* load 12 history samples, byte-reversed per lane so that lane k of
			 * dat[n] pairs with coefficient lane k of qlp[n] */
			dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9]
			dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5]
			dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1]

			for (i = 0;;) {
				/* four partial products per register; 32x32->low-32 multiply
				 * (caller guarantees the sum fits 32 bits on this datapath) */
				summ = _mm_mullo_epi32(dat[2], qlp[2]);
				summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1]));
				summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0]));

				/* horizontal add: fold high 64 bits into low, then lane 1 into lane 0 */
				summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
				summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));

				summ = _mm_sra_epi32(summ, cnt);                                   /* sum >> lp_quantization (arithmetic) */
				temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);        /* + residual[i] */
				data[i] = _mm_cvtsi128_si32(temp);

				if(++i >= (int)data_len) break;

				/* rotate the just-decoded sample into the history window:
				 * shift each dat[] up one 32-bit lane, feeding data[i-1] in at the bottom */
				temp = _mm_slli_si128(temp, 12);
				dat[2] = _mm_alignr_epi8(dat[2], dat[1], 12);
				dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12);
				dat[0] = _mm_alignr_epi8(dat[0], temp, 12);
			}
		}
		else /* order == 8 */
		{
			__m128i qlp[2], dat[2];
			__m128i summ, temp;

			qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0));
			qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4));

			/* 8 history samples, byte-reversed to match coefficient order (see above) */
			dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3));
			dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3));

			for (i = 0;;) {
				summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));

				/* horizontal add across the four 32-bit lanes */
				summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
				summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));

				summ = _mm_sra_epi32(summ, cnt);
				temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
				data[i] = _mm_cvtsi128_si32(temp);

				if(++i >= (int)data_len) break;

				/* rotate new sample into the 8-deep history window */
				temp = _mm_slli_si128(temp, 12);
				dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12);
				dat[0] = _mm_alignr_epi8(dat[0], temp, 12);
			}
		}
	}
	else { /* order > 12 */
#ifdef FLAC__HAS_NASM
		FLAC__lpc_restore_signal_asm_ia32(residual, data_len, qlp_coeff, order, lp_quantization, data);
#else
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
#endif
	}
}
|
||||
|
||||
FLAC__SSE_TARGET("ssse3")
void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
	/* 16-bit-packed variant of FLAC__lpc_restore_signal_intrin_sse41: samples and
	 * coefficients are saturating-packed to 16 bits and multiplied pairwise with
	 * _mm_madd_epi16 (two 16x16 products summed per 32-bit lane), halving the
	 * register count versus the 32-bit path. Per the dispatch comment in the
	 * decoder, this is selected when the signal is <= 16 bits-per-sample and the
	 * coefficient precision is <= 16 bits, so the packs do not saturate.
	 * NOTE(review): tagged FLAC__SSE_TARGET("ssse3") although named *_sse41 —
	 * the intrinsics used here (_mm_madd_epi16, _mm_alignr_epi8) are SSSE3;
	 * presumably the name matches the sse41 dispatch slot — confirm before renaming.
	 * Like the 32-bit path, up to 12 history samples before data[0] are read. */
	if(order < 8) {
		/* scalar fallback for short predictors */
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		int i;
		const __m128i cnt = _mm_cvtsi32_si128(lp_quantization); /* shift count for >> lp_quantization */

		if(order > 8) /* order == 9, 10, 11, 12 */
		{
			__m128i qlp[2], dat[2];
			__m128i summ, temp;

			qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
			temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
			qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8]
			/* zero coefficient lanes beyond `order` before packing to 16 bits */
			switch(order)
			{
			case 9:
				qlp[1] = _mm_slli_si128(qlp[1], 12); qlp[1] = _mm_srli_si128(qlp[1], 12); break; // 0 0 0 q[8]
			case 10:
				qlp[1] = _mm_slli_si128(qlp[1], 8); qlp[1] = _mm_srli_si128(qlp[1], 8); break; // 0 0 q[9] q[8]
			case 11:
				qlp[1] = _mm_slli_si128(qlp[1], 4); qlp[1] = _mm_srli_si128(qlp[1], 4); break; // 0 q[10] q[9] q[8]
			}
			/* saturating pack 32->16: eight coefficients per register */
			qlp[0] = _mm_packs_epi32(qlp[0], temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
			qlp[1] = _mm_packs_epi32(qlp[1], _mm_setzero_si128()); // 0 0 0 0 q[11] q[10] q[9] q[8]

			/* load 12 history samples, byte-reversed per lane to pair with coefficients */
			dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9]
			temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5]
			dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1]

			/* pack history to 16 bits: eight samples per register */
			dat[1] = _mm_packs_epi32(dat[1], _mm_setzero_si128()); // 0 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9]
			dat[0] = _mm_packs_epi32(dat[0], temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]

			for(i = 0;;) {
				/* madd: each 32-bit lane accumulates two 16x16 products */
				summ = _mm_madd_epi16(dat[1], qlp[1]);
				summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0]));

				/* horizontal add: fold high 64 bits into low, then lane 1 into lane 0 */
				summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
				summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));

				summ = _mm_sra_epi32(summ, cnt);                             /* sum >> lp_quantization */
				temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);  /* + residual[i] */
				data[i] = _mm_cvtsi128_si32(temp);

				if(++i >= (int)data_len) break;

				/* rotate the just-decoded sample into the packed 16-bit history:
				 * shift up one 16-bit lane, feeding the low 16 bits of data[i-1] */
				temp = _mm_slli_si128(temp, 14);
				dat[1] = _mm_alignr_epi8(dat[1], dat[0], 14); // 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9] d[i-8]
				dat[0] = _mm_alignr_epi8(dat[0], temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i]
			}
		}
		else /* order == 8 */
		{
			__m128i qlp0, dat0;
			__m128i summ, temp;

			qlp0 = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
			temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
			qlp0 = _mm_packs_epi32(qlp0, temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]

			/* 8 history samples, byte-reversed then packed to 16 bits */
			temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3));
			dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3));
			dat0 = _mm_packs_epi32(dat0, temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]

			for(i = 0;;) {
				/* all eight taps in one madd */
				summ = _mm_madd_epi16(dat0, qlp0);

				/* horizontal add across the four 32-bit partial sums */
				summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
				summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));

				summ = _mm_sra_epi32(summ, cnt);
				temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
				data[i] = _mm_cvtsi128_si32(temp);

				if(++i >= (int)data_len) break;

				/* rotate new sample into the 8-deep packed history */
				temp = _mm_slli_si128(temp, 14);
				dat0 = _mm_alignr_epi8(dat0, temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i]
			}
		}
	}
	else { /* order > 12 */
#ifdef FLAC__HAS_NASM
		FLAC__lpc_restore_signal_asm_ia32_mmx(residual, data_len, qlp_coeff, order, lp_quantization, data);
#else
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
#endif
	}
}
|
||||
|
||||
#endif /* defined FLAC__CPU_IA32 */
|
||||
|
||||
FLAC__SSE_TARGET("sse4.1")
|
||||
|
@ -45,7 +45,6 @@
|
||||
#include "protected/stream_decoder.h"
|
||||
#include "private/bitreader.h"
|
||||
#include "private/bitmath.h"
|
||||
#include "private/cpu.h"
|
||||
#include "private/crc.h"
|
||||
#include "private/fixed.h"
|
||||
#include "private/format.h"
|
||||
@ -129,12 +128,6 @@ typedef struct FLAC__StreamDecoderPrivate {
|
||||
FLAC__StreamDecoderWriteCallback write_callback;
|
||||
FLAC__StreamDecoderMetadataCallback metadata_callback;
|
||||
FLAC__StreamDecoderErrorCallback error_callback;
|
||||
/* generic 32-bit datapath: */
|
||||
void (*local_lpc_restore_signal)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
/* generic 64-bit datapath: */
|
||||
void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */
|
||||
void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
|
||||
void *client_data;
|
||||
FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */
|
||||
FLAC__BitReader *input;
|
||||
@ -152,7 +145,6 @@ typedef struct FLAC__StreamDecoderPrivate {
|
||||
size_t metadata_filter_ids_count, metadata_filter_ids_capacity; /* units for both are IDs, not bytes */
|
||||
FLAC__Frame frame;
|
||||
FLAC__bool cached; /* true if there is a byte in lookahead */
|
||||
FLAC__CPUInfo cpuinfo;
|
||||
FLAC__byte header_warmup[2]; /* contains the sync code and reserved bits */
|
||||
FLAC__byte lookahead; /* temp storage when we need to look ahead one byte in the stream */
|
||||
/* unaligned (original) pointers to allocated data */
|
||||
@ -373,48 +365,6 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
|
||||
return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* get the CPU info and set the function pointers
|
||||
*/
|
||||
FLAC__cpu_info(&decoder->private_->cpuinfo);
|
||||
/* first default to the non-asm routines */
|
||||
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
|
||||
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
|
||||
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
|
||||
/* now override with asm where appropriate */
|
||||
#ifndef FLAC__NO_ASM
|
||||
if(decoder->private_->cpuinfo.use_asm) {
|
||||
#ifdef FLAC__CPU_IA32
|
||||
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
|
||||
#ifdef FLAC__HAS_NASM
|
||||
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32; /* OPT_IA32: was really necessary for GCC < 4.9 */
|
||||
if (decoder->private_->cpuinfo.x86.mmx) {
|
||||
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
|
||||
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
|
||||
}
|
||||
else {
|
||||
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
|
||||
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32;
|
||||
}
|
||||
#endif
|
||||
#if FLAC__HAS_X86INTRIN && ! defined FLAC__INTEGER_ONLY_LIBRARY
|
||||
# if defined FLAC__SSE4_1_SUPPORTED
|
||||
if (decoder->private_->cpuinfo.x86.sse41) {
|
||||
# if !defined FLAC__HAS_NASM /* these are not undoubtedly faster than their MMX ASM counterparts */
|
||||
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_intrin_sse41;
|
||||
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16_intrin_sse41;
|
||||
# endif
|
||||
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
|
||||
}
|
||||
# endif
|
||||
#endif
|
||||
#elif defined FLAC__CPU_X86_64
|
||||
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
|
||||
/* No useful SSE optimizations yet */
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
/* from here on, errors are fatal */
|
||||
|
||||
if(!FLAC__bitreader_init(decoder->private_->input, read_callback_, decoder)) {
|
||||
@ -2848,12 +2798,9 @@ FLAC__bool read_subframe_lpc_(FLAC__StreamDecoder *decoder, uint32_t channel, ui
|
||||
if(do_full_decode) {
|
||||
memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
|
||||
if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
|
||||
if(bps <= 16 && subframe->qlp_coeff_precision <= 16)
|
||||
decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
|
||||
else
|
||||
decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
|
||||
FLAC__lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
|
||||
else
|
||||
decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
|
||||
FLAC__lpc_restore_signal_wide(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
Loading…
Reference in New Issue
Block a user