Silence clang warnings on alignment of unaligned loads
Clang throws *a lot* of warnings about an alignment requirement increase where no alignment is required at all. This is a result of the way the intrinsics were implemented long ago. See https://stackoverflow.com/questions/71279668/why-does-clang-complain-about-alignment-on-sse-intrinsic-unaligned-loads for details. This commit silences these warnings by first casting the pointer to (const void*) or (void*) before casting it to (const __m128i*), (__m128i*), (const __m256i*) or (__m256i*). Compiling with and without this patch produces exactly the same binary with GCC 9.3 and clang 10.0.
This commit is contained in:
parent
179cdce1db
commit
be1df4085a
@ -77,20 +77,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
|
||||
mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-12)));
|
||||
mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 11 */
|
||||
@ -109,19 +109,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
|
||||
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11)));
|
||||
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -141,18 +141,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
|
||||
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10)));
|
||||
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 9 */
|
||||
@ -169,17 +169,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 )));
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 )));
|
||||
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -199,16 +199,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 )));
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 )));
|
||||
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 7 */
|
||||
@ -223,15 +223,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 )));
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 )));
|
||||
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -247,14 +247,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 )));
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 )));
|
||||
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 5 */
|
||||
@ -267,13 +267,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 )));
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 )));
|
||||
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -289,12 +289,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 )));
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 )));
|
||||
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 3 */
|
||||
@ -305,11 +305,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 )));
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 )));
|
||||
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -321,10 +321,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 )));
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 )));
|
||||
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 1 */
|
||||
@ -333,9 +333,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ;
|
||||
summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 )));
|
||||
summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 )));
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -432,20 +432,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
|
||||
mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-12)));
|
||||
mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 11 */
|
||||
@ -464,19 +464,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
|
||||
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11)));
|
||||
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -496,18 +496,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
|
||||
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10)));
|
||||
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 9 */
|
||||
@ -524,17 +524,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9)));
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9)));
|
||||
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -554,16 +554,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8)));
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8)));
|
||||
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 7 */
|
||||
@ -578,15 +578,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7)));
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7)));
|
||||
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -602,14 +602,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6)));
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6)));
|
||||
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 5 */
|
||||
@ -622,13 +622,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5)));
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5)));
|
||||
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -644,12 +644,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4)));
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4)));
|
||||
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 3 */
|
||||
@ -660,11 +660,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3)));
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3)));
|
||||
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -676,10 +676,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2)));
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2)));
|
||||
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 1 */
|
||||
@ -688,9 +688,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
|
||||
|
||||
for(i = 0; i < (int)data_len-7; i+=8) {
|
||||
__m256i summ;
|
||||
summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1)));
|
||||
summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1)));
|
||||
summ = _mm256_sra_epi32(summ, cnt);
|
||||
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
|
||||
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -765,7 +765,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
int i;
|
||||
FLAC__int64 sum;
|
||||
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
|
||||
const __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
|
||||
const __m256i pack = _mm256_loadu_si256((const __m256i *)(const void*)pack_arr);
|
||||
|
||||
FLAC__ASSERT(order > 0);
|
||||
FLAC__ASSERT(order <= 32);
|
||||
@ -791,20 +791,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
|
||||
mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-12))));
|
||||
mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
else { /* order == 11 */
|
||||
@ -823,19 +823,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
|
||||
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-11))));
|
||||
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -855,18 +855,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
|
||||
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-10))));
|
||||
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
else { /* order == 9 */
|
||||
@ -883,17 +883,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 ))));
|
||||
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -913,16 +913,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 ))));
|
||||
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
else { /* order == 7 */
|
||||
@ -937,15 +937,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 ))));
|
||||
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -961,14 +961,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 ))));
|
||||
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
else { /* order == 5 */
|
||||
@ -981,13 +981,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 ))));
|
||||
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1003,12 +1003,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 ))));
|
||||
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
else { /* order == 3 */
|
||||
@ -1019,11 +1019,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 ))));
|
||||
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1035,10 +1035,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ, mull;
|
||||
summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 ))));
|
||||
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
else { /* order == 1 */
|
||||
@ -1047,9 +1047,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m256i summ;
|
||||
summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
|
||||
summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 ))));
|
||||
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -80,20 +80,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
|
||||
mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
|
||||
mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 11 */
|
||||
@ -112,19 +112,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
|
||||
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
|
||||
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -144,18 +144,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
|
||||
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
|
||||
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 9 */
|
||||
@ -172,17 +172,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
|
||||
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -202,16 +202,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
|
||||
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 7 */
|
||||
@ -226,15 +226,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
|
||||
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -250,14 +250,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
|
||||
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 5 */
|
||||
@ -270,13 +270,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
|
||||
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -292,12 +292,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
|
||||
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 3 */
|
||||
@ -308,11 +308,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
|
||||
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -324,10 +324,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
|
||||
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 1 */
|
||||
@ -336,9 +336,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ;
|
||||
summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
|
||||
summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -417,12 +417,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
if(order > 10) { /* order == 11, 12 */
|
||||
if(order == 12) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0]
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2]
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4]
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6]
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8]
|
||||
xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)); // 0 0 q[1] q[0]
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)); // 0 0 q[3] q[2]
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)); // 0 0 q[5] q[4]
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)); // 0 0 q[7] q[6]
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)); // 0 0 q[9] q[8]
|
||||
xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0 0 q[11] q[10]
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
|
||||
@ -435,41 +435,41 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[11] * data[i-12];
|
||||
//sum += qlp_coeff[10] * data[i-11];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12)); // 0 0 d[i-11] d[i-12]
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
|
||||
xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
|
||||
|
||||
//sum += qlp_coeff[9] * data[i-10];
|
||||
//sum += qlp_coeff[8] * data[i-9];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm4);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[7] * data[i-8];
|
||||
//sum += qlp_coeff[6] * data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * data[i-6];
|
||||
//sum += qlp_coeff[4] * data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -480,11 +480,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
}
|
||||
else { /* order == 11 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
|
||||
xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -501,35 +501,35 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
|
||||
//sum += qlp_coeff[9] * data[i-10];
|
||||
//sum += qlp_coeff[8] * data[i-9];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm4);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[7] * data[i-8];
|
||||
//sum += qlp_coeff[6] * data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * data[i-6];
|
||||
//sum += qlp_coeff[4] * data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -542,11 +542,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
else { /* order == 9, 10 */
|
||||
if(order == 10) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -558,34 +558,34 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[9] * data[i-10];
|
||||
//sum += qlp_coeff[8] * data[i-9];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epu32(xmm7, xmm4);
|
||||
|
||||
//sum += qlp_coeff[7] * data[i-8];
|
||||
//sum += qlp_coeff[6] * data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * data[i-6];
|
||||
//sum += qlp_coeff[4] * data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -596,10 +596,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
}
|
||||
else { /* order == 9 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -615,28 +615,28 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
|
||||
//sum += qlp_coeff[7] * data[i-8];
|
||||
//sum += qlp_coeff[6] * data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * data[i-6];
|
||||
//sum += qlp_coeff[4] * data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -651,10 +651,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
if(order > 6) { /* order == 7, 8 */
|
||||
if(order == 8) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -665,27 +665,27 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[7] * data[i-8];
|
||||
//sum += qlp_coeff[6] * data[i-7];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epu32(xmm7, xmm3);
|
||||
|
||||
//sum += qlp_coeff[5] * data[i-6];
|
||||
//sum += qlp_coeff[4] * data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -696,9 +696,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
}
|
||||
else { /* order == 7 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -713,21 +713,21 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
|
||||
//sum += qlp_coeff[5] * data[i-6];
|
||||
//sum += qlp_coeff[4] * data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -740,9 +740,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
else { /* order == 5, 6 */
|
||||
if(order == 6) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -752,20 +752,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[5] * data[i-6];
|
||||
//sum += qlp_coeff[4] * data[i-5];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epu32(xmm7, xmm2);
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -776,8 +776,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
}
|
||||
else { /* order == 5 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -791,14 +791,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -813,8 +813,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
if(order > 2) { /* order == 3, 4 */
|
||||
if(order == 4) {
|
||||
__m128i xmm0, xmm1, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -823,13 +823,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[3] * data[i-4];
|
||||
//sum += qlp_coeff[2] * data[i-3];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epu32(xmm7, xmm1);
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -840,7 +840,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
}
|
||||
else { /* order == 3 */
|
||||
__m128i xmm0, xmm1, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -853,7 +853,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epu32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi32(xmm7, xmm6);
|
||||
@ -866,14 +866,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
|
||||
else { /* order == 1, 2 */
|
||||
if(order == 2) {
|
||||
__m128i xmm0, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
|
||||
for(i = 0; i < (int)data_len; i++) {
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[1] * data[i-2];
|
||||
//sum += qlp_coeff[0] * data[i-1];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epu32(xmm7, xmm0);
|
||||
|
||||
|
@ -67,12 +67,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
if(order > 10) { /* order == 11, 12 */
|
||||
if(order == 12) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0]
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2]
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4]
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6]
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8]
|
||||
xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)); // 0 0 q[1] q[0]
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)); // 0 0 q[3] q[2]
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)); // 0 0 q[5] q[4]
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)); // 0 0 q[7] q[6]
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)); // 0 0 q[9] q[8]
|
||||
xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0 0 q[11] q[10]
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
|
||||
@ -85,41 +85,41 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
|
||||
//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12)); // 0 0 d[i-11] d[i-12]
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
|
||||
xmm7 = _mm_mul_epi32(xmm7, xmm5);
|
||||
|
||||
//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
|
||||
//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm4);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
|
||||
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
|
||||
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -130,11 +130,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
}
|
||||
else { /* order == 11 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
|
||||
xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -151,35 +151,35 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
|
||||
//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
|
||||
//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm4);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
|
||||
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
|
||||
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -192,11 +192,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
else { /* order == 9, 10 */
|
||||
if(order == 10) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -208,34 +208,34 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
|
||||
//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epi32(xmm7, xmm4);
|
||||
|
||||
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
|
||||
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
|
||||
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -246,10 +246,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
}
|
||||
else { /* order == 9 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -265,28 +265,28 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
|
||||
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
|
||||
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm3);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
|
||||
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -301,10 +301,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
if(order > 6) { /* order == 7, 8 */
|
||||
if(order == 8) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -315,27 +315,27 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
|
||||
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epi32(xmm7, xmm3);
|
||||
|
||||
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
|
||||
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -346,9 +346,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
}
|
||||
else { /* order == 7 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -363,21 +363,21 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
|
||||
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
|
||||
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm2);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -390,9 +390,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
else { /* order == 5, 6 */
|
||||
if(order == 6) {
|
||||
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -402,20 +402,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
|
||||
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epi32(xmm7, xmm2);
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -426,8 +426,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
}
|
||||
else { /* order == 5 */
|
||||
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -441,14 +441,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm1);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -463,8 +463,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
if(order > 2) { /* order == 3, 4 */
|
||||
if(order == 4) {
|
||||
__m128i xmm0, xmm1, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -473,13 +473,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
|
||||
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epi32(xmm7, xmm1);
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -490,7 +490,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
}
|
||||
else { /* order == 3 */
|
||||
__m128i xmm0, xmm1, xmm6, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
|
||||
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
@ -503,7 +503,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm6 = _mm_mul_epi32(xmm6, xmm0);
|
||||
xmm7 = _mm_add_epi64(xmm7, xmm6);
|
||||
@ -516,14 +516,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
|
||||
else { /* order == 1, 2 */
|
||||
if(order == 2) {
|
||||
__m128i xmm0, xmm7;
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
|
||||
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
|
||||
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
|
||||
|
||||
for(i = 0; i < (int)data_len; i++) {
|
||||
//sum = 0;
|
||||
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
|
||||
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
|
||||
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
|
||||
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
|
||||
xmm7 = _mm_mul_epi32(xmm7, xmm0);
|
||||
|
||||
@ -606,22 +606,22 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
if(order > 10) { /* order == 11, 12 */
|
||||
__m128i qlp[6], dat[6];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0))); // 0 q[1] 0 q[0]
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2))); // 0 q[3] 0 q[2]
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4))); // 0 q[5] 0 q[4]
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6))); // 0 q[7] 0 q[6]
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+8))); // 0 q[9] 0 q[8]
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); // 0 q[1] 0 q[0]
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); // 0 q[3] 0 q[2]
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); // 0 q[5] 0 q[4]
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); // 0 q[7] 0 q[6]
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8))); // 0 q[9] 0 q[8]
|
||||
if (order == 12)
|
||||
qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+10))); // 0 q[11] 0 q[10]
|
||||
qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10))); // 0 q[11] 0 q[10]
|
||||
else
|
||||
qlp[5] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[10])); // 0 0 0 q[10]
|
||||
|
||||
dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
|
||||
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9]
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7]
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5]
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3]
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1]
|
||||
dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
|
||||
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9]
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7]
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5]
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3]
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1]
|
||||
|
||||
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
|
||||
@ -660,20 +660,20 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
else { /* order == 9, 10 */
|
||||
__m128i qlp[5], dat[5];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6)));
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
|
||||
if (order == 10)
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+8)));
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)));
|
||||
else
|
||||
qlp[4] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[8]));
|
||||
|
||||
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-10)), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
|
||||
@ -711,18 +711,18 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
if(order > 6) { /* order == 7, 8 */
|
||||
__m128i qlp[4], dat[4];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
|
||||
if (order == 8)
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6)));
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
|
||||
else
|
||||
qlp[3] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[6]));
|
||||
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
|
||||
@ -755,16 +755,16 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
else { /* order == 5, 6 */
|
||||
__m128i qlp[3], dat[3];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
if (order == 6)
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
|
||||
else
|
||||
qlp[2] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[4]));
|
||||
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
|
||||
@ -796,14 +796,14 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
if(order > 2) { /* order == 3, 4 */
|
||||
__m128i qlp[2], dat[2];
|
||||
__m128i summ, temp;
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
|
||||
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
|
||||
if (order == 4)
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
|
||||
else
|
||||
qlp[1] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[2]));
|
||||
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
|
||||
@ -831,9 +831,9 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
if(order == 2) {
|
||||
__m128i qlp0, dat0;
|
||||
__m128i summ, temp;
|
||||
qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff)));
|
||||
qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff)));
|
||||
|
||||
dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
summ = _mm_mul_epi32(dat0, qlp0);
|
||||
|
||||
@ -878,7 +878,7 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
__m128i qlp[16];
|
||||
|
||||
for(i = 0; i < (int)order/2; i++)
|
||||
qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1]
|
||||
qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1]
|
||||
if(order & 1)
|
||||
qlp[i] = _mm_shuffle_epi32(_mm_cvtsi32_si128(qlp_coeff[i*2]), _MM_SHUFFLE(2,0,3,1));
|
||||
|
||||
@ -888,46 +888,46 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
|
||||
|
||||
switch((order+1) / 2) {
|
||||
case 16: /* order == 31, 32 */
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-32)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-32)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[15])); /* Falls through. */
|
||||
case 15:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-30)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-30)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[14])); /* Falls through. */
|
||||
case 14:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-28)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-28)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[13])); /* Falls through. */
|
||||
case 13:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-26)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-26)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[12])); /* Falls through. */
|
||||
case 12:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-24)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-24)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[11])); /* Falls through. */
|
||||
case 11:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-22)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-22)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[10])); /* Falls through. */
|
||||
case 10:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-20)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-20)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[9])); /* Falls through. */
|
||||
case 9:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-18)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-18)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[8])); /* Falls through. */
|
||||
case 8:
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-16)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-16)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[7])); /* Falls through. */
|
||||
case 7: /* order == 13, 14 */
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-14)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-14)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[6]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-12)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-12)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[5]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-10)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-10)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[4]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-8)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-8)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[3]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-6)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-6)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[2]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-4)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-4)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[1]));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-2)));
|
||||
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-2)));
|
||||
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[0]));
|
||||
}
|
||||
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
|
||||
@ -958,9 +958,9 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
|
||||
__m128i qlp[3], dat[3];
|
||||
__m128i summ, temp;
|
||||
|
||||
qlp[0] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0]
|
||||
qlp[1] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4]
|
||||
qlp[2] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8]
|
||||
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0]
|
||||
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4]
|
||||
qlp[2] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8]
|
||||
switch (order)
|
||||
{
|
||||
case 9:
|
||||
@ -971,9 +971,9 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
|
||||
qlp[2] = _mm_slli_si128(qlp[2], 4); qlp[2] = _mm_srli_si128(qlp[2], 4); break; // 0 q[10] q[9] q[8]
|
||||
}
|
||||
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9]
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5]
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1]
|
||||
dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9]
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5]
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1]
|
||||
|
||||
for (i = 0;;) {
|
||||
summ = _mm_mullo_epi32(dat[2], qlp[2]);
|
||||
@ -1000,11 +1000,11 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
|
||||
__m128i qlp[2], dat[2];
|
||||
__m128i summ, temp;
|
||||
|
||||
qlp[0] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 0));
|
||||
qlp[1] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 4));
|
||||
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0));
|
||||
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4));
|
||||
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3));
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3));
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3));
|
||||
|
||||
for (i = 0;;) {
|
||||
summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));
|
||||
@ -1053,9 +1053,9 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
|
||||
__m128i qlp[2], dat[2];
|
||||
__m128i summ, temp;
|
||||
|
||||
qlp[0] = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
|
||||
temp = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
|
||||
qlp[1] = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8]
|
||||
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
|
||||
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
|
||||
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8]
|
||||
switch(order)
|
||||
{
|
||||
case 9:
|
||||
@ -1068,9 +1068,9 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
|
||||
qlp[0] = _mm_packs_epi32(qlp[0], temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
|
||||
qlp[1] = _mm_packs_epi32(qlp[1], _mm_setzero_si128()); // 0 0 0 0 q[11] q[10] q[9] q[8]
|
||||
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9]
|
||||
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5]
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1]
|
||||
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9]
|
||||
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5]
|
||||
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1]
|
||||
|
||||
dat[1] = _mm_packs_epi32(dat[1], _mm_setzero_si128()); // 0 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9]
|
||||
dat[0] = _mm_packs_epi32(dat[0], temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
|
||||
@ -1098,12 +1098,12 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
|
||||
__m128i qlp0, dat0;
|
||||
__m128i summ, temp;
|
||||
|
||||
qlp0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
|
||||
temp = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
|
||||
qlp0 = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
|
||||
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
|
||||
qlp0 = _mm_packs_epi32(qlp0, temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
|
||||
|
||||
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-8)), _MM_SHUFFLE(0,1,2,3));
|
||||
dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-4)), _MM_SHUFFLE(0,1,2,3));
|
||||
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3));
|
||||
dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3));
|
||||
dat0 = _mm_packs_epi32(dat0, temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
|
||||
|
||||
for(i = 0;;) {
|
||||
@ -1164,20 +1164,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
|
||||
mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
|
||||
mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 11 */
|
||||
@ -1196,19 +1196,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
|
||||
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
|
||||
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1228,18 +1228,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
|
||||
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
|
||||
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 9 */
|
||||
@ -1256,17 +1256,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
|
||||
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1286,16 +1286,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
|
||||
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 7 */
|
||||
@ -1310,15 +1310,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
|
||||
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1334,14 +1334,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
|
||||
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 5 */
|
||||
@ -1354,13 +1354,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
|
||||
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1376,12 +1376,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
|
||||
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 3 */
|
||||
@ -1392,11 +1392,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
|
||||
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1408,10 +1408,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ, mull;
|
||||
summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
|
||||
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
else { /* order == 1 */
|
||||
@ -1420,9 +1420,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
|
||||
|
||||
for(i = 0; i < (int)data_len-3; i+=4) {
|
||||
__m128i summ;
|
||||
summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
|
||||
summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
|
||||
summ = _mm_sra_epi32(summ, cnt);
|
||||
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
|
||||
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -67,14 +67,14 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
|
||||
end += default_partition_samples;
|
||||
|
||||
for( ; (int)residual_sample < (int)end-7; residual_sample+=8) {
|
||||
__m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample)));
|
||||
__m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(residual+residual_sample)));
|
||||
sum256 = _mm256_add_epi32(sum256, res256);
|
||||
}
|
||||
|
||||
sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
|
||||
|
||||
for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
|
||||
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
|
||||
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
|
||||
sum128 = _mm_add_epi32(sum128, res128);
|
||||
}
|
||||
|
||||
@ -99,7 +99,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
|
||||
end += default_partition_samples;
|
||||
|
||||
for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
|
||||
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
|
||||
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
|
||||
__m256i res256 = _mm256_cvtepu32_epi64(res128);
|
||||
sum256 = _mm256_add_epi64(sum256, res256);
|
||||
}
|
||||
@ -107,7 +107,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
|
||||
sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
|
||||
|
||||
for( ; (int)residual_sample < (int)end-1; residual_sample+=2) {
|
||||
__m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample)));
|
||||
__m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(residual+residual_sample)));
|
||||
res128 = _mm_cvtepu32_epi64(res128);
|
||||
sum128 = _mm_add_epi64(sum128, res128);
|
||||
}
|
||||
@ -118,7 +118,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
|
||||
}
|
||||
|
||||
sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8));
|
||||
_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128);
|
||||
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), sum128);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
|
||||
}
|
||||
|
||||
for( ; residual_sample < e3; residual_sample+=4) {
|
||||
__m128i mm_res = local_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
|
||||
__m128i mm_res = local_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
|
||||
mm_sum = _mm_add_epi32(mm_sum, mm_res);
|
||||
}
|
||||
|
||||
@ -121,7 +121,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
|
||||
}
|
||||
|
||||
for( ; residual_sample < e3; residual_sample+=2) {
|
||||
__m128i mm_res = local_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
|
||||
__m128i mm_res = local_abs_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
|
||||
mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */
|
||||
mm_sum = _mm_add_epi64(mm_sum, mm_res);
|
||||
}
|
||||
@ -132,7 +132,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
|
||||
}
|
||||
|
||||
mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
|
||||
_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
|
||||
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -77,7 +77,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
|
||||
}
|
||||
|
||||
for( ; residual_sample < e3; residual_sample+=4) {
|
||||
__m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
|
||||
__m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
|
||||
mm_sum = _mm_add_epi32(mm_sum, mm_res);
|
||||
}
|
||||
|
||||
@ -110,7 +110,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
|
||||
}
|
||||
|
||||
for( ; residual_sample < e3; residual_sample+=2) {
|
||||
__m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
|
||||
__m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
|
||||
mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */
|
||||
mm_sum = _mm_add_epi64(mm_sum, mm_res);
|
||||
}
|
||||
@ -121,7 +121,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
|
||||
}
|
||||
|
||||
mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
|
||||
_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
|
||||
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user