Silence clang warnings on alignment of unaligned loads

Clang throws *a lot* of warnings on alignment requirement increase
where no alignment is required at all. This is a results of the
way intrinsics were implemented long ago. See
https://stackoverflow.com/questions/71279668/why-does-clang-complain-about-alignment-on-sse-intrinsic-unaligned-loads

This commit silences these warnings by first casting the pointer
to (const void*) or (void*) before casting the (const __m128i*),
(__m128i*), (const __m256i*) or (__m256i*)

Compiling with and without this patch returns exactly the same
binary for GCC 9.3 and clang 10.0
This commit is contained in:
Martijn van Beurden 2022-02-28 21:11:44 +01:00
parent 179cdce1db
commit be1df4085a
6 changed files with 685 additions and 685 deletions

View File

@ -77,20 +77,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-12)));
mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 11 */
@ -109,19 +109,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11)));
mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -141,18 +141,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10)));
mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 9 */
@ -169,17 +169,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 )));
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9 )));
mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -199,16 +199,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 )));
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8 )));
mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 7 */
@ -223,15 +223,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 )));
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7 )));
mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -247,14 +247,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 )));
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6 )));
mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 5 */
@ -267,13 +267,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 )));
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5 )));
mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -289,12 +289,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 )));
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4 )));
mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 3 */
@ -305,11 +305,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 )));
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3 )));
mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -321,10 +321,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 )));
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2 )));
mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 1 */
@ -333,9 +333,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC_
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ;
summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 )));
summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1 )));
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -432,20 +432,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-12)));
mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 11 */
@ -464,19 +464,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-11)));
mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -496,18 +496,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-10)));
mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 9 */
@ -524,17 +524,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9)));
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-9)));
mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -554,16 +554,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8)));
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-8)));
mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 7 */
@ -578,15 +578,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7)));
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-7)));
mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -602,14 +602,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6)));
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-6)));
mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 5 */
@ -622,13 +622,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5)));
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-5)));
mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -644,12 +644,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4)));
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-4)));
mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 3 */
@ -660,11 +660,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3)));
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-3)));
mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -676,10 +676,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ, mull;
summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2)));
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-2)));
mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
else { /* order == 1 */
@ -688,9 +688,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__in
for(i = 0; i < (int)data_len-7; i+=8) {
__m256i summ;
summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1)));
summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(const void*)(data+i-1)));
summ = _mm256_sra_epi32(summ, cnt);
_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
_mm256_storeu_si256((__m256i*)(void*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(data+i)), summ));
}
}
}
@ -765,7 +765,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
int i;
FLAC__int64 sum;
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
const __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
const __m256i pack = _mm256_loadu_si256((const __m256i *)(const void*)pack_arr);
FLAC__ASSERT(order > 0);
FLAC__ASSERT(order <= 32);
@ -791,20 +791,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-12))));
mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
else { /* order == 11 */
@ -823,19 +823,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-11))));
mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
}
@ -855,18 +855,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-10))));
mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
else { /* order == 9 */
@ -883,17 +883,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-9 ))));
mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
}
@ -913,16 +913,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-8 ))));
mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
else { /* order == 7 */
@ -937,15 +937,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-7 ))));
mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
}
@ -961,14 +961,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-6 ))));
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
else { /* order == 5 */
@ -981,13 +981,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-5 ))));
mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
}
@ -1003,12 +1003,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-4 ))));
mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
else { /* order == 3 */
@ -1019,11 +1019,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-3 ))));
mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
}
@ -1035,10 +1035,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ, mull;
summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-2 ))));
mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
else { /* order == 1 */
@ -1047,9 +1047,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
for(i = 0; i < (int)data_len-3; i+=4) {
__m256i summ;
summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(const void*)(data+i-1 ))));
summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), _mm256_castsi256_si128(summ)));
}
}
}

View File

@ -80,20 +80,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 11 */
@ -112,19 +112,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -144,18 +144,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 9 */
@ -172,17 +172,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -202,16 +202,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 7 */
@ -226,15 +226,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -250,14 +250,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 5 */
@ -270,13 +270,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -292,12 +292,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 3 */
@ -308,11 +308,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -324,10 +324,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 1 */
@ -336,9 +336,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ;
summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -417,12 +417,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
if(order > 10) { /* order == 11, 12 */
if(order == 12) {
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0]
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2]
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4]
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6]
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8]
xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)); // 0 0 q[1] q[0]
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)); // 0 0 q[3] q[2]
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)); // 0 0 q[5] q[4]
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)); // 0 0 q[7] q[6]
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)); // 0 0 q[9] q[8]
xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0 0 q[11] q[10]
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
@ -435,41 +435,41 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum = 0;
//sum += qlp_coeff[11] * data[i-12];
//sum += qlp_coeff[10] * data[i-11];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12)); // 0 0 d[i-11] d[i-12]
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
//sum += qlp_coeff[9] * data[i-10];
//sum += qlp_coeff[8] * data[i-9];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm4);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[7] * data[i-8];
//sum += qlp_coeff[6] * data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm3);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[5] * data[i-6];
//sum += qlp_coeff[4] * data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm2);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -480,11 +480,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
}
else { /* order == 11 */
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -501,35 +501,35 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum += qlp_coeff[9] * data[i-10];
//sum += qlp_coeff[8] * data[i-9];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm4);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[7] * data[i-8];
//sum += qlp_coeff[6] * data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm3);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[5] * data[i-6];
//sum += qlp_coeff[4] * data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm2);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -542,11 +542,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
else { /* order == 9, 10 */
if(order == 10) {
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -558,34 +558,34 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum = 0;
//sum += qlp_coeff[9] * data[i-10];
//sum += qlp_coeff[8] * data[i-9];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epu32(xmm7, xmm4);
//sum += qlp_coeff[7] * data[i-8];
//sum += qlp_coeff[6] * data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm3);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[5] * data[i-6];
//sum += qlp_coeff[4] * data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm2);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -596,10 +596,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
}
else { /* order == 9 */
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -615,28 +615,28 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum += qlp_coeff[7] * data[i-8];
//sum += qlp_coeff[6] * data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm3);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[5] * data[i-6];
//sum += qlp_coeff[4] * data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm2);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -651,10 +651,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
if(order > 6) { /* order == 7, 8 */
if(order == 8) {
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -665,27 +665,27 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum = 0;
//sum += qlp_coeff[7] * data[i-8];
//sum += qlp_coeff[6] * data[i-7];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epu32(xmm7, xmm3);
//sum += qlp_coeff[5] * data[i-6];
//sum += qlp_coeff[4] * data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm2);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -696,9 +696,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
}
else { /* order == 7 */
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -713,21 +713,21 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum += qlp_coeff[5] * data[i-6];
//sum += qlp_coeff[4] * data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm2);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -740,9 +740,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
else { /* order == 5, 6 */
if(order == 6) {
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -752,20 +752,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum = 0;
//sum += qlp_coeff[5] * data[i-6];
//sum += qlp_coeff[4] * data[i-5];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epu32(xmm7, xmm2);
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -776,8 +776,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
}
else { /* order == 5 */
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -791,14 +791,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm1);
xmm7 = _mm_add_epi32(xmm7, xmm6);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -813,8 +813,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
if(order > 2) { /* order == 3, 4 */
if(order == 4) {
__m128i xmm0, xmm1, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -823,13 +823,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum = 0;
//sum += qlp_coeff[3] * data[i-4];
//sum += qlp_coeff[2] * data[i-3];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epu32(xmm7, xmm1);
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -840,7 +840,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
}
else { /* order == 3 */
__m128i xmm0, xmm1, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -853,7 +853,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epu32(xmm6, xmm0);
xmm7 = _mm_add_epi32(xmm7, xmm6);
@ -866,14 +866,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__in
else { /* order == 1, 2 */
if(order == 2) {
__m128i xmm0, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
for(i = 0; i < (int)data_len; i++) {
//sum = 0;
//sum += qlp_coeff[1] * data[i-2];
//sum += qlp_coeff[0] * data[i-1];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epu32(xmm7, xmm0);

View File

@ -67,12 +67,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
if(order > 10) { /* order == 11, 12 */
if(order == 12) {
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0]
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2]
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4]
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6]
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8]
xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)); // 0 0 q[1] q[0]
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)); // 0 0 q[3] q[2]
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)); // 0 0 q[5] q[4]
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)); // 0 0 q[7] q[6]
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)); // 0 0 q[9] q[8]
xmm5 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10)); // 0 0 q[11] q[10]
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
@ -85,41 +85,41 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum = 0;
//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-12)); // 0 0 d[i-11] d[i-12]
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
xmm7 = _mm_mul_epi32(xmm7, xmm5);
//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm4);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm3);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm2);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -130,11 +130,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
else { /* order == 11 */
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -151,35 +151,35 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm4);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm3);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm2);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -192,11 +192,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
else { /* order == 9, 10 */
if(order == 10) {
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm4 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -208,34 +208,34 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum = 0;
//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-10));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epi32(xmm7, xmm4);
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm3);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm2);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -246,10 +246,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
else { /* order == 9 */
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -265,28 +265,28 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm3);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm2);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -301,10 +301,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
if(order > 6) { /* order == 7, 8 */
if(order == 8) {
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -315,27 +315,27 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum = 0;
//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-8));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epi32(xmm7, xmm3);
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm2);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -346,9 +346,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
else { /* order == 7 */
__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -363,21 +363,21 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm2);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -390,9 +390,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
else { /* order == 5, 6 */
if(order == 6) {
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -402,20 +402,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum = 0;
//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-6));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epi32(xmm7, xmm2);
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -426,8 +426,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
else { /* order == 5 */
__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -441,14 +441,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm1);
xmm7 = _mm_add_epi64(xmm7, xmm6);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -463,8 +463,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
if(order > 2) { /* order == 3, 4 */
if(order == 4) {
__m128i xmm0, xmm1, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
@ -473,13 +473,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum = 0;
//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-4));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epi32(xmm7, xmm1);
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -490,7 +490,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
else { /* order == 3 */
__m128i xmm0, xmm1, xmm6, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
@ -503,7 +503,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm6 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
xmm6 = _mm_mul_epi32(xmm6, xmm0);
xmm7 = _mm_add_epi64(xmm7, xmm6);
@ -516,14 +516,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
else { /* order == 1, 2 */
if(order == 2) {
__m128i xmm0, xmm7;
xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
xmm0 = _mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0));
xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
for(i = 0; i < (int)data_len; i++) {
//sum = 0;
//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
xmm7 = _mm_loadl_epi64((const __m128i*)(const void*)(data+i-2));
xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
xmm7 = _mm_mul_epi32(xmm7, xmm0);
@ -606,22 +606,22 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
if(order > 10) { /* order == 11, 12 */
__m128i qlp[6], dat[6];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0))); // 0 q[1] 0 q[0]
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2))); // 0 q[3] 0 q[2]
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4))); // 0 q[5] 0 q[4]
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6))); // 0 q[7] 0 q[6]
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+8))); // 0 q[9] 0 q[8]
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); // 0 q[1] 0 q[0]
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); // 0 q[3] 0 q[2]
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); // 0 q[5] 0 q[4]
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); // 0 q[7] 0 q[6]
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8))); // 0 q[9] 0 q[8]
if (order == 12)
qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+10))); // 0 q[11] 0 q[10]
qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10))); // 0 q[11] 0 q[10]
else
qlp[5] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[10])); // 0 0 0 q[10]
dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9]
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7]
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5]
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3]
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1]
dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9]
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7]
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5]
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3]
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1]
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
@ -660,20 +660,20 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
else { /* order == 9, 10 */
__m128i qlp[5], dat[5];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6)));
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
if (order == 10)
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+8)));
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)));
else
qlp[4] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[8]));
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-10)), _MM_SHUFFLE(2,0,3,1));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
@ -711,18 +711,18 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
if(order > 6) { /* order == 7, 8 */
__m128i qlp[4], dat[4];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
if (order == 8)
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+6)));
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
else
qlp[3] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[6]));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
@ -755,16 +755,16 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
else { /* order == 5, 6 */
__m128i qlp[3], dat[3];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
if (order == 6)
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+4)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
else
qlp[2] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[4]));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
@ -796,14 +796,14 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
if(order > 2) { /* order == 3, 4 */
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+0)));
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
if (order == 4)
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff+2)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
else
qlp[1] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[2]));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
@ -831,9 +831,9 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
if(order == 2) {
__m128i qlp0, dat0;
__m128i summ, temp;
qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(qlp_coeff)));
qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff)));
dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat0, qlp0);
@ -878,7 +878,7 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
__m128i qlp[16];
for(i = 0; i < (int)order/2; i++)
qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1]
qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1]
if(order & 1)
qlp[i] = _mm_shuffle_epi32(_mm_cvtsi32_si128(qlp_coeff[i*2]), _MM_SHUFFLE(2,0,3,1));
@ -888,46 +888,46 @@ void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], ui
switch((order+1) / 2) {
case 16: /* order == 31, 32 */
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-32)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-32)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[15])); /* Falls through. */
case 15:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-30)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-30)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[14])); /* Falls through. */
case 14:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-28)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-28)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[13])); /* Falls through. */
case 13:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-26)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-26)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[12])); /* Falls through. */
case 12:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-24)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-24)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[11])); /* Falls through. */
case 11:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-22)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-22)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[10])); /* Falls through. */
case 10:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-20)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-20)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[9])); /* Falls through. */
case 9:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-18)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-18)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[8])); /* Falls through. */
case 8:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-16)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-16)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[7])); /* Falls through. */
case 7: /* order == 13, 14 */
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-14)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-14)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[6]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-12)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-12)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[5]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-10)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-10)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[4]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-8)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-8)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[3]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-6)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-6)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[2]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-4)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-4)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[1]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(datai-2)));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-2)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[0]));
}
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
@ -958,9 +958,9 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
__m128i qlp[3], dat[3];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0]
qlp[1] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4]
qlp[2] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8]
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0]
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4]
qlp[2] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8]
switch (order)
{
case 9:
@ -971,9 +971,9 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
qlp[2] = _mm_slli_si128(qlp[2], 4); qlp[2] = _mm_srli_si128(qlp[2], 4); break; // 0 q[10] q[9] q[8]
}
dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9]
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1]
dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9]
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1]
for (i = 0;;) {
summ = _mm_mullo_epi32(dat[2], qlp[2]);
@ -1000,11 +1000,11 @@ void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 0));
qlp[1] = _mm_loadu_si128((const __m128i*)(qlp_coeff + 4));
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0));
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4));
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3));
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3));
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3));
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3));
for (i = 0;;) {
summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));
@ -1053,9 +1053,9 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp[1] = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8]
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8]
switch(order)
{
case 9:
@ -1068,9 +1068,9 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
qlp[0] = _mm_packs_epi32(qlp[0], temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
qlp[1] = _mm_packs_epi32(qlp[1], _mm_setzero_si128()); // 0 0 0 0 q[11] q[10] q[9] q[8]
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9]
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1]
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9]
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1]
dat[1] = _mm_packs_epi32(dat[1], _mm_setzero_si128()); // 0 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9]
dat[0] = _mm_packs_epi32(dat[0], temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
@ -1098,12 +1098,12 @@ void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint
__m128i qlp0, dat0;
__m128i summ, temp;
qlp0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp0 = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp0 = _mm_packs_epi32(qlp0, temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-8)), _MM_SHUFFLE(0,1,2,3));
dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(data-4)), _MM_SHUFFLE(0,1,2,3));
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3));
dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3));
dat0 = _mm_packs_epi32(dat0, temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
for(i = 0;;) {
@ -1164,20 +1164,20 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(const void*)(data+i-12)));
mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 11 */
@ -1196,19 +1196,19 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(const void*)(data+i-11)));
mull = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -1228,18 +1228,18 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q9, _mm_loadu_si128((const __m128i*)(const void*)(data+i-10)));
mull = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 9 */
@ -1256,17 +1256,17 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q8, _mm_loadu_si128((const __m128i*)(const void*)(data+i-9)));
mull = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -1286,16 +1286,16 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q7, _mm_loadu_si128((const __m128i*)(const void*)(data+i-8)));
mull = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 7 */
@ -1310,15 +1310,15 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q6, _mm_loadu_si128((const __m128i*)(const void*)(data+i-7)));
mull = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -1334,14 +1334,14 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q5, _mm_loadu_si128((const __m128i*)(const void*)(data+i-6)));
mull = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 5 */
@ -1354,13 +1354,13 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q4, _mm_loadu_si128((const __m128i*)(const void*)(data+i-5)));
mull = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -1376,12 +1376,12 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q3, _mm_loadu_si128((const __m128i*)(const void*)(data+i-4)));
mull = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 3 */
@ -1392,11 +1392,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q2, _mm_loadu_si128((const __m128i*)(const void*)(data+i-3)));
mull = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}
@ -1408,10 +1408,10 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ, mull;
summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_mullo_epi32(q1, _mm_loadu_si128((const __m128i*)(const void*)(data+i-2)));
mull = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
else { /* order == 1 */
@ -1420,9 +1420,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__i
for(i = 0; i < (int)data_len-3; i+=4) {
__m128i summ;
summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
summ = _mm_mullo_epi32(q0, _mm_loadu_si128((const __m128i*)(const void*)(data+i-1)));
summ = _mm_sra_epi32(summ, cnt);
_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
_mm_storeu_si128((__m128i*)(void*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data+i)), summ));
}
}
}

View File

@ -67,14 +67,14 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
end += default_partition_samples;
for( ; (int)residual_sample < (int)end-7; residual_sample+=8) {
__m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample)));
__m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(const void*)(residual+residual_sample)));
sum256 = _mm256_add_epi32(sum256, res256);
}
sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
sum128 = _mm_add_epi32(sum128, res128);
}
@ -99,7 +99,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
end += default_partition_samples;
for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
__m256i res256 = _mm256_cvtepu32_epi64(res128);
sum256 = _mm256_add_epi64(sum256, res256);
}
@ -107,7 +107,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));
for( ; (int)residual_sample < (int)end-1; residual_sample+=2) {
__m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample)));
__m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(residual+residual_sample)));
res128 = _mm_cvtepu32_epi64(res128);
sum128 = _mm_add_epi64(sum128, res128);
}
@ -118,7 +118,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual
}
sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8));
_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128);
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), sum128);
}
}
}

View File

@ -88,7 +88,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
}
for( ; residual_sample < e3; residual_sample+=4) {
__m128i mm_res = local_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
__m128i mm_res = local_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
mm_sum = _mm_add_epi32(mm_sum, mm_res);
}
@ -121,7 +121,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
}
for( ; residual_sample < e3; residual_sample+=2) {
__m128i mm_res = local_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
__m128i mm_res = local_abs_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */
mm_sum = _mm_add_epi64(mm_sum, mm_res);
}
@ -132,7 +132,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual
}
mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum);
}
}
}

View File

@ -77,7 +77,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
}
for( ; residual_sample < e3; residual_sample+=4) {
__m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
__m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(const void*)(residual+residual_sample)));
mm_sum = _mm_add_epi32(mm_sum, mm_res);
}
@ -110,7 +110,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
}
for( ; residual_sample < e3; residual_sample+=2) {
__m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
__m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(residual+residual_sample))); /* 0 0 |r1| |r0| */
mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */
mm_sum = _mm_add_epi64(mm_sum, mm_res);
}
@ -121,7 +121,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua
}
mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
_mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum);
}
}
}