fixed_intrin_sse[23].c : Add new, simpler SSE code.

It's simpler but not faster, so it is disabled by default. Maybe it
will be faster on newer CPUs though.

Patch-from: lvqcl <lvqcl.mail@gmail.com>
This commit is contained in:
Erik de Castro Lopo 2014-09-21 09:28:36 +10:00
parent 6abc480387
commit 71875b0c75
2 changed files with 24 additions and 4 deletions

View File

@ -80,6 +80,7 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[]
__m128i err0, err1, tmp;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@ -87,7 +88,11 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[]
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#else
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#endif
tmp = _mm_slli_si128(err0, 12); // e0 0 0 0
last_error = _mm_srli_si128(err1, 4); // 0 e0 e1 e2
last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3
@ -172,6 +177,7 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 d
__m128i err0, err1, tmp;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@ -179,7 +185,11 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 d
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#else
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#endif
tmp = _mm_slli_si128(err0, 12); // e0 0 0 0
last_error = _mm_srli_si128(err1, 4); // 0 e0 e1 e2
last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3

View File

@ -80,6 +80,7 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[
__m128i err0, err1;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@ -87,7 +88,11 @@ unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#else
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#endif
last_error = _mm_alignr_epi8(err0, err1, 4); // e0 e1 e2 e3
err0 = _mm_abs_epi32(err0);
@ -166,6 +171,7 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32
__m128i err0, err1;
err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
#if 1 /* OPT_SSE */
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
err1 = _mm_sub_epi32(err1, last_error);
@ -173,7 +179,11 @@ unsigned FLAC__fixed_compute_best_predictor_wide_intrin_ssse3(const FLAC__int32
err1 = _mm_sub_epi32(err1, last_error);
last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#else
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
#endif
last_error = _mm_alignr_epi8(err0, err1, 4); // e0 e1 e2 e3
err0 = _mm_abs_epi32(err0);