Fixed the SSSE3 AVC444v2 chroma decoder (ssse3_ChromaV2ToYUV444).

This commit is contained in:
Armin Novak 2017-04-18 15:03:38 +02:00
parent a2167d5417
commit 06d4e9ac73

View File

@ -945,8 +945,10 @@ static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcSte
const UINT32 quaterWidth = (nWidth + 3) / 4;
const UINT32 quaterPad = quaterWidth % 16;
const __m128i zero = _mm_setzero_si128();
const __m128i mask = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0,
0x80);
const __m128i mask = _mm_set_epi8(0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0,
0x80, 0);
const __m128i mask2 = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0,
0x80);
const __m128i shuffle1 = _mm_set_epi8(0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10,
0x80, 9, 0x80, 8);
const __m128i shuffle2 = _mm_set_epi8(0x80, 7, 0x80, 6, 0x80, 5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1,
@ -965,17 +967,17 @@ static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcSte
{
{
const __m128i u = _mm_loadu_si128((__m128i*)&pYaU[x]);
const __m128i u2 = _mm_unpackhi_epi8(u, zero);
const __m128i u1 = _mm_unpacklo_epi8(u, zero);
const __m128i u2 = _mm_unpackhi_epi8(zero, u);
const __m128i u1 = _mm_unpacklo_epi8(zero, u);
_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
}
{
const __m128i u = _mm_loadu_si128((__m128i*)&pYaV[x]);
const __m128i u2 = _mm_unpackhi_epi8(u, zero);
const __m128i u1 = _mm_unpacklo_epi8(u, zero);
_mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
_mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
const __m128i v = _mm_loadu_si128((__m128i*)&pYaV[x]);
const __m128i v2 = _mm_unpackhi_epi8(zero, v);
const __m128i v1 = _mm_unpacklo_epi8(zero, v);
_mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
_mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]);
}
}
@ -1008,24 +1010,24 @@ static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcSte
const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x + 0]);
_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
_mm_maskmoveu_si128(u3, mask, (char*)&pU[2 * x + 32]);
_mm_maskmoveu_si128(u4, mask, (char*)&pU[2 * x + 48]);
_mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]);
_mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]);
_mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]);
_mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
}
{
const __m128i uU = _mm_loadu_si128((__m128i*)&pUaV[x]);
const __m128i uV = _mm_loadu_si128((__m128i*)&pVaV[x]);
const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1);
const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2);
const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1);
_mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x + 0]);
_mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]);
_mm_maskmoveu_si128(u3, mask, (char*)&pV[2 * x + 32]);
_mm_maskmoveu_si128(u4, mask, (char*)&pV[2 * x + 48]);
const __m128i vU = _mm_loadu_si128((__m128i*)&pUaV[x]);
const __m128i vV = _mm_loadu_si128((__m128i*)&pVaV[x]);
const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);
const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1);
const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2);
const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1);
_mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]);
_mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]);
_mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]);
_mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]);
}
}