Added SSSE3 UV average to AVC444v1

Armin Novak 2018-01-31 11:33:55 +01:00
parent 46159c4cab
commit eb8e9cb410
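
What this changes, in scalar terms: for the AVC444v1 main view the new DOUBLE_ROW helper no longer point-samples the chroma planes but averages each 2x2 block of full-resolution U and V. A minimal before/after sketch per block is below; BYTE is the WinPR typedef used by the file, all other names are illustrative and not part of the commit.

/* One 2x2 block of full-resolution U (the V plane is treated identically):
 *   u00 u01
 *   u10 u11 */
static BYTE main_view_u_before(BYTE u00, BYTE u01, BYTE u10, BYTE u11)
{
	(void)u01; (void)u10; (void)u11;
	return u00; /* previous behaviour: point sample at (2x, 2y) */
}

static BYTE main_view_u_after(BYTE u00, BYTE u01, BYTE u10, BYTE u11)
{
	/* truncating average of the four samples, matching
	 * _mm_srai_epi16(added, 2) in the SIMD code below */
	return (BYTE)((u00 + u01 + u10 + u11) >> 2);
}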


@@ -515,83 +515,184 @@ static pstatus_t ssse3_RGBToYUV420(
/* SSSE3 RGB -> AVC444-YUV conversion **/
/****************************************************************************/
static INLINE void ssse3_RGBToAVC444YUV_BGRX_ROW(
const BYTE* src, BYTE* ydst, BYTE* udst1, BYTE* udst2, BYTE* vdst1, BYTE* vdst2, BOOL isEvenRow,
UINT32 width)
static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
const BYTE* srcEven, const BYTE* srcOdd, BYTE* b1Even, BYTE* b1Odd, BYTE* b2,
BYTE* b3, BYTE* b4, BYTE* b5, BYTE* b6, BYTE* b7, UINT32 width)
{
UINT32 x;
__m128i vector128, y_factors, u_factors, v_factors, smask;
__m128i x1, x2, x3, x4, y, y1, y2, u, u1, u2, v, v1, v2;
const __m128i* argb = (const __m128i*) src;
__m128i* py = (__m128i*) ydst;
__m64* pu1 = (__m64*) udst1;
__m64* pu2 = (__m64*) udst2;
__m64* pv1 = (__m64*) vdst1;
__m64* pv2 = (__m64*) vdst2;
y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
vector128 = _mm_load_si128((__m128i*)const_buf_128b);
smask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
const __m128i* argbEven = (const __m128i*) srcEven;
const __m128i* argbOdd = (const __m128i*) srcOdd;
const __m128i y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
const __m128i u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
const __m128i v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
const __m128i vector128 = _mm_load_si128((__m128i*)const_buf_128b);
for (x = 0; x < width; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
x1 = _mm_load_si128(argb++); // 1st 4 pixels
x2 = _mm_load_si128(argb++); // 2nd 4 pixels
x3 = _mm_load_si128(argb++); // 3rd 4 pixels
x4 = _mm_load_si128(argb++); // 4th 4 pixels
/* Y: multiplications with subtotals and horizontal sums */
y1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, y_factors), _mm_maddubs_epi16(x2, y_factors));
y2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, y_factors), _mm_maddubs_epi16(x4, y_factors));
/* Y: shift the results (logical) */
y1 = _mm_srli_epi16(y1, 7);
y2 = _mm_srli_epi16(y2, 7);
/* Y: pack (unsigned) 16 words into bytes */
y = _mm_packus_epi16(y1, y2);
/* U: multiplications with subtotals and horizontal sums */
u1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, u_factors), _mm_maddubs_epi16(x2, u_factors));
u2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, u_factors), _mm_maddubs_epi16(x4, u_factors));
/* U: shift the results (arithmetic) */
u1 = _mm_srai_epi16(u1, 7);
u2 = _mm_srai_epi16(u2, 7);
/* U: pack (signed) 16 words into bytes */
u = _mm_packs_epi16(u1, u2);
/* U: add 128 */
u = _mm_add_epi8(u, vector128);
/* V: multiplications with subtotals and horizontal sums */
v1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, v_factors), _mm_maddubs_epi16(x2, v_factors));
v2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, v_factors), _mm_maddubs_epi16(x4, v_factors));
/* V: shift the results (arithmetic) */
v1 = _mm_srai_epi16(v1, 7);
v2 = _mm_srai_epi16(v2, 7);
/* V: pack (signed) 16 words into bytes */
v = _mm_packs_epi16(v1, v2);
/* V: add 128 */
v = _mm_add_epi8(v, vector128);
/* store y */
_mm_storeu_si128(py++, y);
/* store u and v */
if (isEvenRow)
const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
{
u = _mm_shuffle_epi8(u, smask);
v = _mm_shuffle_epi8(v, smask);
_mm_storel_pi(pu1++, _mm_castsi128_ps(u));
_mm_storeh_pi(pu2++, _mm_castsi128_ps(u));
_mm_storel_pi(pv1++, _mm_castsi128_ps(v));
_mm_storeh_pi(pv2++, _mm_castsi128_ps(v));
/* Y: multiplications with subtotals and horizontal sums */
const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
_mm_maddubs_epi16(xe2, y_factors)), 7);
const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
_mm_maddubs_epi16(xe4, y_factors)), 7);
const __m128i ye = _mm_packus_epi16(ye1, ye2);
const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
_mm_maddubs_epi16(xo2, y_factors)), 7);
const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
_mm_maddubs_epi16(xo4, y_factors)), 7);
const __m128i yo = _mm_packus_epi16(yo1, yo2);
/* store y [b1] */
_mm_storeu_si128((__m128i*)b1Even, ye);
b1Even += 16;
if (b1Odd)
{
_mm_storeu_si128((__m128i*)b1Odd, yo);
b1Odd += 16;
}
}
{
/* We have now
* 16 even U values in ue
* 16 odd U values in uo
*
* We need to split these according to
* 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
__m128i ue, uo;
{
const __m128i ue1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
_mm_maddubs_epi16(xe2, u_factors)), 7);
const __m128i ue2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
_mm_maddubs_epi16(xe4, u_factors)), 7);
ue = _mm_add_epi8(_mm_packs_epi16(ue1, ue2), vector128);
}
if (b1Odd)
{
const __m128i uo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
_mm_maddubs_epi16(xo2, u_factors)), 7);
const __m128i uo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
_mm_maddubs_epi16(xo4, u_factors)), 7);
uo = _mm_add_epi8(_mm_packs_epi16(uo1, uo2), vector128);
}
/* Now we need the following storage distribution:
* 2x 2y -> b2
* x 2y+1 -> b4
* 2x+1 2y -> b6 */
if (b1Odd) /* b2 */
{
const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
const __m128i hi = _mm_add_epi16(ueh, uoh);
const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
const __m128i lo = _mm_add_epi16(uel, uol);
const __m128i added = _mm_hadd_epi16(lo, hi);
const __m128i avg16 = _mm_srai_epi16(added, 2);
const __m128i avg = _mm_packus_epi16(avg16, avg16);
_mm_storel_epi64((__m128i*)b2, avg);
}
else
{
_mm_storel_pi(pu1, _mm_castsi128_ps(u));
_mm_storeh_pi(pu2, _mm_castsi128_ps(u));
_mm_storel_pi(pv1, _mm_castsi128_ps(v));
_mm_storeh_pi(pv2, _mm_castsi128_ps(v));
pu1 += 2;
pu2 += 2;
pv1 += 2;
pv2 += 2;
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 12, 10, 8, 6, 4, 2, 0);
const __m128i ud = _mm_shuffle_epi8(ue, mask);
_mm_storel_epi64((__m128i*)b2, ud);
}
b2 += 8;
if (b1Odd) /* b4 */
{
_mm_store_si128((__m128i*)b4, uo);
b4 += 16;
}
{
/* b6 */
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
15, 13, 11, 9, 7, 5, 3, 1);
const __m128i ude = _mm_shuffle_epi8(ue, mask);
_mm_storel_epi64((__m128i*)b6, ude);
b6 += 8;
}
}
{
/* We have now
* 16 even V values in ue
* 16 odd V values in uo
*
* We need to split these according to
* 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
__m128i ve, vo;
{
const __m128i ve1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
_mm_maddubs_epi16(xe2, v_factors)), 7);
const __m128i ve2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
_mm_maddubs_epi16(xe4, v_factors)), 7);
ve = _mm_add_epi8(_mm_packs_epi16(ve1, ve2), vector128);
}
if (b1Odd)
{
const __m128i vo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
_mm_maddubs_epi16(xo2, v_factors)), 7);
const __m128i vo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
_mm_maddubs_epi16(xo4, v_factors)), 7);
vo = _mm_add_epi8(_mm_packs_epi16(vo1, vo2), vector128);
}
/* Now we need the following storage distribution:
* 2x 2y -> b3
* x 2y+1 -> b5
* 2x+1 2y -> b7 */
if (b1Odd) /* b3 */
{
const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
const __m128i hi = _mm_add_epi16(veh, voh);
const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
const __m128i lo = _mm_add_epi16(vel, vol);
const __m128i added = _mm_hadd_epi16(lo, hi);
const __m128i avg16 = _mm_srai_epi16(added, 2);
const __m128i avg = _mm_packus_epi16(avg16, avg16);
_mm_storel_epi64((__m128i*)b3, avg);
}
else
{
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 12, 10, 8, 6, 4, 2, 0);
const __m128i vd = _mm_shuffle_epi8(ve, mask);
_mm_storel_epi64((__m128i*)b3, vd);
}
b3 += 8;
if (b1Odd) /* b5 */
{
_mm_store_si128((__m128i*)b5, vo);
b5 += 16;
}
{
/* b7 */
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
15, 13, 11, 9, 7, 5, 3, 1);
const __m128i vde = _mm_shuffle_epi8(ve, mask);
_mm_storel_epi64((__m128i*)b7, vde);
b7 += 8;
}
}
}
}
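
The storage-distribution comments in the hunk above ("2x 2y -> b2", "x 2y+1 -> b4", "2x+1 2y -> b6", and b3/b5/b7 for V) amount to the following scalar split of one even/odd pair of full-resolution U rows. This is only a sketch with illustrative names: BYTE/UINT32 are the WinPR types used throughout the file, width is assumed even (the SIMD path requires a multiple of 16), and V is distributed the same way into b3, b5 and b7.

static void split_u_row_pair(const BYTE* uEven, const BYTE* uOdd, UINT32 width,
                             BYTE* b2, BYTE* b4, BYTE* b6)
{
	UINT32 x;

	for (x = 0; x < width; x += 2)
	{
		/* b2, main view: averaged 2x2 block; on the unpaired last row of
		 * an odd-height image (uOdd == NULL) fall back to the even sample */
		b2[x / 2] = uOdd ? (BYTE)((uEven[x] + uEven[x + 1]
		                           + uOdd[x] + uOdd[x + 1]) >> 2)
		                 : uEven[x];
		/* b6, auxiliary view: odd columns of the even row */
		b6[x / 2] = uEven[x + 1];
	}

	/* b4, auxiliary view: the odd row is kept at full resolution */
	if (uOdd)
	{
		for (x = 0; x < width; x++)
			b4[x] = uOdd[x];
	}
}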
@@ -603,9 +704,7 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
BYTE* pDst2[3], const UINT32 dst2Step[3],
const prim_size_t* roi)
{
UINT32 y, numRows;
BOOL evenRow = TRUE;
BYTE* b1, *b2, *b3, *b4, *b5, *b6, *b7;
UINT32 y;
const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
if (roi->height < 1 || roi->width < 1)
@@ -614,28 +713,23 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
numRows = (roi->height + 1) & ~1;
for (y = 0; y < numRows; y++, evenRow = !evenRow)
for (y = 0; y < roi->height; y += 2)
{
const BYTE* src = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
UINT32 i = y >> 1;
b1 = pDst1[0] + y * dst1Step[0];
if (evenRow)
{
b2 = pDst1[1] + i * dst1Step[1];
b3 = pDst1[2] + i * dst1Step[2];
b6 = pDst2[1] + i * dst2Step[1];
b7 = pDst2[2] + i * dst2Step[2];
ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b2, b6, b3, b7, TRUE, roi->width);
}
else
{
b4 = pDst2[0] + dst2Step[0] * ((i & ~7) + i);
b5 = b4 + 8 * dst2Step[0];
ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b4, b4 + 8, b5, b5 + 8, FALSE, roi->width);
}
const BOOL last = (y >= (roi->height - 1));
const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
const UINT32 i = y >> 1;
const UINT32 n = (i & ~7) + i;
BYTE* b1Even = pDst1[0] + y * dst1Step[0];
BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
BYTE* b4 = pDst2[0] + dst2Step[0] * n;
BYTE* b5 = b4 + 8 * dst2Step[0];
BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
roi->width);
}
return PRIMITIVES_SUCCESS;
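
For reference, the auxiliary-plane row addressing in the loop above can be read as follows: with i = y / 2, the odd-row U block goes to auxiliary-luma row n = (i & ~7) + i and the odd-row V block 8 rows below it, so the auxiliary Y plane is filled in 16-row bands (8 rows of U followed by 8 rows of V). A tiny sketch of that arithmetic, with illustrative names:

static UINT32 aux_u_row(UINT32 i) { return (i & ~7) + i; }     /* b4 row */
static UINT32 aux_v_row(UINT32 i) { return aux_u_row(i) + 8; } /* b5 row */
/* i = 0..7  -> U rows 0..7,   V rows 8..15
 * i = 8..15 -> U rows 16..23, V rows 24..31 */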
@@ -775,8 +869,11 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 10, 6, 2, 12, 8, 4, 0);
const __m128i ud = _mm_shuffle_epi8(uo, mask);
_mm_stream_si32((int*)uChromaDst1, ((int*)&ud)[0]);
_mm_stream_si32((int*)vChromaDst1, ((int*)&ud)[1]);
int* uDst1 = (int*)uChromaDst1;
int* vDst1 = (int*)vChromaDst1;
const int* src = (const int*)&ud;
_mm_stream_si32(uDst1, src[0]);
_mm_stream_si32(vDst1, src[1]);
uChromaDst1 += 4;
vChromaDst1 += 4;
}
@@ -839,9 +936,12 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
{
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 10, 6, 2, 12, 8, 4, 0);
__m128i vd = _mm_shuffle_epi8(vo, mask);
_mm_stream_si32((int*)uChromaDst2, ((int*)&vd)[0]);
_mm_stream_si32((int*)vChromaDst2, ((int*)&vd)[1]);
const __m128i vd = _mm_shuffle_epi8(vo, mask);
int* uDst2 = (int*)uChromaDst2;
int* vDst2 = (int*)vChromaDst2;
const int* src = (const int*)&vd;
_mm_stream_si32(uDst2, src[0]);
_mm_stream_si32(vDst2, src[1]);
uChromaDst2 += 4;
vChromaDst2 += 4;
}
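
The last two hunks only swap the inline ((int*)&ud)[0] style casts for named pointers before _mm_stream_si32() and make vd const. As a point of comparison (not what the commit does), the same two 32-bit lanes could also be moved out of the shuffled vector in registers with SSE2 operations, avoiding the read back through a pointer to the vector; a sketch with illustrative names:

static INLINE void stream_low_two_lanes(__m128i packed, BYTE* uDst, BYTE* vDst)
{
	const int lane0 = _mm_cvtsi128_si32(packed);                    /* bytes 0..3 */
	const int lane1 = _mm_cvtsi128_si32(_mm_srli_si128(packed, 4)); /* bytes 4..7 */
	_mm_stream_si32((int*)uDst, lane0);
	_mm_stream_si32((int*)vDst, lane1);
}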