Added SSSE3 UV average to AVC444v1
parent 46159c4cab
commit eb8e9cb410
@@ -515,83 +515,184 @@ static pstatus_t ssse3_RGBToYUV420(
 /* SSSE3 RGB -> AVC444-YUV conversion                                      **/
 /****************************************************************************/

-static INLINE void ssse3_RGBToAVC444YUV_BGRX_ROW(
-	const BYTE* src, BYTE* ydst, BYTE* udst1, BYTE* udst2, BYTE* vdst1, BYTE* vdst2, BOOL isEvenRow,
-	UINT32 width)
+static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
+	const BYTE* srcEven, const BYTE* srcOdd, BYTE* b1Even, BYTE* b1Odd, BYTE* b2,
+	BYTE* b3, BYTE* b4, BYTE* b5, BYTE* b6, BYTE* b7, UINT32 width)
 {
 	UINT32 x;
-	__m128i vector128, y_factors, u_factors, v_factors, smask;
-	__m128i x1, x2, x3, x4, y, y1, y2, u, u1, u2, v, v1, v2;
-	const __m128i* argb = (const __m128i*) src;
-	__m128i* py = (__m128i*) ydst;
-	__m64* pu1 = (__m64*) udst1;
-	__m64* pu2 = (__m64*) udst2;
-	__m64* pv1 = (__m64*) vdst1;
-	__m64* pv2 = (__m64*) vdst2;
-	y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
-	u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
-	v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
-	vector128 = _mm_load_si128((__m128i*)const_buf_128b);
-	smask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+	const __m128i* argbEven = (const __m128i*) srcEven;
+	const __m128i* argbOdd = (const __m128i*) srcOdd;
+	const __m128i y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
+	const __m128i u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
+	const __m128i v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
+	const __m128i vector128 = _mm_load_si128((__m128i*)const_buf_128b);

 	for (x = 0; x < width; x += 16)
 	{
 		/* store 16 rgba pixels in 4 128 bit registers */
-		x1 = _mm_load_si128(argb++); // 1st 4 pixels
-		x2 = _mm_load_si128(argb++); // 2nd 4 pixels
-		x3 = _mm_load_si128(argb++); // 3rd 4 pixels
-		x4 = _mm_load_si128(argb++); // 4th 4 pixels
-		/* Y: multiplications with subtotals and horizontal sums */
-		y1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, y_factors), _mm_maddubs_epi16(x2, y_factors));
-		y2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, y_factors), _mm_maddubs_epi16(x4, y_factors));
-		/* Y: shift the results (logical) */
-		y1 = _mm_srli_epi16(y1, 7);
-		y2 = _mm_srli_epi16(y2, 7);
-		/* Y: pack (unsigned) 16 words into bytes */
-		y = _mm_packus_epi16(y1, y2);
-		/* U: multiplications with subtotals and horizontal sums */
-		u1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, u_factors), _mm_maddubs_epi16(x2, u_factors));
-		u2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, u_factors), _mm_maddubs_epi16(x4, u_factors));
-		/* U: shift the results (arithmetic) */
-		u1 = _mm_srai_epi16(u1, 7);
-		u2 = _mm_srai_epi16(u2, 7);
-		/* U: pack (signed) 16 words into bytes */
-		u = _mm_packs_epi16(u1, u2);
-		/* U: add 128 */
-		u = _mm_add_epi8(u, vector128);
-		/* V: multiplications with subtotals and horizontal sums */
-		v1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, v_factors), _mm_maddubs_epi16(x2, v_factors));
-		v2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, v_factors), _mm_maddubs_epi16(x4, v_factors));
-		/* V: shift the results (arithmetic) */
-		v1 = _mm_srai_epi16(v1, 7);
-		v2 = _mm_srai_epi16(v2, 7);
-		/* V: pack (signed) 16 words into bytes */
-		v = _mm_packs_epi16(v1, v2);
-		/* V: add 128 */
-		v = _mm_add_epi8(v, vector128);
-		/* store y */
-		_mm_storeu_si128(py++, y);
-
-		/* store u and v */
-		if (isEvenRow)
+		const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
+		const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
+		const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
+		const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
+		const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
+		const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
+		const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
+		const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
 		{
-			u = _mm_shuffle_epi8(u, smask);
-			v = _mm_shuffle_epi8(v, smask);
-			_mm_storel_pi(pu1++, _mm_castsi128_ps(u));
-			_mm_storeh_pi(pu2++, _mm_castsi128_ps(u));
-			_mm_storel_pi(pv1++, _mm_castsi128_ps(v));
-			_mm_storeh_pi(pv2++, _mm_castsi128_ps(v));
+			/* Y: multiplications with subtotals and horizontal sums */
+			const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
+			                                                  _mm_maddubs_epi16(xe2, y_factors)), 7);
+			const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
+			                                                  _mm_maddubs_epi16(xe4, y_factors)), 7);
+			const __m128i ye = _mm_packus_epi16(ye1, ye2);
+			const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
+			                                                  _mm_maddubs_epi16(xo2, y_factors)), 7);
+			const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
+			                                                  _mm_maddubs_epi16(xo4, y_factors)), 7);
+			const __m128i yo = _mm_packus_epi16(yo1, yo2);
+			/* store y [b1] */
+			_mm_storeu_si128((__m128i*)b1Even, ye);
+			b1Even += 16;
+
+			if (b1Odd)
+			{
+				_mm_storeu_si128((__m128i*)b1Odd, yo);
+				b1Odd += 16;
+			}
 		}
+		{
+			/* We have now
+			 * 16 even U values in ue
+			 * 16 odd U values in uo
+			 *
+			 * We need to split these according to
+			 * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+			__m128i ue, uo;
+			{
+				const __m128i ue1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
+				                                                  _mm_maddubs_epi16(xe2, u_factors)), 7);
+				const __m128i ue2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
+				                                                  _mm_maddubs_epi16(xe4, u_factors)), 7);
+				ue = _mm_add_epi8(_mm_packs_epi16(ue1, ue2), vector128);
+			}
+
+			if (b1Odd)
+			{
+				const __m128i uo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
+				                                                  _mm_maddubs_epi16(xo2, u_factors)), 7);
+				const __m128i uo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
+				                                                  _mm_maddubs_epi16(xo4, u_factors)), 7);
+				uo = _mm_add_epi8(_mm_packs_epi16(uo1, uo2), vector128);
+			}
+
+			/* Now we need the following storage distribution:
+			 * 2x   2y   -> b2
+			 * x    2y+1 -> b4
+			 * 2x+1 2y   -> b6 */
+			if (b1Odd) /* b2 */
+			{
+				const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
+				const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
+				const __m128i hi = _mm_add_epi16(ueh, uoh);
+				const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
+				const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
+				const __m128i lo = _mm_add_epi16(uel, uol);
+				const __m128i added = _mm_hadd_epi16(lo, hi);
+				const __m128i avg16 = _mm_srai_epi16(added, 2);
+				const __m128i avg = _mm_packus_epi16(avg16, avg16);
+				_mm_storel_epi64((__m128i*)b2, avg);
+			}
 			else
 			{
-				_mm_storel_pi(pu1, _mm_castsi128_ps(u));
-				_mm_storeh_pi(pu2, _mm_castsi128_ps(u));
-				_mm_storel_pi(pv1, _mm_castsi128_ps(v));
-				_mm_storeh_pi(pv2, _mm_castsi128_ps(v));
-				pu1 += 2;
-				pu2 += 2;
-				pv1 += 2;
-				pv2 += 2;
+				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+				                                  14, 12, 10, 8, 6, 4, 2, 0);
+				const __m128i ud = _mm_shuffle_epi8(ue, mask);
+				_mm_storel_epi64((__m128i*)b2, ud);
 			}
+
+			b2 += 8;
+
+			if (b1Odd) /* b4 */
+			{
+				_mm_store_si128((__m128i*)b4, uo);
+				b4 += 16;
+			}
+
+			{
+				/* b6 */
+				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+				                                  15, 13, 11, 9, 7, 5, 3, 1);
+				const __m128i ude = _mm_shuffle_epi8(ue, mask);
+				_mm_storel_epi64((__m128i*)b6, ude);
+				b6 += 8;
+			}
+		}
+		{
+			/* We have now
+			 * 16 even V values in ve
+			 * 16 odd V values in vo
+			 *
+			 * We need to split these according to
+			 * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+			__m128i ve, vo;
+			{
+				const __m128i ve1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
+				                                                  _mm_maddubs_epi16(xe2, v_factors)), 7);
+				const __m128i ve2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
+				                                                  _mm_maddubs_epi16(xe4, v_factors)), 7);
+				ve = _mm_add_epi8(_mm_packs_epi16(ve1, ve2), vector128);
+			}
+
+			if (b1Odd)
+			{
+				const __m128i vo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
+				                                                  _mm_maddubs_epi16(xo2, v_factors)), 7);
+				const __m128i vo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
+				                                                  _mm_maddubs_epi16(xo4, v_factors)), 7);
+				vo = _mm_add_epi8(_mm_packs_epi16(vo1, vo2), vector128);
+			}
+
+			/* Now we need the following storage distribution:
+			 * 2x   2y   -> b3
+			 * x    2y+1 -> b5
+			 * 2x+1 2y   -> b7 */
+			if (b1Odd) /* b3 */
+			{
+				const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
+				const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
+				const __m128i hi = _mm_add_epi16(veh, voh);
+				const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
+				const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
+				const __m128i lo = _mm_add_epi16(vel, vol);
+				const __m128i added = _mm_hadd_epi16(lo, hi);
+				const __m128i avg16 = _mm_srai_epi16(added, 2);
+				const __m128i avg = _mm_packus_epi16(avg16, avg16);
+				_mm_storel_epi64((__m128i*)b3, avg);
+			}
+			else
+			{
+				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+				                                  14, 12, 10, 8, 6, 4, 2, 0);
+				const __m128i vd = _mm_shuffle_epi8(ve, mask);
+				_mm_storel_epi64((__m128i*)b3, vd);
+			}
+
+			b3 += 8;
+
+			if (b1Odd) /* b5 */
+			{
+				_mm_store_si128((__m128i*)b5, vo);
+				b5 += 16;
+			}
+
+			{
+				/* b7 */
+				const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+				                                  15, 13, 11, 9, 7, 5, 3, 1);
+				const __m128i vde = _mm_shuffle_epi8(ve, mask);
+				_mm_storel_epi64((__m128i*)b7, vde);
+				b7 += 8;
+			}
+		}
 	}
 }
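Note: the new b2/b3 path above is what gives this commit its name. When the odd row is available, the main view no longer takes the raw even-row chroma sample; it stores the average of each 2x2 block of full-resolution U (and V) values. A scalar sketch of the same computation (hypothetical helper, not part of this commit):

#include <stdint.h>

/* Average each 2x2 block of full-resolution chroma down to one sample for
 * the main YUV420 view -- the scalar equivalent of the _mm_add_epi16 /
 * _mm_hadd_epi16 / _mm_srai_epi16(x, 2) sequence above. */
static void avg_chroma_2x2(const uint8_t* even, const uint8_t* odd,
                           uint8_t* dst, unsigned width)
{
	unsigned x;

	for (x = 0; x < width; x += 2)
	{
		/* sum the four neighbors, truncating divide like the shift */
		const unsigned sum = even[x] + even[x + 1] + odd[x] + odd[x + 1];
		dst[x / 2] = (uint8_t)(sum / 4);
	}
}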
@@ -603,9 +704,7 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
 	BYTE* pDst2[3], const UINT32 dst2Step[3],
 	const prim_size_t* roi)
 {
-	UINT32 y, numRows;
-	BOOL evenRow = TRUE;
-	BYTE* b1, *b2, *b3, *b4, *b5, *b6, *b7;
+	UINT32 y;
 	const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;

 	if (roi->height < 1 || roi->width < 1)
@@ -614,28 +713,23 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
 	if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
 		return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);

-	numRows = (roi->height + 1) & ~1;
-
-	for (y = 0; y < numRows; y++, evenRow = !evenRow)
+	for (y = 0; y < roi->height; y += 2)
 	{
-		const BYTE* src = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
-		UINT32 i = y >> 1;
-		b1 = pDst1[0] + y * dst1Step[0];
-
-		if (evenRow)
-		{
-			b2 = pDst1[1] + i * dst1Step[1];
-			b3 = pDst1[2] + i * dst1Step[2];
-			b6 = pDst2[1] + i * dst2Step[1];
-			b7 = pDst2[2] + i * dst2Step[2];
-			ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b2, b6, b3, b7, TRUE, roi->width);
-		}
-		else
-		{
-			b4 = pDst2[0] + dst2Step[0] * ((i & ~7) + i);
-			b5 = b4 + 8 * dst2Step[0];
-			ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b4, b4 + 8, b5, b5 + 8, FALSE, roi->width);
-		}
+		const BOOL last = (y >= (roi->height - 1));
+		const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+		const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+		const UINT32 i = y >> 1;
+		const UINT32 n = (i & ~7) + i;
+		BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+		BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+		BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+		BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+		BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+		BYTE* b5 = b4 + 8 * dst2Step[0];
+		BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+		BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+		ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
+		                                     roi->width);
 	}

 	return PRIMITIVES_SUCCESS;
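The b4/b5 addressing keeps the layout of the old odd-row path: for chroma row i = y / 2, n = (i & ~7) + i expands to (i / 8) * 16 + (i % 8), so the auxiliary view packs odd-row U and V into alternating 8-row bands of pDst2[0], with U at row n and V eight rows below. A small standalone check (illustrative only, not part of the commit):

#include <stdio.h>

int main(void)
{
	unsigned i;

	/* chroma rows 0..7  -> buffer rows  0..7  (U) and  8..15 (V),
	 * chroma rows 8..15 -> buffer rows 16..23 (U) and 24..31 (V), ... */
	for (i = 0; i < 16; i++)
	{
		const unsigned n = (i & ~7u) + i;
		printf("i=%2u  U row %2u  V row %2u\n", i, n, n + 8);
	}

	return 0;
}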
@@ -775,8 +869,11 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
 		const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 		                                  14, 10, 6, 2, 12, 8, 4, 0);
 		const __m128i ud = _mm_shuffle_epi8(uo, mask);
-		_mm_stream_si32((int*)uChromaDst1, ((int*)&ud)[0]);
-		_mm_stream_si32((int*)vChromaDst1, ((int*)&ud)[1]);
+		int* uDst1 = (int*)uChromaDst1;
+		int* vDst1 = (int*)vChromaDst1;
+		const int* src = (const int*)&ud;
+		_mm_stream_si32(uDst1, src[0]);
+		_mm_stream_si32(vDst1, src[1]);
 		uChromaDst1 += 4;
 		vChromaDst1 += 4;
 	}
@@ -839,9 +936,12 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
 	{
 		const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
 		                                  14, 10, 6, 2, 12, 8, 4, 0);
-		__m128i vd = _mm_shuffle_epi8(vo, mask);
-		_mm_stream_si32((int*)uChromaDst2, ((int*)&vd)[0]);
-		_mm_stream_si32((int*)vChromaDst2, ((int*)&vd)[1]);
+		const __m128i vd = _mm_shuffle_epi8(vo, mask);
+		int* uDst2 = (int*)uChromaDst2;
+		int* vDst2 = (int*)vChromaDst2;
+		const int* src = (const int*)&vd;
+		_mm_stream_si32(uDst2, src[0]);
+		_mm_stream_si32(vDst2, src[1]);
 		uChromaDst2 += 4;
 		vChromaDst2 += 4;
 	}
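The two hunks above replace the ((int*)&ud)[k] reads with named pointers, which at least makes the strict-aliasing-unfriendly cast explicit. An alternative that avoids taking the address of a __m128i entirely would be to move the lanes out through SSE2 intrinsics; a sketch (not what this commit does):

#include <emmintrin.h>

/* Extract the two low 32-bit lanes without pointer-casting the vector,
 * then write them with non-temporal stores. */
static void stream_two_lanes(int* uDst, int* vDst, __m128i vd)
{
	const int lane0 = _mm_cvtsi128_si32(vd);                    /* lane 0 */
	const int lane1 = _mm_cvtsi128_si32(_mm_srli_si128(vd, 4)); /* lane 1 */
	_mm_stream_si32(uDst, lane0);
	_mm_stream_si32(vDst, lane1);
}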