/**
 * FreeRDP: A Remote Desktop Protocol Implementation
 * Optimized YUV/RGB conversion operations
 *
 * Copyright 2014 Thomas Erbesdobler
 * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
 * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
 * Copyright 2016-2017 Thincast Technologies GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <winpr/sysinfo.h>
#include <winpr/crt.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>

#include "prim_internal.h"

#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */

static primitives_t* generic = NULL;

#ifdef WITH_SSE2
/****************************************************************************/
/* SSSE3 YUV420 -> RGB conversion                                           */
/****************************************************************************/

static pstatus_t ssse3_YUV420ToRGB_BGRX(
    const BYTE** pSrc, const UINT32* srcStep,
    BYTE* pDst, UINT32 dstStep, UINT32 dstFormat,
    const prim_size_t* roi)
{
	UINT32 lastRow, lastCol;
	BYTE* UData, *VData, *YData;
	UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV;
	__m128i r0, r1, r2, r3, r4, r5, r6, r7;
	__m128i* buffer;
	/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
	 * last_column: if it's the last column in a line, set to 10B (for handling
	 * line widths that are not a multiple of four) */
	buffer = _aligned_malloc(4 * 16, 16);

	if (!buffer)
		return !PRIMITIVES_SUCCESS;

	YData = (BYTE*) pSrc[0];
	UData = (BYTE*) pSrc[1];
	VData = (BYTE*) pSrc[2];
	nWidth = roi->width;
	nHeight = roi->height;

	if ((lastCol = (nWidth & 3)))
	{
		switch (lastCol)
		{
			case 1:
				r7 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
				break;

			case 2:
				r7 = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
				break;

			case 3:
				r7 = _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
				break;
		}

		_mm_store_si128(buffer + 3, r7);
		lastCol = 1;
	}

	nWidth += 3;
	nWidth = nWidth >> 2;
	lastRow = nHeight & 1;
	nHeight++;
	nHeight = nHeight >> 1;
	VaddDst = (dstStep << 1) - (nWidth << 4);
	VaddY = (srcStep[0] << 1) - (nWidth << 2);
	VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
	VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);

	while (nHeight-- > 0)
	{
		if (nHeight == 0)
			lastRow <<= 1;

		i = 0;

		do
		{
			if (!(i & 0x01))
			{
				/* Y-, U- and V-data are stored in separate arrays.
				 * We start by processing the U-data.
				 *
				 * First we fetch four U-values from the array and shuffle them like this:
				 * 0d0d 0c0c 0b0b 0a0a
				 * This does two things: it converts the values to signed words and duplicates
				 * each value, because two pixels always "share" the same U- (and V-) data. */
				r0 = _mm_cvtsi32_si128(*(UINT32*)UData);
				r5 = _mm_set_epi32(0x80038003, 0x80028002, 0x80018001, 0x80008000);
				r0 = _mm_shuffle_epi8(r0, r5);
				UData += 4;
				/* Then we subtract 128 from each value, which gives us D. */
				r3 = _mm_set_epi16(128, 128, 128, 128, 128, 128, 128, 128);
				r0 = _mm_subs_epi16(r0, r3);
				/* We need D twice, so let's store it for later use. */
				r2 = r0;
				/* Now we can multiply D by 48 and unpack it to xmm4:xmm0.
				 * This is what we need to compute the G data later on. */
				r4 = r0;
				r7 = _mm_set_epi16(48, 48, 48, 48, 48, 48, 48, 48);
				r0 = _mm_mullo_epi16(r0, r7);
				r4 = _mm_mulhi_epi16(r4, r7);
				r7 = r0;
				r0 = _mm_unpacklo_epi16(r0, r4);
				r4 = _mm_unpackhi_epi16(r7, r4);
				/* To get the B data, we need to prepare a second value, D*475. */
				r1 = r2;
				r7 = _mm_set_epi16(475, 475, 475, 475, 475, 475, 475, 475);
				r1 = _mm_mullo_epi16(r1, r7);
				r2 = _mm_mulhi_epi16(r2, r7);
				r7 = r1;
				r1 = _mm_unpacklo_epi16(r1, r2);
				r7 = _mm_unpackhi_epi16(r7, r2);
				/* So we got something like this in xmm7:xmm1.
				 * This pair contains values for 16 pixels:
				 * aabbccdd
				 * aabbccdd, but we can only work on four pixels at a time, so we need to save
				 * the upper values. */
				_mm_store_si128(buffer + 1, r7);
				/* Now we've prepared the U-data. Preparing the V-data works the same way,
				 * just with other coefficients. */
				r2 = _mm_cvtsi32_si128(*(UINT32*)VData);
				r2 = _mm_shuffle_epi8(r2, r5);
				VData += 4;
				r2 = _mm_subs_epi16(r2, r3);
				r5 = r2;
				/* This is also known as E*403; we need it to compute the R data. */
				r3 = r2;
				r7 = _mm_set_epi16(403, 403, 403, 403, 403, 403, 403, 403);
				r2 = _mm_mullo_epi16(r2, r7);
				r3 = _mm_mulhi_epi16(r3, r7);
				r7 = r2;
				r2 = _mm_unpacklo_epi16(r2, r3);
				r7 = _mm_unpackhi_epi16(r7, r3);
				/* ... and preserve the upper four values for later. */
				_mm_store_si128(buffer + 2, r7);
				/* This step computes E*120. */
				r3 = r5;
				r7 = _mm_set_epi16(120, 120, 120, 120, 120, 120, 120, 120);
				r3 = _mm_mullo_epi16(r3, r7);
				r5 = _mm_mulhi_epi16(r5, r7);
				r7 = r3;
				r3 = _mm_unpacklo_epi16(r3, r5);
				r7 = _mm_unpackhi_epi16(r7, r5);
				/* Now we complete what we began above:
				 * (48*D) + (120*E) = (48*D + 120*E) */
				r0 = _mm_add_epi32(r0, r3);
				r4 = _mm_add_epi32(r4, r7);
				/* ... and store it to memory! */
				_mm_store_si128(buffer, r4);
			}
			else
			{
				/* Wondering about the conditional above?
				 * We prepared UV data for eight pixels in each line, but can only process four
				 * per iteration. So we need to load the upper four pixels from memory on every
				 * second iteration! */
				r1 = _mm_load_si128(buffer + 1);
				r2 = _mm_load_si128(buffer + 2);
				r0 = _mm_load_si128(buffer);
			}

			if (++i == nWidth)
				lastCol <<= 1;

			/* We didn't produce any output yet, so let's do so!
			 * Fetch four pixels from the Y-data array and shuffle them like this:
			 * 00d0 00c0 00b0 00a0, to get signed dwords multiplied by 256. */
			r4 = _mm_cvtsi32_si128(*(UINT32*)YData);
			r7 = _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080);
			r4 = _mm_shuffle_epi8(r4, r7);
			r5 = r4;
			r6 = r4;
			/* Now we can perform the "real" conversion itself and produce output! */
			r4 = _mm_add_epi32(r4, r2);
			r5 = _mm_sub_epi32(r5, r0);
			r6 = _mm_add_epi32(r6, r1);
			/* In the end, we only need bytes for the RGB values.
			 * So what do we do? Right, we shift left.
			 * Before, we had dwords of data; by shifting left and treating the result
			 * as packed words, we not only get signed words, but also divide by 256.
			 * Imagine the data is now ordered this way: ddx0 ccx0 bbx0 aax0, where x is the
			 * least significant byte that we no longer need, because we've done some rounding. */
			r4 = _mm_slli_epi32(r4, 8);
			r5 = _mm_slli_epi32(r5, 8);
			r6 = _mm_slli_epi32(r6, 8);
			/* One thing we still have to handle is the clip() function ...
			 * We still have signed words, and there are those min/max instructions in SSE2 ...
			 * The max instruction always takes the larger of the two operands, stores it in the
			 * first one, and it respects the sign!
			 * If we feed it our values and zeros, it picks the zeros where our values are below
			 * zero, and our values otherwise. */
			r7 = _mm_set_epi32(0, 0, 0, 0);
			r4 = _mm_max_epi16(r4, r7);
			r5 = _mm_max_epi16(r5, r7);
			r6 = _mm_max_epi16(r6, r7);
			/* The same trick, just the other way around, can be used to limit our values to 255,
			 * now using the min instruction and 255s. */
			r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
			r4 = _mm_min_epi16(r4, r7);
			r5 = _mm_min_epi16(r5, r7);
			r6 = _mm_min_epi16(r6, r7);
			/* Now we have our bytes.
			 * The moment has come to assemble the three channels R, G and B into the xrgb dwords.
			 * For the red channel we just AND each resulting dword with 00FF0000H. */
			//r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
			r4 = _mm_and_si128(r4, r7);
			/* For the green channel we have to shuffle to get something like this:
			 * 00d0 00c0 00b0 00a0 */
			r7 = _mm_set_epi32(0x80800E80, 0x80800A80, 0x80800680, 0x80800280);
			r5 = _mm_shuffle_epi8(r5, r7);
			/* ... and for the blue channel this:
			 * 000d 000c 000b 000a */
			r7 = _mm_set_epi32(0x8080800E, 0x8080800A, 0x80808006, 0x80808002);
			r6 = _mm_shuffle_epi8(r6, r7);
			/* Finally we OR it all together and get:
			 * xrgb xrgb xrgb xrgb */
			r4 = _mm_or_si128(r4, r5);
			r4 = _mm_or_si128(r4, r6);

			/* The only thing left to do now is writing the data to memory, but this gets a bit
			 * more complicated if the width is not a multiple of four and this is the last
			 * column in the line. */
			if (lastCol & 0x02)
			{
				/* Say we only need to convert six pixels in width.
				 * The first 4 pixels are converted just like all the others, but
				 * if it's the last iteration in the line, last_column is shifted left by one
				 * (curious? have a look above), and we land here. During initialisation a mask
				 * was prepared. In this case it looks like
				 * FFFFFFFFH FFFFFFFFH 00000000H 00000000H */
				r6 = _mm_load_si128(buffer + 3);
				/* We AND our output data with this mask to keep only the valid pixels. */
				r4 = _mm_and_si128(r4, r6);
				/* Then we fetch memory from the destination array ... */
				r5 = _mm_lddqu_si128((__m128i*)pDst);
				/* ... and AND it with the inverse mask, keeping only the pixels that should
				 * not be updated. */
				r6 = _mm_andnot_si128(r6, r5);
				/* Now we just OR the two values together and write the result back to the
				 * destination array, so only the pixels that should be updated actually change. */
				r4 = _mm_or_si128(r4, r6);
			}

			_mm_storeu_si128((__m128i*)pDst, r4);

			if (!(lastRow & 0x02))
			{
				/* Because the UV data is the same for two lines, we can process the second line
				 * right here, in the same loop. The only thing we need to do is add some offsets
				 * to the Y- and destination pointers. These offsets are iStride[0] and the target
				 * scanline.
				 * But if we don't need to process the second line, e.g. when we are in the last
				 * line of an odd number of lines, we just skip all of this. */
				r4 = _mm_cvtsi32_si128(*(UINT32*)(YData + srcStep[0]));
				r7 = _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080);
				r4 = _mm_shuffle_epi8(r4, r7);
				r5 = r4;
				r6 = r4;
				r4 = _mm_add_epi32(r4, r2);
				r5 = _mm_sub_epi32(r5, r0);
				r6 = _mm_add_epi32(r6, r1);
				r4 = _mm_slli_epi32(r4, 8);
				r5 = _mm_slli_epi32(r5, 8);
				r6 = _mm_slli_epi32(r6, 8);
				r7 = _mm_set_epi32(0, 0, 0, 0);
				r4 = _mm_max_epi16(r4, r7);
				r5 = _mm_max_epi16(r5, r7);
				r6 = _mm_max_epi16(r6, r7);
				r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
				r4 = _mm_min_epi16(r4, r7);
				r5 = _mm_min_epi16(r5, r7);
				r6 = _mm_min_epi16(r6, r7);
				r7 = _mm_set_epi32(0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000);
				r4 = _mm_and_si128(r4, r7);
				r7 = _mm_set_epi32(0x80800E80, 0x80800A80, 0x80800680, 0x80800280);
				r5 = _mm_shuffle_epi8(r5, r7);
				r7 = _mm_set_epi32(0x8080800E, 0x8080800A, 0x80808006, 0x80808002);
				r6 = _mm_shuffle_epi8(r6, r7);
				r4 = _mm_or_si128(r4, r5);
				r4 = _mm_or_si128(r4, r6);

				if (lastCol & 0x02)
				{
					r6 = _mm_load_si128(buffer + 3);
					r4 = _mm_and_si128(r4, r6);
					r5 = _mm_lddqu_si128((__m128i*)(pDst + dstStep));
					r6 = _mm_andnot_si128(r6, r5);
					r4 = _mm_or_si128(r4, r6);
					/* Only one thing left: shift lastCol back here, because we have now processed
					 * the last column, and this "special condition" can be released. */
					lastCol >>= 1;
				}

				_mm_storeu_si128((__m128i*)(pDst + dstStep), r4);
			}

			/* After all this we have to advance the destination and Y-data pointers by
			 * four pixels. */
			pDst += 16;
			YData += 4;
		}
		while (i < nWidth);

		/* After each line we have to add the scanline to the destination pointer, because
		 * we are processing two lines at once, but only advance the destination pointer
		 * for the first line. Well, we only have one pointer, so it's easiest to access
		 * the second line with the one pointer plus an offset (the scanline).
		 * If we're not converting the full width of the scanline, say only 64 pixels, while the
		 * output buffer was "designed" for full HD (1920 pixels wide), we have to add the
		 * remaining length of each line to get into the next line. */
		pDst += VaddDst;
		/* The same has to be done for the Y-data, but with iStride[0] instead of the
		 * target scanline. */
		YData += VaddY;
		/* And again for the UV data, but here it's enough to add the remaining length, because
		 * the UV data is shared between two lines: there is only one "UV line" for every two
		 * "real" lines. */
		UData += VaddU;
		VData += VaddV;
	}

	_aligned_free(buffer);
	return PRIMITIVES_SUCCESS;
}

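/*
 * For reference: the fixed-point inverse transform implemented by the vector
 * code above, written out per pixel. This is a scalar sketch under the same
 * constants (403, 48, 120, 475, and Y scaled by 256); the helper names are
 * illustrative only and the block is excluded from the build.
 */
#if 0
static BYTE yuv_clip(INT32 x)
{
	return (BYTE)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void yuv_to_rgb_reference(BYTE y, BYTE u, BYTE v, BYTE* r, BYTE* g, BYTE* b)
{
	const INT32 c = 256 * y; /* Y * 256 */
	const INT32 d = u - 128;
	const INT32 e = v - 128;
	*r = yuv_clip((c + 403 * e) >> 8);
	*g = yuv_clip((c - 48 * d - 120 * e) >> 8);
	*b = yuv_clip((c + 475 * d) >> 8);
}
#endif
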
static pstatus_t ssse3_YUV420ToRGB(
    const BYTE** pSrc, const UINT32* srcStep,
    BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
    const prim_size_t* roi)
{
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

		default:
			return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}

static __m128i* ssse3_YUV444Pixel(__m128i* dst, __m128i Yraw, __m128i Uraw, __m128i Vraw,
                                  __m128i mapY, __m128i mapUV)
{
	const __m128i c128 = _mm_set1_epi16(128);
	__m128i BGRX = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
	{
		__m128i C, D, E;
		/* Load Y values and expand to 32 bit */
		{
			C = _mm_shuffle_epi8(Yraw, mapY); /* Reorder and multiply by 256 */
		}
		/* Load U values and expand to 32 bit */
		{
			const __m128i U = _mm_shuffle_epi8(Uraw, mapUV); /* Reorder dcba */
			D = _mm_sub_epi16(U, c128); /* D = U - 128 */
		}
		/* Load V values and expand to 32 bit */
		{
			const __m128i V = _mm_shuffle_epi8(Vraw, mapUV); /* Reorder dcba */
			E = _mm_sub_epi16(V, c128); /* E = V - 128 */
		}
		/* Get the R value */
		{
			const __m128i c403 = _mm_set1_epi16(403);
			const __m128i e403 = _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403));
			const __m128i Rs = _mm_add_epi32(C, e403);
			const __m128i R32 = _mm_srai_epi32(Rs, 8);
			const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128());
			const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128());
			const __m128i mask = _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080);
			const __m128i packed = _mm_shuffle_epi8(R, mask);
			BGRX = _mm_or_si128(BGRX, packed);
		}
		/* Get the G value */
		{
			const __m128i c48 = _mm_set1_epi16(48);
			const __m128i d48 = _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48));
			const __m128i c120 = _mm_set1_epi16(120);
			const __m128i e120 = _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120));
			const __m128i de = _mm_add_epi32(d48, e120);
			const __m128i Gs = _mm_sub_epi32(C, de);
			const __m128i G32 = _mm_srai_epi32(Gs, 8);
			const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128());
			const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128());
			const __m128i mask = _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080);
			const __m128i packed = _mm_shuffle_epi8(G, mask);
			BGRX = _mm_or_si128(BGRX, packed);
		}
		/* Get the B value */
		{
			const __m128i c475 = _mm_set1_epi16(475);
			const __m128i d475 = _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475));
			const __m128i Bs = _mm_add_epi32(C, d475);
			const __m128i B32 = _mm_srai_epi32(Bs, 8);
			const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128());
			const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128());
			const __m128i mask = _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000);
			const __m128i packed = _mm_shuffle_epi8(B, mask);
			BGRX = _mm_or_si128(BGRX, packed);
		}
	}
	_mm_store_si128(dst++, BGRX);
	return dst;
}

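/*
 * The mapY/mapUV masks above rely on the _mm_shuffle_epi8 (PSHUFB) rule that
 * a control byte with its high bit set (0x80) produces a zero output byte.
 * A scalar model of that rule (a sketch, excluded from the build):
 */
#if 0
static void pshufb_model(const BYTE src[16], const BYTE ctl[16], BYTE out[16])
{
	int i;

	for (i = 0; i < 16; i++)
		out[i] = (ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0F];

	/* e.g. the lowest mapY dword, byte pattern { 0x80, 0x00, 0x80, 0x80 },
	 * places Y byte 0 into bits 8..15 of a 32-bit lane, i.e. Y * 256. */
}
#endif
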
static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(
    const BYTE** pSrc, const UINT32* srcStep,
    BYTE* pDst, UINT32 dstStep,
    const prim_size_t* roi)
{
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	const UINT32 pad = roi->width % 16;
	UINT32 y;

	for (y = 0; y < nHeight; y++)
	{
		UINT32 x;
		__m128i* dst = (__m128i*)(pDst + dstStep * y);
		const BYTE* YData = pSrc[0] + y * srcStep[0];
		const BYTE* UData = pSrc[1] + y * srcStep[1];
		const BYTE* VData = pSrc[2] + y * srcStep[2];

		for (x = 0; x < nWidth - pad; x += 16)
		{
			__m128i Y = _mm_load_si128((__m128i*)YData);
			__m128i U = _mm_load_si128((__m128i*)UData);
			__m128i V = _mm_load_si128((__m128i*)VData);
			YData += 16;
			UData += 16;
			VData += 16;
			{
				const __m128i mapY = _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080);
				const __m128i mapUV = _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080);
				dst = ssse3_YUV444Pixel(dst, Y, U, V, mapY, mapUV);
			}
			{
				const __m128i mapY = _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480);
				const __m128i mapUV = _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080);
				dst = ssse3_YUV444Pixel(dst, Y, U, V, mapY, mapUV);
			}
			{
				const __m128i mapY = _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880);
				const __m128i mapUV = _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080);
				dst = ssse3_YUV444Pixel(dst, Y, U, V, mapY, mapUV);
			}
			{
				const __m128i mapY = _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80);
				const __m128i mapUV = _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080);
				dst = ssse3_YUV444Pixel(dst, Y, U, V, mapY, mapUV);
			}
		}

		for (x = 0; x < pad; x++)
		{
			const BYTE Y = *YData++;
			const BYTE U = *UData++;
			const BYTE V = *VData++;
			const BYTE r = YUV2R(Y, U, V);
			const BYTE g = YUV2G(Y, U, V);
			const BYTE b = YUV2B(Y, U, V);
			dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF);
		}
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE** pSrc, const UINT32* srcStep,
        BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
        const prim_size_t* roi)
{
	/* Use ULONG_PTR for the alignment check; unsigned long is only 32 bit on
	 * LLP64 platforms (Win64). */
	if ((ULONG_PTR)pSrc[0] % 16 || (ULONG_PTR)pSrc[1] % 16 || (ULONG_PTR)pSrc[2] % 16 ||
	    (ULONG_PTR)pDst % 16)
		return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

		default:
			return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}

/****************************************************************************/
/* SSSE3 RGB -> YUV420 conversion                                           */
/****************************************************************************/

/**
 * Note (nfedera):
 * The forward transformation factors used from RGB to YUV are based on the
 * values specified in [Rec. ITU-R BT.709-6] Section 3:
 * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
 *
 * Y =  0.21260 * R + 0.71520 * G + 0.07220 * B +   0;
 * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
 * V =  0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
 *
 * The most accurate integer arithmetic approximation when using 8-bit signed
 * integer factors with 16-bit signed integer intermediate results is:
 *
 * Y = ( ( 27 * R + 92 * G +  9 * B) >> 7 );
 * U = ( (-15 * R - 49 * G + 64 * B) >> 7 ) + 128;
 * V = ( ( 64 * R - 58 * G -  6 * B) >> 7 ) + 128;
 */

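/*
 * A minimal scalar sketch of the integer approximation above, useful as a
 * reference when checking the vectorized code. The function name is
 * illustrative only; the block is excluded from the build.
 */
#if 0
static void rgb_to_yuv_reference(BYTE r, BYTE g, BYTE b, BYTE* y, BYTE* u, BYTE* v)
{
	/* Same factors as the bgrx_y/u/v_factors tables below, for one pixel. */
	*y = (BYTE)((27 * r + 92 * g + 9 * b) >> 7);
	*u = (BYTE)(((-15 * r - 49 * g + 64 * b) >> 7) + 128);
	*v = (BYTE)(((64 * r - 58 * g - 6 * b) >> 7) + 128);
	/* Example: r = g = b = 255 gives y = 255, u = v = 128 (neutral chroma),
	 * since the factor rows sum to 128, 0 and 0 respectively. */
}
#endif
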
PRIM_ALIGN_128 static const BYTE bgrx_y_factors[] =
{
	9,  92,  27,   0,   9,  92,  27,   0,   9,  92,  27,   0,   9,  92,  27,   0
};
PRIM_ALIGN_128 static const BYTE bgrx_u_factors[] =
{
	64, -49, -15,   0,  64, -49, -15,   0,  64, -49, -15,   0,  64, -49, -15,   0
};
PRIM_ALIGN_128 static const BYTE bgrx_v_factors[] =
{
	-6, -58,  64,   0,  -6, -58,  64,   0,  -6, -58,  64,   0,  -6, -58,  64,   0
};

PRIM_ALIGN_128 static const BYTE const_buf_128b[] =
{
	128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
};

/*
TODO:
RGB[AX] can simply be supported using the following factors. And instead of loading the
globals directly the functions below could be passed pointers to the correct vectors
depending on the source picture format.

PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
	27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0,  27,  92,   9,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
	-15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0, -15, -49,  64,   0
};
PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
	64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0,  64, -58,  -6,   0
};
*/

/* compute the luma (Y) component from a single RGB source line */

static INLINE void ssse3_RGBToYUV420_BGRX_Y(
    const BYTE* src, BYTE* dst, UINT32 width)
{
	UINT32 x;
	__m128i y_factors, x0, x1, x2, x3;
	const __m128i* argb = (const __m128i*) src;
	__m128i* ydst = (__m128i*) dst;
	y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);

	for (x = 0; x < width; x += 16)
	{
		/* load 16 bgrx pixels into 4 128-bit registers */
		x0 = _mm_load_si128(argb++); // 1st 4 pixels
		x1 = _mm_load_si128(argb++); // 2nd 4 pixels
		x2 = _mm_load_si128(argb++); // 3rd 4 pixels
		x3 = _mm_load_si128(argb++); // 4th 4 pixels
		/* multiplications and subtotals */
		x0 = _mm_maddubs_epi16(x0, y_factors);
		x1 = _mm_maddubs_epi16(x1, y_factors);
		x2 = _mm_maddubs_epi16(x2, y_factors);
		x3 = _mm_maddubs_epi16(x3, y_factors);
		/* the total sums */
		x0 = _mm_hadd_epi16(x0, x1);
		x2 = _mm_hadd_epi16(x2, x3);
		/* shift the results */
		x0 = _mm_srli_epi16(x0, 7);
		x2 = _mm_srli_epi16(x2, 7);
		/* pack the 16 words into bytes */
		x0 = _mm_packus_epi16(x0, x2);
		/* save to the y plane */
		_mm_storeu_si128(ydst++, x0);
	}
}

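/*
 * Per pixel, the _mm_maddubs_epi16/_mm_hadd_epi16 pair above computes a dot
 * product of the unsigned BGRX bytes with the signed factor bytes. A scalar
 * model for one pixel (a sketch, excluded from the build; note the factor
 * rows sum to at most 128, so the 16-bit subtotals cannot saturate):
 */
#if 0
static BYTE bgrx_to_y_model(const BYTE* px /* B, G, R, X */)
{
	/* maddubs forms (B*9 + G*92) and (R*27 + X*0) as 16-bit subtotals;
	 * hadd adds the subtotal pairs; then >> 7 as in the vector code. */
	const INT16 sum = (INT16)(px[0] * 9 + px[1] * 92 + px[2] * 27 + px[3] * 0);
	return (BYTE)(sum >> 7);
}
#endif
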
/* compute the chrominance (UV) components from two RGB source lines */

static INLINE void ssse3_RGBToYUV420_BGRX_UV(
    const BYTE* src1, const BYTE* src2,
    BYTE* dst1, BYTE* dst2, UINT32 width)
{
	UINT32 x;
	__m128i vector128, u_factors, v_factors, x0, x1, x2, x3, x4, x5;
	const __m128i* rgb1 = (const __m128i*)src1;
	const __m128i* rgb2 = (const __m128i*)src2;
	__m64* udst = (__m64*)dst1;
	__m64* vdst = (__m64*)dst2;
	vector128 = _mm_load_si128((__m128i*)const_buf_128b);
	u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
	v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);

	for (x = 0; x < width; x += 16)
	{
		/* subsample 16x2 pixels into 16x1 pixels */
		x0 = _mm_load_si128(rgb1++);
		x4 = _mm_load_si128(rgb2++);
		x0 = _mm_avg_epu8(x0, x4);
		x1 = _mm_load_si128(rgb1++);
		x4 = _mm_load_si128(rgb2++);
		x1 = _mm_avg_epu8(x1, x4);
		x2 = _mm_load_si128(rgb1++);
		x4 = _mm_load_si128(rgb2++);
		x2 = _mm_avg_epu8(x2, x4);
		x3 = _mm_load_si128(rgb1++);
		x4 = _mm_load_si128(rgb2++);
		x3 = _mm_avg_epu8(x3, x4);
		/* subsample these 16x1 pixels into 8x1 pixels */
		/**
		 * shuffle controls
		 * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88
		 * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd
		 */
		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88));
		x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd));
		x0 = _mm_avg_epu8(x0, x4);
		x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88));
		x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd));
		x1 = _mm_avg_epu8(x1, x4);
		/* multiplications and subtotals */
		x2 = _mm_maddubs_epi16(x0, u_factors);
		x3 = _mm_maddubs_epi16(x1, u_factors);
		x4 = _mm_maddubs_epi16(x0, v_factors);
		x5 = _mm_maddubs_epi16(x1, v_factors);
		/* the total sums */
		x0 = _mm_hadd_epi16(x2, x3);
		x1 = _mm_hadd_epi16(x4, x5);
		/* shift the results */
		x0 = _mm_srai_epi16(x0, 7);
		x1 = _mm_srai_epi16(x1, 7);
		/* pack the 16 words into bytes */
		x0 = _mm_packs_epi16(x0, x1);
		/* add 128 */
		x0 = _mm_add_epi8(x0, vector128);
		/* the lower 8 bytes go to the u plane */
		_mm_storel_pi(udst++, _mm_castsi128_ps(x0));
		/* the upper 8 bytes go to the v plane */
		_mm_storeh_pi(vdst++, _mm_castsi128_ps(x0));
	}
}

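/*
 * The averaging sequence above reduces a 16x2 block of pixels to 8 chroma
 * source samples: _mm_avg_epu8 averages the two rows first, then the
 * shuffle/avg pair averages horizontal neighbours. Per 2x2 block this
 * amounts to the following scalar model (a sketch, excluded from the build;
 * avg rounds up at each of the two steps, matching the vector code rather
 * than a single rounded four-sample mean):
 */
#if 0
static BYTE subsample_2x2_model(BYTE p00, BYTE p01, BYTE p10, BYTE p11)
{
	const BYTE col0 = (BYTE)((p00 + p10 + 1) / 2); /* vertical average, column 0 */
	const BYTE col1 = (BYTE)((p01 + p11 + 1) / 2); /* vertical average, column 1 */
	return (BYTE)((col0 + col1 + 1) / 2);          /* horizontal average */
}
#endif
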
static pstatus_t ssse3_RGBToYUV420_BGRX(
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
    BYTE* pDst[3], UINT32 dstStep[3],
    const prim_size_t* roi)
{
	UINT32 y;
	const BYTE* argb = pSrc;
	BYTE* ydst = pDst[0];
	BYTE* udst = pDst[1];
	BYTE* vdst = pDst[2];

	if (roi->height < 1 || roi->width < 1)
	{
		return !PRIMITIVES_SUCCESS;
	}

	if (roi->width % 16 || (ULONG_PTR)pSrc % 16 || srcStep % 16)
	{
		return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
	}

	for (y = 0; y < roi->height - 1; y += 2)
	{
		const BYTE* line1 = argb;
		const BYTE* line2 = argb + srcStep;
		ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
		ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
		ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
		argb += 2 * srcStep;
		ydst += 2 * dstStep[0];
		udst += 1 * dstStep[1];
		vdst += 1 * dstStep[2];
	}

	if (roi->height & 1)
	{
		/* for an odd height, pass the same last line twice for UV */
		ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
		ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
	}

	return PRIMITIVES_SUCCESS;
}

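/*
 * Worked example for one 2x2 block (values picked for illustration): four
 * BGRX pixels all equal to (B,G,R) = (0,0,255) average to themselves, so
 *   U = (( 64*0 - 49*0 - 15*255) >> 7) + 128 =  98
 *   V = (( -6*0 - 58*0 + 64*255) >> 7) + 128 = 255
 * while each of the four Y samples is (9*0 + 92*0 + 27*255) >> 7 = 53,
 * i.e. the expected strong-red chroma under the BT.709 factors above.
 */
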
static pstatus_t ssse3_RGBToYUV420(
    const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
    BYTE* pDst[3], UINT32 dstStep[3],
    const prim_size_t* roi)
{
	switch (srcFormat)
	{
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);

		default:
			return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
	}
}

#elif defined(WITH_NEON)

static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl,
                                   int16x4_t Dh, int16x4_t Dl,
                                   int16x4_t Eh, int16x4_t El)
{
	/* R = (256 * Y + 403 * (V - 128)) >> 8 */
	const int16x4_t c403 = vdup_n_s16(403);
	const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
	const int32x4_t CEl = vmlal_s16(Cl, El, c403);
	const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
	const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
	const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
	return vqmovun_s16(R);
}

static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl,
                                   int16x4_t Dh, int16x4_t Dl,
                                   int16x4_t Eh, int16x4_t El)
{
	/* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
	const int16x4_t c48 = vdup_n_s16(48);
	const int16x4_t c120 = vdup_n_s16(120);
	const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
	const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
	const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
	const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
	const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
	const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
	const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
	return vqmovun_s16(G);
}

static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl,
                                   int16x4_t Dh, int16x4_t Dl,
                                   int16x4_t Eh, int16x4_t El)
{
	/* B = (256L * Y + 475 * (U - 128)) >> 8 */
	const int16x4_t c475 = vdup_n_s16(475);
	const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
	const int32x4_t CDl = vmlal_s16(Cl, Dl, c475); /* low half must start from Cl, not Ch */
	const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
	const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
	const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
	return vqmovun_s16(B);
}

static INLINE pstatus_t neon_YUV420ToX(
    const BYTE* pSrc[3], const UINT32 srcStep[3],
    BYTE* pDst, UINT32 dstStep,
    const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos,
    const uint8_t bPos, const uint8_t aPos)
{
	UINT32 y;
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	const int16x8_t c128 = vdupq_n_s16(128);

	for (y = 0; y < nHeight; y += 2)
	{
		const uint8_t* pY1 = pSrc[0] + y * srcStep[0];
		const uint8_t* pY2 = pY1 + srcStep[0];
		const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];
		const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];
		uint8_t* pRGB1 = pDst + y * dstStep;
		uint8_t* pRGB2 = pRGB1 + dstStep;
		UINT32 x;
		const BOOL lastY = y >= nHeight - 1;

		for (x = 0; x < nWidth;)
		{
			const BOOL lastX = (nWidth - x) < 16;
			const uint8x8_t Uraw = vld1_u8(pU);
			const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw);
			const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0]));
			const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1]));
			const uint8x8_t Vraw = vld1_u8(pV);
			const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw);
			const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0]));
			const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1]));
			const int16x8_t D1 = vsubq_s16(U1, c128);
			const int16x4_t D1h = vget_high_s16(D1);
			const int16x4_t D1l = vget_low_s16(D1);
			const int16x8_t E1 = vsubq_s16(V1, c128);
			const int16x4_t E1h = vget_high_s16(E1);
			const int16x4_t E1l = vget_low_s16(E1);
			const int16x8_t D2 = vsubq_s16(U2, c128);
			const int16x4_t D2h = vget_high_s16(D2);
			const int16x4_t D2l = vget_low_s16(D2);
			const int16x8_t E2 = vsubq_s16(V2, c128);
			const int16x4_t E2h = vget_high_s16(E2);
			const int16x4_t E2l = vget_low_s16(E2);
			uint8x8x4_t bgrx;
			bgrx.val[aPos] = vdup_n_u8(0xFF);
			{
				const uint8x8_t Y1u = vld1_u8(pY1);
				const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
				const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y1)), 256); /* Y * 256 */
				const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y1)), 256); /* Y * 256 */
				bgrx.val[rPos] = neon_YUV2R(Ch, Cl, D1h, D1l, E1h, E1l);
				bgrx.val[gPos] = neon_YUV2G(Ch, Cl, D1h, D1l, E1h, E1l);
				bgrx.val[bPos] = neon_YUV2B(Ch, Cl, D1h, D1l, E1h, E1l);
				vst4_u8(pRGB1, bgrx);
				pRGB1 += 32;
				pY1 += 8;
				x += 8;
			}

			if (!lastX)
			{
				const uint8x8_t Y1u = vld1_u8(pY1);
				const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));
				const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y1)), 256); /* Y * 256 */
				const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y1)), 256); /* Y * 256 */
				bgrx.val[rPos] = neon_YUV2R(Ch, Cl, D2h, D2l, E2h, E2l);
				bgrx.val[gPos] = neon_YUV2G(Ch, Cl, D2h, D2l, E2h, E2l);
				bgrx.val[bPos] = neon_YUV2B(Ch, Cl, D2h, D2l, E2h, E2l);
				vst4_u8(pRGB1, bgrx);
				pRGB1 += 32;
				pY1 += 8;
				x += 8;
			}

			if (!lastY)
			{
				const uint8x8_t Y2u = vld1_u8(pY2);
				const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
				const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y2)), 256); /* Y * 256 */
				const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y2)), 256); /* Y * 256 */
				bgrx.val[rPos] = neon_YUV2R(Ch, Cl, D1h, D1l, E1h, E1l);
				bgrx.val[gPos] = neon_YUV2G(Ch, Cl, D1h, D1l, E1h, E1l);
				bgrx.val[bPos] = neon_YUV2B(Ch, Cl, D1h, D1l, E1h, E1l);
				vst4_u8(pRGB2, bgrx);
				pRGB2 += 32;
				pY2 += 8;

				if (!lastX)
				{
					const uint8x8_t Y2u = vld1_u8(pY2);
					const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));
					const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y2)), 256); /* Y * 256 */
					const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y2)), 256); /* Y * 256 */
					bgrx.val[rPos] = neon_YUV2R(Ch, Cl, D2h, D2l, E2h, E2l);
					bgrx.val[gPos] = neon_YUV2G(Ch, Cl, D2h, D2l, E2h, E2l);
					bgrx.val[bPos] = neon_YUV2B(Ch, Cl, D2h, D2l, E2h, E2l);
					vst4_u8(pRGB2, bgrx);
					pRGB2 += 32;
					pY2 += 8;
				}
			}

			pU += 8;
			pV += 8;
		}
	}

	return PRIMITIVES_SUCCESS;
}

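/*
 * The vzip_u8(Uraw, Uraw) trick above duplicates every 4:2:0 chroma sample
 * horizontally, so eight U bytes become sixteen covering sixteen output
 * pixels. A scalar model (a sketch, excluded from the build):
 */
#if 0
static void duplicate_chroma_model(const BYTE in[8], BYTE out[16])
{
	int i;

	for (i = 0; i < 8; i++)
	{
		out[2 * i + 0] = in[i]; /* left pixel of the pair */
		out[2 * i + 1] = in[i]; /* right pixel shares the same sample */
	}
}
#endif
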
static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(
    const BYTE* pSrc[3], const UINT32 srcStep[3],
    BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
    const prim_size_t* roi)
{
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

		default:
			return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}

static INLINE pstatus_t neon_YUV444ToX(
    const BYTE* pSrc[3], const UINT32 srcStep[3],
    BYTE* pDst, UINT32 dstStep,
    const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos,
    const uint8_t bPos, const uint8_t aPos)
{
	UINT32 y;
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	const UINT32 yPad = srcStep[0] - roi->width;
	const UINT32 uPad = srcStep[1] - roi->width;
	const UINT32 vPad = srcStep[2] - roi->width;
	const UINT32 dPad = dstStep - roi->width * 4;
	const uint8_t* pY = pSrc[0];
	const uint8_t* pU = pSrc[1];
	const uint8_t* pV = pSrc[2];
	uint8_t* pRGB = pDst;
	const int16x8_t c128 = vdupq_n_s16(128);
	const int16x4_t c48 = vdup_n_s16(48);
	const int16x4_t c120 = vdup_n_s16(120);
	const int16x4_t c403 = vdup_n_s16(403);
	const int16x4_t c475 = vdup_n_s16(475);
	const DWORD pad = nWidth % 8;

	for (y = 0; y < nHeight; y++)
	{
		UINT32 x;

		for (x = 0; x < nWidth - pad; x += 8)
		{
			const uint8x8_t Yu = vld1_u8(pY);
			const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu));
			const uint8x8_t Uu = vld1_u8(pU);
			const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu));
			const uint8x8_t Vu = vld1_u8(pV);
			const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu));
			/* Do the calculations on Y at 32-bit width; the result of 255 * 256
			 * does not fit into a signed 16-bit value. */
			const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */
			const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256); /* Y * 256 */
			const int16x8_t D = vsubq_s16(U, c128);
			const int16x4_t Dh = vget_high_s16(D);
			const int16x4_t Dl = vget_low_s16(D);
			const int16x8_t E = vsubq_s16(V, c128);
			const int16x4_t Eh = vget_high_s16(E);
			const int16x4_t El = vget_low_s16(E);
			uint8x8x4_t bgrx;
			{
				/* B = (256L * Y + 475 * (U - 128)) >> 8 */
				const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);
				const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);
				const int32x4_t Bh = vrshrq_n_s32(CDh, 8);
				const int32x4_t Bl = vrshrq_n_s32(CDl, 8);
				const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));
				bgrx.val[bPos] = vqmovun_s16(B);
			}
			{
				/* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */
				const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);
				const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);
				const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);
				const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);
				const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);
				const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);
				const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));
				bgrx.val[gPos] = vqmovun_s16(G);
			}
			{
				/* R = (256 * Y + 403 * (V - 128)) >> 8 */
				const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);
				const int32x4_t CEl = vmlal_s16(Cl, El, c403);
				const int32x4_t Rh = vrshrq_n_s32(CEh, 8);
				const int32x4_t Rl = vrshrq_n_s32(CEl, 8);
				const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));
				bgrx.val[rPos] = vqmovun_s16(R);
			}
			{
				/* A */
				bgrx.val[aPos] = vdup_n_u8(0xFF);
			}
			vst4_u8(pRGB, bgrx);
			pRGB += 32;
			pY += 8;
			pU += 8;
			pV += 8;
		}

		for (x = 0; x < pad; x++)
		{
			const BYTE Y = *pY++;
			const BYTE U = *pU++;
			const BYTE V = *pV++;
			const BYTE r = YUV2R(Y, U, V);
			const BYTE g = YUV2G(Y, U, V);
			const BYTE b = YUV2B(Y, U, V);
			pRGB[aPos] = 0xFF;
			pRGB[rPos] = r;
			pRGB[gPos] = g;
			pRGB[bPos] = b;
			pRGB += 4;
		}

		pRGB += dPad;
		pY += yPad;
		pU += uPad;
		pV += vPad;
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(
    const BYTE* pSrc[3], const UINT32 srcStep[3],
    BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
    const prim_size_t* roi)
{
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

		default:
			return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}

static pstatus_t neon_YUV420CombineToYUV444(
    const BYTE* pMainSrc[3], const UINT32 srcMainStep[3],
    const BYTE* pAuxSrc[3], const UINT32 srcAuxStep[3],
    BYTE* pDst[3], const UINT32 dstStep[3],
    const prim_size_t* roi)
{
	UINT32 x, y;
	const UINT32 nWidth = roi->width;
	const UINT32 nHeight = roi->height;
	const UINT32 halfWidth = nWidth / 2, halfHeight = nHeight / 2;
	const UINT32 oddY = 1;
	const UINT32 evenY = 0;
	/* The auxiliary frame is aligned to multiples of 16x16.
	 * We need the padded height for the B4 and B5 conversion. */
	const UINT32 padHeight = roi->height + 16 - roi->height % 16;

	if (pMainSrc && pMainSrc[0] && pMainSrc[1] && pMainSrc[2])
	{
		/* Y data is already here... */
		/* B1 */
		for (y = 0; y < nHeight; y++)
		{
			const BYTE* Ym = pMainSrc[0] + srcMainStep[0] * y;
			BYTE* pY = pDst[0] + dstStep[0] * y;
			memcpy(pY, Ym, nWidth);
		}

		/* The first half of U and V is already part of this frame. */
		/* B2 and B3 */
		for (y = 0; y < halfHeight; y++)
		{
			const UINT32 val2y = (2 * y + evenY);
			const BYTE* Um = pMainSrc[1] + srcMainStep[1] * y;
			const BYTE* Vm = pMainSrc[2] + srcMainStep[2] * y;
			BYTE* pU = pDst[1] + dstStep[1] * val2y;
			BYTE* pV = pDst[2] + dstStep[2] * val2y;
			BYTE* pU1 = pU + dstStep[1];
			BYTE* pV1 = pV + dstStep[2];

			for (x = 0; x + 16 < halfWidth; x += 16)
			{
				{
					const uint8x16_t u = vld1q_u8(Um);
					uint8x16x2_t u2x;
					u2x.val[0] = u;
					u2x.val[1] = u;
					vst2q_u8(pU, u2x);
					vst2q_u8(pU1, u2x);
					Um += 16;
					pU += 32;
					pU1 += 32;
				}
				{
					const uint8x16_t v = vld1q_u8(Vm);
					uint8x16x2_t v2x;
					v2x.val[0] = v;
					v2x.val[1] = v;
					vst2q_u8(pV, v2x);
					vst2q_u8(pV1, v2x);
					Vm += 16;
					pV += 32;
					pV1 += 32;
				}
			}

			for (; x < halfWidth; x++)
			{
				const BYTE u = *Um++;
				const BYTE v = *Vm++;
				*pU++ = u;
				*pU++ = u;
				*pU1++ = u;
				*pU1++ = u;
				*pV++ = v;
				*pV++ = v;
				*pV1++ = v;
				*pV1++ = v;
			}
		}
	}

	if (!pAuxSrc || !pAuxSrc[0] || !pAuxSrc[1] || !pAuxSrc[2])
		return PRIMITIVES_SUCCESS;

	/* The second half of U and V is a bit more tricky... */
	/* B4 and B5 */
	for (y = 0; y < padHeight; y += 16)
	{
		const BYTE* Ya = pAuxSrc[0] + srcAuxStep[0] * y;
		UINT32 x;
		BYTE* pU = pDst[1] + dstStep[1] * (y + 1);
		BYTE* pV = pDst[2] + dstStep[2] * (y + 1);

		for (x = 0; x < 8; x++)
		{
			if (y + x >= nHeight)
				continue;

			memcpy(pU, Ya, nWidth);
			pU += dstStep[1] * 2;
			Ya += srcAuxStep[0];
		}

		for (x = 0; x < 8; x++)
		{
			if (y + x >= nHeight)
				continue;

			memcpy(pV, Ya, nWidth);
			pV += dstStep[1] * 2;
			Ya += srcAuxStep[0];
		}
	}

	/* B6 and B7 */
	for (y = 0; y < halfHeight; y++)
	{
		const UINT32 val2y = (y * 2 + evenY);
		const BYTE* Ua = pAuxSrc[1] + srcAuxStep[1] * y;
		const BYTE* Va = pAuxSrc[2] + srcAuxStep[2] * y;
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
		BYTE* pV = pDst[2] + dstStep[2] * val2y;

		for (x = 0; x + 16 < halfWidth; x += 16)
		{
			{
				const uint8x16_t u = vld1q_u8(Ua);
				uint8x16x2_t uu = vld2q_u8(pU);
				uu.val[1] = u;
				vst2q_u8(pU, uu);
				Ua += 16;
				pU += 32;
			}
			{
				const uint8x16_t v = vld1q_u8(Va);
				uint8x16x2_t vv = vld2q_u8(pV);
				vv.val[1] = v;
				vst2q_u8(pV, vv);
				Va += 16;
				pV += 32;
			}
		}

		for (; x < halfWidth; x++)
		{
			pU++;
			*pU++ = *Ua++;
			pV++;
			*pV++ = *Va++;
		}
	}

	/* Filter */
	for (y = 0; y < halfHeight; y++)
	{
		const UINT32 val2y = (y * 2 + evenY);
		const UINT32 val2y1 = val2y + oddY;
		BYTE* pU = pDst[1] + dstStep[1] * val2y;
		BYTE* pV = pDst[2] + dstStep[2] * val2y;
		BYTE* pU1 = pU + dstStep[1];
		BYTE* pV1 = pV + dstStep[2];

		if (val2y1 > nHeight)
			continue;

		for (x = 0; x + 16 < halfWidth; x += 16)
		{
			{
				/* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */
				uint8x8x2_t u = vld2_u8(pU);
				const int16x8_t up = vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* U2x,2y << 2 */
				const uint8x8x2_t u1 = vld2_u8(pU1);
				const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */
				const int16x8_t us = vreinterpretq_s16_u16(vaddw_u8(usub,
				                     u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */
				const int16x8_t un = vsubq_s16(up, us);
				const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */
				u.val[0] = u8;
				vst2_u8(pU, u);
				pU += 16;
				pU1 += 16;
			}
			{
				/* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */
				uint8x8x2_t v = vld2_u8(pV);
				const int16x8_t vp = vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* V2x,2y << 2 */
				const uint8x8x2_t v1 = vld2_u8(pV1);
				const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */
				const int16x8_t vs = vreinterpretq_s16_u16(vaddw_u8(vsub,
				                     v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */
				const int16x8_t vn = vsubq_s16(vp, vs);
				const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */
				v.val[0] = v8;
				vst2_u8(pV, v);
				pV += 16;
				pV1 += 16;
			}
		}

		for (; x < halfWidth; x++)
		{
			const UINT32 val2x = (x * 2);
			const UINT32 val2x1 = val2x + 1;
			const INT32 up = pU[val2x] * 4;
			const INT32 vp = pV[val2x] * 4;
			INT32 u2020;
			INT32 v2020;

			if (val2x1 > nWidth)
				continue;

			u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];
			v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];
			*pU = CLIP(u2020);
			*pV = CLIP(v2020);
			pU += 2;
			pV += 2;
		}
	}

	return PRIMITIVES_SUCCESS;
}

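/*
 * On the "Filter" stage above: interpreting the main view's chroma as the
 * rounded 2x2 average and the auxiliary views as the other three samples,
 * the remaining sample is recovered as
 *
 *   U(2x,2y) = 4 * avg - U(2x+1,2y) - U(2x,2y+1) - U(2x+1,2y+1)
 *
 * which is what both the vector loop and the scalar tail compute; the
 * identity is exact up to the encoder's rounding of the average.
 */
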
#endif

void primitives_init_YUV_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_YUV(prims);
#ifdef WITH_SSE2

	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)
	    && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
		prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
		prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
	}

#elif defined(WITH_NEON)

	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;
		prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
		prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
	}

#endif
}

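/*
 * Typical use: the optimized entry points are not called directly but are
 * reached through the primitives table. A minimal sketch (the wrapper name
 * is illustrative; it assumes the usual FreeRDP initialization has already
 * populated the table):
 */
#if 0
static void convert_frame(const BYTE* yuv[3], const UINT32 steps[3],
                          BYTE* rgb, UINT32 rgbStep, const prim_size_t* roi)
{
	primitives_t* prims = primitives_get();
	/* Dispatches to the SSSE3/NEON variant when available, else generic. */
	prims->YUV420ToRGB_8u_P3AC4R(yuv, steps, rgb, rgbStep, PIXEL_FORMAT_BGRX32, roi);
}
#endif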