Merge pull request #3743 from nfedera/ssse3_rgbx_to_yuv420
primitives: added SSSE3 rgb to yuv420 encoder
This commit is contained in:
commit
1b522e8d90
@ -1,7 +1,11 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Generic YUV/RGB conversion operations
|
||||
*
|
||||
* Copyright 2014 Marc-Andre Moreau <marcandre.moreau@gmail.com>
|
||||
* Copyright 2015-2017 Armin Novak <armin.novak@thincast.com>
|
||||
* Copyright 2015-2017 Vic Lee
|
||||
* Copyright 2015-2017 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
@ -1,12 +1,25 @@
|
||||
/** function for converting YUV420p data to the RGB format (but without any special upconverting)
|
||||
* It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
|
||||
* The target dstStep (6th parameter) must be a multiple of 16.
|
||||
* srcStep[0] must be (target dstStep) / 4 or bigger and srcStep[1] the next multiple of four
|
||||
* of the half of srcStep[0] or bigger
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Optimized YUV/RGB conversion operations
|
||||
*
|
||||
* Copyright 2014 Thomas Erbesdobler
|
||||
* Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>
|
||||
* Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>
|
||||
* Copyright 2016-2017 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
@ -25,10 +38,15 @@ static primitives_t* generic = NULL;
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R_BGRX(
|
||||
const BYTE** pSrc, const UINT32* srcStep,
|
||||
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
||||
const prim_size_t* roi)
|
||||
|
||||
/****************************************************************************/
|
||||
/* SSSE3 YUV420 -> RGB conversion */
|
||||
/****************************************************************************/
|
||||
|
||||
static pstatus_t ssse3_YUV420ToRGB_BGRX(
|
||||
const BYTE** pSrc, const UINT32* srcStep,
|
||||
BYTE* pDst, UINT32 dstStep, UINT32 dstFormat,
|
||||
const prim_size_t* roi)
|
||||
{
|
||||
UINT32 lastRow, lastCol;
|
||||
BYTE* UData, *VData, *YData;
|
||||
@ -321,21 +339,269 @@ static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R_BGRX(
|
||||
_aligned_free(buffer);
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE** pSrc, const UINT32* srcStep,
|
||||
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
||||
const prim_size_t* roi)
|
||||
|
||||
static pstatus_t ssse3_YUV420ToRGB(
|
||||
const BYTE** pSrc, const UINT32* srcStep,
|
||||
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
||||
const prim_size_t* roi)
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return ssse3_YUV420ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
|
||||
default:
|
||||
return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/****************************************************************************/
|
||||
/* SSSE3 RGB -> YUV420 conversion **/
|
||||
/****************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* Note (nfedera):
|
||||
* The used forward transformation factors from RGB to YUV are based on the
|
||||
* values specified in [Rec. ITU-R BT.709-6] Section 3:
|
||||
* http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
|
||||
*
|
||||
* Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0;
|
||||
* U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128;
|
||||
* V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128;
|
||||
*
|
||||
* The most accurate integer arithmetic approximation when using 8-bit signed
|
||||
* integer factors with 16-bit signed integer intermediate results is:
|
||||
*
|
||||
* Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 );
|
||||
* U = ( (-15 * R - 49 * G + 64 * B) >> 7 ) + 128;
|
||||
* V = ( ( 64 * R - 58 * G - 6 * B) >> 7 ) + 128;
|
||||
*
|
||||
*/
|
||||
|
||||
/*
 * Per-byte multipliers used with _mm_maddubs_epi16 by the RGB -> YUV420
 * encoder. Each table is laid out in BGRX byte order (B, G, R, X) and repeats
 * the same four factors for the four pixels held by one 128-bit register.
 * The X/A byte is always weighted with 0.
 *
 * The negative U and V entries are stored as their two's complement byte
 * values; _mm_maddubs_epi16 reads this operand as signed 8-bit integers.
 */

/* Y = (9 * B + 92 * G + 27 * R) >> 7 */
PRIM_ALIGN_128 static const BYTE bgrx_y_factors[] = {
	  9,  92,  27,   0,
	  9,  92,  27,   0,
	  9,  92,  27,   0,
	  9,  92,  27,   0
};

/* U = ((64 * B - 49 * G - 15 * R) >> 7) + 128 */
PRIM_ALIGN_128 static const BYTE bgrx_u_factors[] = {
	 64, -49, -15,   0,
	 64, -49, -15,   0,
	 64, -49, -15,   0,
	 64, -49, -15,   0
};

/* V = ((-6 * B - 58 * G + 64 * R) >> 7) + 128 */
PRIM_ALIGN_128 static const BYTE bgrx_v_factors[] = {
	 -6, -58,  64,   0,
	 -6, -58,  64,   0,
	 -6, -58,  64,   0,
	 -6, -58,  64,   0
};

/* sixteen bytes of 128: the chroma offset added after packing U and V */
PRIM_ALIGN_128 static const BYTE const_buf_128b[] = {
	128, 128, 128, 128,
	128, 128, 128, 128,
	128, 128, 128, 128,
	128, 128, 128, 128
};
|
||||
|
||||
/*
|
||||
TODO:
|
||||
RGB[AX] can simply be supported using the following factors. And instead of loading the
|
||||
globals directly the functions below could be passed pointers to the correct vectors
|
||||
depending on the source picture format.
|
||||
|
||||
PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = {
|
||||
27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0
|
||||
};
|
||||
PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = {
|
||||
-15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0
|
||||
};
|
||||
PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = {
|
||||
64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
/* compute the luma (Y) component from a single rgb source line */
|
||||
|
||||
/**
 * Compute the luma (Y) values for a single line of BGRX pixels.
 *
 * Every loop iteration reads 16 pixels (four 128-bit registers) from src
 * using aligned loads and writes 16 Y bytes to dst using one unaligned
 * store. The loop always advances in steps of 16 pixels, so the caller is
 * expected to pass a 16-byte aligned src and a width that is a multiple
 * of 16.
 *
 * @param src   source line, 4 bytes per pixel in BGRX order
 * @param dst   destination Y line, 1 byte per pixel
 * @param width number of pixels to convert
 */
static INLINE void ssse3_RGBToYUV420_BGRX_Y(
    const BYTE* src, BYTE* dst, UINT32 width)
{
	const __m128i factors = _mm_load_si128((const __m128i*) bgrx_y_factors);
	const __m128i* in = (const __m128i*) src;
	__m128i* out = (__m128i*) dst;
	UINT32 done;

	for (done = 0; done < width; done += 16)
	{
		/* fetch 16 pixels, four per register */
		const __m128i px0 = _mm_load_si128(in++);
		const __m128i px1 = _mm_load_si128(in++);
		const __m128i px2 = _mm_load_si128(in++);
		const __m128i px3 = _mm_load_si128(in++);

		/* per pixel: (B * 9 + G * 92) and (R * 27 + X * 0) as 16-bit words */
		const __m128i part0 = _mm_maddubs_epi16(px0, factors);
		const __m128i part1 = _mm_maddubs_epi16(px1, factors);
		const __m128i part2 = _mm_maddubs_epi16(px2, factors);
		const __m128i part3 = _mm_maddubs_epi16(px3, factors);

		/* add the two partial sums of every pixel: 8 totals per register */
		const __m128i sumLo = _mm_hadd_epi16(part0, part1);
		const __m128i sumHi = _mm_hadd_epi16(part2, part3);

		/* scale back down by the factor denominator (128) */
		const __m128i yLo = _mm_srli_epi16(sumLo, 7);
		const __m128i yHi = _mm_srli_epi16(sumHi, 7);

		/* narrow the 16 words to bytes and write them to the Y line */
		_mm_storeu_si128(out++, _mm_packus_epi16(yLo, yHi));
	}
}
|
||||
|
||||
/* compute the chrominance (UV) components from two rgb source lines */
|
||||
|
||||
/**
 * Compute the chroma (U and V) values from two lines of BGRX pixels.
 *
 * Every loop iteration reads 16 pixels from each of the two source lines
 * with aligned loads, averages them down to 8 pixels (first vertically, then
 * horizontally) and writes 8 U bytes to dst1 and 8 V bytes to dst2. The loop
 * always advances in steps of 16 pixels, so the caller is expected to pass
 * 16-byte aligned source lines and a width that is a multiple of 16.
 *
 * @param src1  first source line, 4 bytes per pixel in BGRX order
 * @param src2  second source line, same layout as src1
 * @param dst1  destination U line, 1 byte per two source pixels
 * @param dst2  destination V line, 1 byte per two source pixels
 * @param width number of source pixels per line
 */
static INLINE void ssse3_RGBToYUV420_BGRX_UV(
    const BYTE* src1, const BYTE* src2,
    BYTE* dst1, BYTE* dst2, UINT32 width)
{
	const __m128i bias = _mm_load_si128((const __m128i*) const_buf_128b);
	const __m128i ufac = _mm_load_si128((const __m128i*) bgrx_u_factors);
	const __m128i vfac = _mm_load_si128((const __m128i*) bgrx_v_factors);
	const __m128i* top = (const __m128i*) src1;
	const __m128i* bottom = (const __m128i*) src2;
	__m64* uout = (__m64*) dst1;
	__m64* vout = (__m64*) dst2;
	UINT32 done;

	for (done = 0; done < width; done += 16)
	{
		__m128i avg0, avg1, avg2, avg3;
		__m128i lower, even, odd, left, right;
		__m128i u0, u1, v0, v1, usum, vsum, uv;

		/* vertical step: average both lines, 16x2 pixels -> 16x1 pixels */
		avg0 = _mm_load_si128(top++);
		lower = _mm_load_si128(bottom++);
		avg0 = _mm_avg_epu8(avg0, lower);

		avg1 = _mm_load_si128(top++);
		lower = _mm_load_si128(bottom++);
		avg1 = _mm_avg_epu8(avg1, lower);

		avg2 = _mm_load_si128(top++);
		lower = _mm_load_si128(bottom++);
		avg2 = _mm_avg_epu8(avg2, lower);

		avg3 = _mm_load_si128(top++);
		lower = _mm_load_si128(bottom++);
		avg3 = _mm_avg_epu8(avg3, lower);

		/*
		 * horizontal step: average neighbouring pixels, 16x1 -> 8x1 pixels
		 *
		 * shuffle control 0x88 (10 00 10 00) picks a[0],a[2],b[0],b[2],
		 * shuffle control 0xdd (11 01 11 01) picks a[1],a[3],b[1],b[3]
		 */
		even = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(avg0), _mm_castsi128_ps(avg1), 0x88));
		odd = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(avg0), _mm_castsi128_ps(avg1), 0xdd));
		left = _mm_avg_epu8(odd, even);

		even = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(avg2), _mm_castsi128_ps(avg3), 0x88));
		odd = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(avg2), _mm_castsi128_ps(avg3), 0xdd));
		right = _mm_avg_epu8(odd, even);

		/* multiply by the chroma factors: two partial sums per pixel */
		u0 = _mm_maddubs_epi16(left, ufac);
		u1 = _mm_maddubs_epi16(right, ufac);
		v0 = _mm_maddubs_epi16(left, vfac);
		v1 = _mm_maddubs_epi16(right, vfac);

		/* add the partial sums: 8 U totals and 8 V totals */
		usum = _mm_hadd_epi16(u0, u1);
		vsum = _mm_hadd_epi16(v0, v1);

		/* arithmetic shift keeps the sign while dividing by 128 */
		usum = _mm_srai_epi16(usum, 7);
		vsum = _mm_srai_epi16(vsum, 7);

		/* narrow to signed bytes: U in the lower half, V in the upper half */
		uv = _mm_packs_epi16(usum, vsum);

		/* add the 128 offset */
		uv = _mm_add_epi8(uv, bias);

		/* lower 8 bytes go to the U line, upper 8 bytes to the V line */
		_mm_storel_pi(uout++, _mm_castsi128_ps(uv));
		_mm_storeh_pi(vout++, _mm_castsi128_ps(uv));
	}
}
|
||||
|
||||
static pstatus_t ssse3_RGBToYUV420_BGRX(
|
||||
const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
|
||||
BYTE* pDst[3], UINT32 dstStep[3],
|
||||
const prim_size_t* roi)
|
||||
{
|
||||
UINT32 y;
|
||||
const BYTE* argb = pSrc;
|
||||
BYTE* ydst = pDst[0];
|
||||
BYTE* udst = pDst[1];
|
||||
BYTE* vdst = pDst[2];
|
||||
|
||||
if (roi->height < 1 || roi->width < 1)
|
||||
{
|
||||
return !PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
|
||||
{
|
||||
return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
|
||||
}
|
||||
|
||||
for (y = 0; y < roi->height-1; y+=2)
|
||||
{
|
||||
const BYTE* line1 = argb;
|
||||
const BYTE* line2 = argb + srcStep;
|
||||
|
||||
ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width);
|
||||
ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width);
|
||||
ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width);
|
||||
|
||||
argb += 2 * srcStep;
|
||||
ydst += 2 * dstStep[0];
|
||||
udst += 1 * dstStep[1];
|
||||
vdst += 1 * dstStep[2];
|
||||
}
|
||||
|
||||
if (roi->height & 1)
|
||||
{
|
||||
/* pass the same last line of an odd height twice for UV */
|
||||
ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width);
|
||||
ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t ssse3_RGBToYUV420(
|
||||
const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep,
|
||||
BYTE* pDst[3], UINT32 dstStep[3],
|
||||
const prim_size_t* roi)
|
||||
{
|
||||
switch (srcFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
|
||||
default:
|
||||
return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
void primitives_init_YUV_opt(primitives_t* prims)
|
||||
@ -347,7 +613,8 @@ void primitives_init_YUV_opt(primitives_t* prims)
|
||||
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)
|
||||
&& IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB_8u_P3AC4R;
|
||||
prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420;
|
||||
prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -28,6 +28,15 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/api.h>
|
||||
|
||||
|
||||
/*
 * PRIM_ALIGN_128: declaration prefix requesting 16-byte (128-bit) alignment,
 * as needed for objects read with aligned SSE loads.
 *
 * GNU-compatible compilers and Windows compilers keep their native spelling;
 * any other compiler that implements C11 falls back to _Alignas. When none of
 * these apply the macro stays undefined, so a use fails at compile time
 * instead of silently producing a misaligned object.
 */
#if defined(__GNUC__)
#define PRIM_ALIGN_128 __attribute__((aligned(16)))
#elif defined(_WIN32)
#define PRIM_ALIGN_128 __declspec(align(16))
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#define PRIM_ALIGN_128 _Alignas(16)
#endif
|
||||
|
||||
/* Use lddqu for unaligned; load for 16-byte aligned. */
|
||||
#define LOAD_SI128(_ptr_) \
|
||||
(((ULONG_PTR) (_ptr_) & 0x0f) \
|
||||
|
@ -83,8 +83,8 @@ static void get_size(UINT32* width, UINT32* height)
|
||||
winpr_RAND((BYTE*)width, sizeof(*width));
|
||||
winpr_RAND((BYTE*)height, sizeof(*height));
|
||||
// TODO: Algorithm only works on even resolutions...
|
||||
*width = (*width % 64) << 1;
|
||||
*height = (*height % 64 << 1);
|
||||
*width = (*width % 64 + 1) << 1;
|
||||
*height = (*height % 64 + 1) << 1;
|
||||
}
|
||||
|
||||
static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding,
|
||||
@ -567,14 +567,20 @@ int TestPrimitivesYUV(int argc, char* argv[])
|
||||
|
||||
for (x = 0; x < 10; x++)
|
||||
{
|
||||
if (!TestPrimitiveYUV(TRUE))
|
||||
if (!TestPrimitiveYUV(TRUE)) {
|
||||
printf("TestPrimitiveYUV (444) failed.\n");
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (!TestPrimitiveYUV(FALSE))
|
||||
if (!TestPrimitiveYUV(FALSE)) {
|
||||
printf("TestPrimitiveYUV (420) failed.\n");
|
||||
goto end;
|
||||
}
|
||||
|
||||
if (!TestPrimitiveYUVCombine())
|
||||
if (!TestPrimitiveYUVCombine()) {
|
||||
printf("TestPrimitiveYUVCombine failed.\n");
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
|
||||
rc = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user