[primitives] refactor, split sse/neon/opencl

This commit is contained in:
akallabeth 2024-06-17 11:51:03 +02:00
parent 561d2b32c6
commit 3cecd1de06
No known key found for this signature in database
GPG Key ID: A49454A3FC909FD5
38 changed files with 1014 additions and 558 deletions

View File

@ -2,63 +2,86 @@
set(PRIMITIVES_SRCS
prim_add.c
prim_add.h
prim_andor.c
prim_andor.h
prim_alphaComp.c
prim_alphaComp.h
prim_colors.c
prim_colors.h
prim_copy.c
prim_copy.h
prim_set.c
prim_set.h
prim_shift.c
prim_shift.h
prim_sign.c
prim_sign.h
prim_YUV.c
prim_YUV.h
prim_YCoCg.c
prim_YCoCg.h
primitives.c
prim_internal.h)
if (WITH_SSE2 OR WITH_NEON)
set(PRIMITIVES_SSE2_SRCS
prim_colors_opt.c
prim_copy_sse.c
prim_copy_avx2.c
prim_set_opt.c)
set(PRIMITIVES_SSE2_SRCS
sse/prim_colors_sse2.c
sse/prim_set_sse2.c
)
set(PRIMITIVES_SSE3_SRCS
prim_add_opt.c
prim_alphaComp_opt.c
prim_andor_opt.c
prim_shift_opt.c)
set(PRIMITIVES_SSE3_SRCS
sse/prim_add_sse3.c
sse/prim_alphaComp_sse3.c
sse/prim_andor_sse3.c
sse/prim_shift_sse3.c
)
set(PRIMITIVES_SSSE3_SRCS
prim_sign_opt.c
prim_YCoCg_opt.c)
set(PRIMITIVES_SSSE3_SRCS
sse/prim_YUV_ssse3.c
sse/prim_sign_ssse3.c
sse/prim_YCoCg_ssse3.c
)
if (WITH_SSE2)
set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS}
prim_YUV_ssse3.c)
endif()
set(PRIMITIVES_SSE4_1_SRCS
sse/prim_copy_sse4_1.c
)
if (WITH_NEON)
set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS}
prim_YUV_neon.c)
endif()
endif()
set(PRIMITIVES_SSE4_2_SRCS
)
set(PRIMITIVES_AVX2_SRCS
sse/prim_copy_avx2.c
)
set(PRIMITIVES_NEON_SRCS
neon/prim_colors_neon.c
neon/prim_YCoCg_neon.c
neon/prim_YUV_neon.c
)
set(PRIMITIVES_OPENCL_SRCS
opencl/prim_YUV_opencl.c
)
if (WITH_OPENCL)
set(PRIMITIVES_OPENCL_SRCS prim_YUV_opencl.c)
freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
freerdp_library_add(OpenCL::OpenCL)
endif()
set(PRIMITIVES_OPT_SRCS
${PRIMITIVES_NEON_SRCS}
${PRIMITIVES_SSE2_SRCS}
${PRIMITIVES_SSE3_SRCS}
${PRIMITIVES_SSSE3_SRCS}
${PRIMITIVES_SSE4_1_SRCS}
${PRIMITIVES_SSE4_2_SRCS}
${PRIMITIVES_AVX2_SRCS}
${PRIMITIVES_OPENCL_SRCS})
set(PRIMITIVES_SRCS ${PRIMITIVES_SRCS} ${PRIMITIVES_OPT_SRCS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_library(freerdp-primitives OBJECT
${PRIMITIVES_SRCS}
)
@ -74,8 +97,15 @@ if(WITH_SSE2)
if (PRIMITIVES_SSSE3_SRCS)
set_source_files_properties(${PRIMITIVES_SSSE3_SRCS} PROPERTIES COMPILE_FLAGS "-mssse3" )
endif()
set_source_files_properties(prim_copy_sse.c PROPERTIES COMPILE_FLAGS "-msse4.1" )
set_source_files_properties(prim_copy_avx2.c PROPERTIES COMPILE_FLAGS "-mavx2" )
if (PRIMITIVES_SSE4_1_SRCS)
set_source_files_properties(${PRIMITIVES_SSE4_1_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.1" )
endif()
if (PRIMITIVES_SSE4_2_SRCS)
set_source_files_properties(${PRIMITIVES_SSE4_2_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.2" )
endif()
if (PRIMITIVES_AVX2_SRCS)
set_source_files_properties(${PRIMITIVES_AVX2_SRCS} PROPERTIES COMPILE_FLAGS "-mavx2" )
endif()
endif()
if(MSVC)

View File

@ -0,0 +1,173 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized YCoCg<->RGB conversion operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#if defined(WITH_NEON)
#include <arm_neon.h>
#endif
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_YCoCg.h"
#if defined(WITH_NEON)
static primitives_t* generic = NULL;
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
{
BYTE* dptr = pDst;
const BYTE* sptr = pSrc;
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
const int8_t cll = shift - 1; /* -1 builds in the /2's */
const UINT32 srcPad = srcStep - (width * 4);
const UINT32 dstPad = dstStep - (width * formatSize);
const UINT32 pad = width % 8;
const uint8x8_t aVal = vdup_n_u8(0xFF);
const int8x8_t cllv = vdup_n_s8(cll);
for (UINT32 y = 0; y < height; y++)
{
for (UINT32 x = 0; x < width - pad; x += 8)
{
/* Note: shifts must be done before sign-conversion. */
const uint8x8x4_t raw = vld4_u8(sptr);
const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
const int16x8_t Cg = vmovl_s8(CgRaw);
const int16x8_t Co = vmovl_s8(CoRaw);
const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
const int16x8_t T = vsubq_s16(Y, Cg);
const int16x8_t R = vaddq_s16(T, Co);
const int16x8_t G = vaddq_s16(Y, Cg);
const int16x8_t B = vsubq_s16(T, Co);
uint8x8x4_t bgrx;
bgrx.val[bPos] = vqmovun_s16(B);
bgrx.val[gPos] = vqmovun_s16(G);
bgrx.val[rPos] = vqmovun_s16(R);
if (alpha)
bgrx.val[aPos] = raw.val[3];
else
bgrx.val[aPos] = aVal;
vst4_u8(dptr, bgrx);
sptr += sizeof(raw);
dptr += sizeof(bgrx);
}
for (UINT32 x = 0; x < pad; x++)
{
/* Note: shifts must be done before sign-conversion. */
const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
const INT16 T = Y - Cg;
const INT16 R = T + Co;
const INT16 G = Y + Cg;
const INT16 B = T - Co;
BYTE bgra[4];
bgra[bPos] = CLIP(B);
bgra[gPos] = CLIP(G);
bgra[rPos] = CLIP(R);
bgra[aPos] = *sptr++;
if (!alpha)
bgra[aPos] = 0xFF;
*dptr++ = bgra[0];
*dptr++ = bgra[1];
*dptr++ = bgra[2];
*dptr++ = bgra[3];
}
sptr += srcPad;
dptr += dstPad;
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
{
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 2, 1, 0, 3, withAlpha);
case PIXEL_FORMAT_BGRX32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 2, 1, 0, 3, withAlpha);
case PIXEL_FORMAT_RGBA32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 0, 1, 2, 3, withAlpha);
case PIXEL_FORMAT_RGBX32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 0, 1, 2, 3, withAlpha);
case PIXEL_FORMAT_ARGB32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 1, 2, 3, 0, withAlpha);
case PIXEL_FORMAT_XRGB32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 1, 2, 3, 0, withAlpha);
case PIXEL_FORMAT_ABGR32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 3, 2, 1, 0, withAlpha);
case PIXEL_FORMAT_XBGR32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 3, 2, 1, 0, withAlpha);
default:
return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
height, shift, withAlpha);
}
}
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_NEON)
generic = primitives_get_generic();
primitives_init_YCoCg(prims);
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
{
prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -28,11 +28,9 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_YUV.h"
#if !defined(WITH_NEON)
#error "This file must only be included if WITH_NEON is active!"
#endif
#if defined(WITH_NEON)
#include <arm_neon.h>
static primitives_t* generic = NULL;
@ -742,9 +740,11 @@ static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
return -1;
}
}
#endif
void primitives_init_YUV_opt(primitives_t* prims)
void primitives_init_YUV_neon(primitives_t* prims)
{
#if defined(WITH_NEON)
generic = primitives_get_generic();
primitives_init_YUV(prims);
@ -754,4 +754,7 @@ void primitives_init_YUV_opt(primitives_t* prims)
prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -0,0 +1,365 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Color conversion operations.
* vi:ts=4 sw=4:
*
* Copyright 2011 Stephen Erisman
* Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#if defined(WITH_NEON)
#include <arm_neon.h>
#endif
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_colors.h"
/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
static primitives_t* generic = NULL;
static pstatus_t
neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
/* TODO: If necessary, check alignments and call the general version. */
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
int16x8_t c4096 = vdupq_n_s16(4096);
int16x8_t* y_buf = (int16x8_t*)pSrc[0];
int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
int16x8_t* r_buf = (int16x8_t*)pDst[0];
int16x8_t* g_buf = (int16x8_t*)pDst[1];
int16x8_t* b_buf = (int16x8_t*)pDst[2];
int srcbump = srcStep / sizeof(int16x8_t);
int dstbump = dstStep / sizeof(int16x8_t);
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
for (int yp = 0; yp < roi->height; ++yp)
{
for (int i = 0; i < imax; i++)
{
/*
In order to use NEON signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without loosing information.
The result of this multiplication is 32 bit and we have a NEON instruction
that returns the hi word of the saturated double.
Thus we will multiply the factors by the highest possible 2^n, take the
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
shift by 1 to reverse the doubling) and correct this result by multiplying it
by 2^(16-n).
For the given factors in the conversion matrix the best possible n is 14.
Example for calculating r:
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_buf[i] + 4096) >> 2 */
int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
y = vaddq_s16(y, c4096);
y = vshrq_n_s16(y, 2);
/* cb = cb_buf[i]; */
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
/* cr = cr_buf[i]; */
int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
/* (y + HIWORD(cr*22986)) >> 3 */
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
r = vshrq_n_s16(r, 3);
/* r_buf[i] = CLIP(r); */
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*)&r_buf[i], r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
g = vshrq_n_s16(g, 3);
/* g_buf[i] = CLIP(g); */
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*)&g_buf[i], g);
/* (y + HIWORD(cb*28999)) >> 3 */
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
b = vshrq_n_s16(b, 3);
/* b_buf[i] = CLIP(b); */
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*)&b_buf[i], b);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3],
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
UINT32 dstStep,
const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
BYTE* pRGB = pDst;
const INT16* pY = pSrc[0];
const INT16* pCb = pSrc[1];
const INT16* pCr = pSrc[2];
const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
const size_t pad = roi->width % 8;
const int16x4_t c4096 = vdup_n_s16(4096);
for (UINT32 y = 0; y < roi->height; y++)
{
for (UINT32 x = 0; x < roi->width - pad; x += 8)
{
const int16x8_t Y = vld1q_s16(pY);
const int16x4_t Yh = vget_high_s16(Y);
const int16x4_t Yl = vget_low_s16(Y);
const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
const int16x8_t Cr = vld1q_s16(pCr);
const int16x4_t Crh = vget_high_s16(Cr);
const int16x4_t Crl = vget_low_s16(Cr);
const int16x8_t Cb = vld1q_s16(pCb);
const int16x4_t Cbh = vget_high_s16(Cb);
const int16x4_t Cbl = vget_low_s16(Cb);
uint8x8x4_t bgrx;
{
/* R */
const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
bgrx.val[rPos] = vqmovun_s16(Rs);
}
{
/* G */
const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
const uint8x8_t G = vqmovun_s16(Gs);
bgrx.val[gPos] = G;
}
{
/* B */
const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
const uint8x8_t B = vqmovun_s16(Bs);
bgrx.val[bPos] = B;
}
/* A */
{
bgrx.val[aPos] = vdup_n_u8(0xFF);
}
vst4_u8(pRGB, bgrx);
pY += 8;
pCb += 8;
pCr += 8;
pRGB += 32;
}
for (UINT32 x = 0; x < pad; x++)
{
const INT32 divisor = 16;
const INT32 Y = ((*pY++) + 4096) << divisor;
const INT32 Cb = (*pCb++);
const INT32 Cr = (*pCr++);
const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
BYTE bgrx[4];
bgrx[bPos] = CLIP(B);
bgrx[gPos] = CLIP(G);
bgrx[rPos] = CLIP(R);
bgrx[aPos] = 0xFF;
*pRGB++ = bgrx[0];
*pRGB++ = bgrx[1];
*pRGB++ = bgrx[2];
*pRGB++ = bgrx[3];
}
pY += srcPad;
pCb += srcPad;
pCr += srcPad;
pRGB += dstPad;
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi)
{
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
case PIXEL_FORMAT_BGRX32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
case PIXEL_FORMAT_RGBA32:
case PIXEL_FORMAT_RGBX32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
case PIXEL_FORMAT_ARGB32:
case PIXEL_FORMAT_XRGB32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
case PIXEL_FORMAT_ABGR32:
case PIXEL_FORMAT_XBGR32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
default:
return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
}
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
UINT32 srcStep, /* bytes between rows in source data */
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
UINT32 dstStep, /* bytes between rows in dest data */
const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
UINT32 pad = roi->width % 8;
for (UINT32 y = 0; y < roi->height; y++)
{
const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
BYTE* dst = pDst + y * dstStep;
for (UINT32 x = 0; x < roi->width - pad; x += 8)
{
int16x8_t r = vld1q_s16(pr);
int16x8_t g = vld1q_s16(pg);
int16x8_t b = vld1q_s16(pb);
uint8x8x4_t bgrx;
bgrx.val[aPos] = vdup_n_u8(0xFF);
bgrx.val[rPos] = vqmovun_s16(r);
bgrx.val[gPos] = vqmovun_s16(g);
bgrx.val[bPos] = vqmovun_s16(b);
vst4_u8(dst, bgrx);
pr += 8;
pg += 8;
pb += 8;
dst += 32;
}
for (UINT32 x = 0; x < pad; x++)
{
BYTE bgrx[4];
bgrx[bPos] = *pb++;
bgrx[gPos] = *pg++;
bgrx[rPos] = *pr++;
bgrx[aPos] = 0xFF;
*dst++ = bgrx[0];
*dst++ = bgrx[1];
*dst++ = bgrx[2];
*dst++ = bgrx[3];
}
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
UINT32 srcStep, /* bytes between rows in source data */
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
UINT32 dstStep, /* bytes between rows in dest data */
UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
case PIXEL_FORMAT_BGRX32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
case PIXEL_FORMAT_RGBA32:
case PIXEL_FORMAT_RGBX32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
case PIXEL_FORMAT_ARGB32:
case PIXEL_FORMAT_XRGB32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
case PIXEL_FORMAT_ABGR32:
case PIXEL_FORMAT_XBGR32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
default:
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
}
#endif /* WITH_NEON */
/* ------------------------------------------------------------------------- */
void primitives_init_colors_neon(primitives_t* prims)
{
#if defined(WITH_NEON)
generic = primitives_get_generic();
primitives_init_colors(prims);
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
{
prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -30,7 +30,6 @@
#else
#include <CL/cl.h>
#endif
#endif
#include <freerdp/log.h>
#define TAG FREERDP_TAG("primitives")
@ -481,9 +480,11 @@ static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT p
return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
}
#endif
BOOL primitives_init_opencl(primitives_t* prims)
{
#if defined(WITH_OPENCL)
primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
if (!prims || !p)
return FALSE;
@ -496,5 +497,6 @@ BOOL primitives_init_opencl(primitives_t* prims)
prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
prims->uninit = primitives_uninit_opencl;
#endif
return TRUE;
}

View File

@ -23,6 +23,7 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_YCoCg.h"
/* helper function to convert raw 8 bit values to signed 16bit values.
*/
@ -71,3 +72,9 @@ void primitives_init_YCoCg(primitives_t* prims)
{
prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R;
}
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YCoCg_ssse3(prims);
primitives_init_YCoCg_neon(prims);
}

View File

@ -0,0 +1,31 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_YCoCg_H
#define FREERDP_LIB_PRIM_YCoCg_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims);
void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims);
#endif

View File

@ -29,6 +29,7 @@
#include <freerdp/primitives.h>
#include <freerdp/codec/color.h>
#include "prim_internal.h"
#include "prim_YUV.h"
static pstatus_t general_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
@ -1875,3 +1876,9 @@ void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims)
prims->RGBToAVC444YUV = general_RGBToAVC444YUV;
prims->RGBToAVC444YUVv2 = general_RGBToAVC444YUVv2;
}
void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YUV_ssse3(prims);
primitives_init_YUV_neon(prims);
}

View File

@ -0,0 +1,31 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_YUV_H
#define FREERDP_LIB_PRIM_YUV_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_YUV_ssse3(primitives_t* prims);
void primitives_init_YUV_neon(primitives_t* prims);
#endif

View File

@ -22,6 +22,7 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_add.h"
/* ----------------------------------------------------------------------------
* 16-bit signed add with saturation (under and over).
@ -74,3 +75,8 @@ void primitives_init_add(primitives_t* prims)
prims->add_16s = general_add_16s;
prims->add_16s_inplace = general_add_16s_inplace;
}
void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_add_sse3(prims);
}

View File

@ -0,0 +1,30 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_ADD_H
#define FREERDP_LIB_PRIM_ADD_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_add_sse3(primitives_t* prims);
#endif

View File

@ -25,6 +25,7 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_alphaComp.h"
#define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24)
#define RED(_k_) (((_k_)&0x00FF0000U) >> 16)
@ -91,3 +92,8 @@ void primitives_init_alphaComp(primitives_t* prims)
{
prims->alphaComp_argb = general_alphaComp_argb;
}
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_alphaComp_sse3(prims);
}

View File

@ -0,0 +1,30 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_ALPHA_COMP_H
#define FREERDP_LIB_PRIM_ALPHA_COMP_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_alphaComp_sse3(primitives_t* prims);
#endif

View File

@ -19,6 +19,7 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_andor.h"
/* ----------------------------------------------------------------------------
* 32-bit AND with a constant.
@ -55,3 +56,8 @@ void primitives_init_andor(primitives_t* prims)
prims->andC_32u = general_andC_32u;
prims->orC_32u = general_orC_32u;
}
void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_andor_sse3(prims);
}

View File

@ -0,0 +1,30 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_ANDOR_H
#define FREERDP_LIB_PRIM_ANDOR_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_andor_sse3(primitives_t* prims);
#endif

View File

@ -24,6 +24,7 @@
#include <freerdp/codec/color.h>
#include "prim_internal.h"
#include "prim_colors.h"
#ifndef MINMAX
#define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
@ -507,3 +508,10 @@ void primitives_init_colors(primitives_t* prims)
prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
}
/* ------------------------------------------------------------------------- */
void primitives_init_colors_opt(primitives_t* prims)
{
primitives_init_colors_sse2(prims);
primitives_init_colors_neon(prims);
}

View File

@ -0,0 +1,31 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives colors
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_COLORS_H
#define FREERDP_LIB_PRIM_COLORS_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_colors_sse2(primitives_t* prims);
void primitives_init_colors_neon(primitives_t* prims);
#endif

View File

@ -392,22 +392,8 @@ void primitives_init_copy(primitives_t* prims)
prims->copy_no_overlap = generic_image_copy_no_overlap;
}
#if defined(WITH_SSE2) || defined(WITH_NEON)
void primitives_init_copy_opt(primitives_t* prims)
{
generic = primitives_get_generic();
primitives_init_copy(prims);
/* Pick tuned versions if possible. */
/* Performance with an SSE2 version with no prefetch seemed to be
* all over the map vs. memcpy.
* Sometimes it was significantly faster, sometimes dreadfully slower,
* and it seemed to vary a lot depending on block size and processor.
* Hence, no SSE version is used here unless once can be written that
* is consistently faster than memcpy.
*/
/* This is just an alias with void* parameters */
prims->copy = (__copy_t)(prims->copy_8u);
primitives_init_copy_sse(prims);
primitives_init_copy_sse41(prims);
primitives_init_copy_avx2(prims);
}
#endif

View File

@ -22,6 +22,7 @@
#define FREERDP_LIB_PRIM_COPY_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
pstatus_t generic_image_copy_no_overlap_convert(
@ -37,6 +38,7 @@ pstatus_t generic_image_copy_no_overlap_memcpy(
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset,
UINT32 flags);
extern void primitives_init_copy_sse(primitives_t* prims);
extern void primitives_init_copy_avx2(primitives_t* prims);
void primitives_init_copy_sse41(primitives_t* prims);
void primitives_init_copy_avx2(primitives_t* prims);
#endif

View File

@ -275,7 +275,6 @@ FREERDP_LOCAL void primitives_init_colors(primitives_t* prims);
FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* prims);
FREERDP_LOCAL void primitives_init_YUV(primitives_t* prims);
#if defined(WITH_SSE2) || defined(WITH_NEON)
FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* prims);
FREERDP_LOCAL void primitives_init_set_opt(primitives_t* prims);
FREERDP_LOCAL void primitives_init_add_opt(primitives_t* prims);
@ -286,7 +285,6 @@ FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* prims);
FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* prims);
FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* prims);
FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* prims);
#endif
#if defined(WITH_OPENCL)
FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* prims);

View File

@ -22,6 +22,7 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_set.h"
/* ========================================================================= */
static pstatus_t general_set_8u(BYTE val, BYTE* pDst, UINT32 len)
@ -120,3 +121,8 @@ void primitives_init_set(primitives_t* prims)
prims->set_32u = general_set_32u;
prims->zero = general_zero;
}
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_set_sse2(prims);
}

View File

@ -0,0 +1,30 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_SET_H
#define FREERDP_LIB_PRIM_SET_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_set_sse2(primitives_t* prims);
#endif

View File

@ -19,6 +19,8 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_shift.h"
/* ------------------------------------------------------------------------- */
static INLINE INT16 shift(INT16 val, UINT32 sh)
{
@ -133,3 +135,8 @@ void primitives_init_shift(primitives_t* prims)
prims->shiftC_16s = general_shiftC_16s;
prims->shiftC_16u = general_shiftC_16u;
}
void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_shift_sse3(prims);
}

View File

@ -0,0 +1,30 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_SHIFT_H
#define FREERDP_LIB_PRIM_SHIFT_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
extern void primitives_init_shift_sse3(primitives_t* prims);
#endif

View File

@ -19,6 +19,7 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_sign.h"
/* ----------------------------------------------------------------------------
* Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
@ -40,3 +41,8 @@ void primitives_init_sign(primitives_t* prims)
/* Start with the default. */
prims->sign_16s = general_sign_16s;
}
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_sign_ssse3(prims);
}

View File

@ -0,0 +1,30 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_SIGN_H
#define FREERDP_LIB_PRIM_SIGN_H
#include <winpr/wtypes.h>
#include <freerdp/config.h>
#include <freerdp/primitives.h>
void primitives_init_sign_ssse3(primitives_t* prims);
#endif

View File

@ -23,19 +23,19 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_YCoCg.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */
#endif
#include "prim_internal.h"
#include "prim_templates.h"
#ifdef WITH_SSE2
static primitives_t* generic = NULL;
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
@ -411,9 +411,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
@ -437,153 +435,22 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT3
height, shift, withAlpha);
}
}
#elif defined(WITH_NEON)
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
{
BYTE* dptr = pDst;
const BYTE* sptr = pSrc;
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
const int8_t cll = shift - 1; /* -1 builds in the /2's */
const UINT32 srcPad = srcStep - (width * 4);
const UINT32 dstPad = dstStep - (width * formatSize);
const UINT32 pad = width % 8;
const uint8x8_t aVal = vdup_n_u8(0xFF);
const int8x8_t cllv = vdup_n_s8(cll);
for (UINT32 y = 0; y < height; y++)
{
for (UINT32 x = 0; x < width - pad; x += 8)
{
/* Note: shifts must be done before sign-conversion. */
const uint8x8x4_t raw = vld4_u8(sptr);
const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
const int16x8_t Cg = vmovl_s8(CgRaw);
const int16x8_t Co = vmovl_s8(CoRaw);
const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
const int16x8_t T = vsubq_s16(Y, Cg);
const int16x8_t R = vaddq_s16(T, Co);
const int16x8_t G = vaddq_s16(Y, Cg);
const int16x8_t B = vsubq_s16(T, Co);
uint8x8x4_t bgrx;
bgrx.val[bPos] = vqmovun_s16(B);
bgrx.val[gPos] = vqmovun_s16(G);
bgrx.val[rPos] = vqmovun_s16(R);
if (alpha)
bgrx.val[aPos] = raw.val[3];
else
bgrx.val[aPos] = aVal;
vst4_u8(dptr, bgrx);
sptr += sizeof(raw);
dptr += sizeof(bgrx);
}
for (UINT32 x = 0; x < pad; x++)
{
/* Note: shifts must be done before sign-conversion. */
const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
const INT16 T = Y - Cg;
const INT16 R = T + Co;
const INT16 G = Y + Cg;
const INT16 B = T - Co;
BYTE bgra[4];
bgra[bPos] = CLIP(B);
bgra[gPos] = CLIP(G);
bgra[rPos] = CLIP(R);
bgra[aPos] = *sptr++;
if (!alpha)
bgra[aPos] = 0xFF;
*dptr++ = bgra[0];
*dptr++ = bgra[1];
*dptr++ = bgra[2];
*dptr++ = bgra[3];
}
sptr += srcPad;
dptr += dstPad;
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
{
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 2, 1, 0, 3, withAlpha);
case PIXEL_FORMAT_BGRX32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 2, 1, 0, 3, withAlpha);
case PIXEL_FORMAT_RGBA32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 0, 1, 2, 3, withAlpha);
case PIXEL_FORMAT_RGBX32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 0, 1, 2, 3, withAlpha);
case PIXEL_FORMAT_ARGB32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 1, 2, 3, 0, withAlpha);
case PIXEL_FORMAT_XRGB32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 1, 2, 3, 0, withAlpha);
case PIXEL_FORMAT_ABGR32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 3, 2, 1, 0, withAlpha);
case PIXEL_FORMAT_XBGR32:
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
shift, 3, 2, 1, 0, withAlpha);
default:
return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
height, shift, withAlpha);
}
}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_YCoCg(prims);
/* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
* include any routines to work with it, especially with variable shift
* width.
*/
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
}
#elif defined(WITH_NEON)
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
{
prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
}
#endif /* WITH_SSE2 */
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -29,14 +29,12 @@
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_YUV.h"
#if defined(WITH_SSE2)
#include <emmintrin.h>
#include <tmmintrin.h>
#if !defined(WITH_SSE2)
#error "This file needs WITH_SSE2 enabled!"
#endif
static primitives_t* generic = NULL;
/****************************************************************************/
@ -1496,9 +1494,11 @@ static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
return -1;
}
}
#endif
void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_YUV(prims);
@ -1512,4 +1512,7 @@ void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -20,6 +20,8 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_add.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
@ -28,9 +30,9 @@
#include "prim_internal.h"
#include "prim_templates.h"
#ifdef WITH_SSE2
static primitives_t* generic = NULL;
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
@ -174,12 +176,12 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_add(prims);
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
{
@ -187,5 +189,7 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
prims->add_16s_inplace = sse3_add_16s_inplace;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -26,6 +26,8 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_alphaComp.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
@ -33,10 +35,9 @@
#include "prim_internal.h"
static primitives_t* generic = NULL;
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
static primitives_t* generic = NULL;
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
@ -208,11 +209,11 @@ static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 sr
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_alphaComp(prims);
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
@ -220,5 +221,7 @@ void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
prims->alphaComp_argb = sse2_alphaComp_argb;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -19,6 +19,8 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_andor.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
@ -27,9 +29,9 @@
#include "prim_internal.h"
#include "prim_templates.h"
#ifdef WITH_SSE2
static primitives_t* generic = NULL;
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
*dptr++ = *sptr++ & val)
@ -37,11 +39,11 @@ SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_andor(prims);
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
@ -50,5 +52,7 @@ void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
prims->orC_32u = sse3_orC_32u;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -23,18 +23,17 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_colors.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */
#endif /* WITH_SSE2 */
#include "prim_internal.h"
#include "prim_templates.h"
static primitives_t* generic = NULL;
#ifdef WITH_SSE2
static primitives_t* generic = NULL;
#ifdef __GNUC__
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@ -1246,329 +1245,11 @@ sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit
}
#endif /* WITH_SSE2 */
/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
static pstatus_t
neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
/* TODO: If necessary, check alignments and call the general version. */
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
int16x8_t c4096 = vdupq_n_s16(4096);
int16x8_t* y_buf = (int16x8_t*)pSrc[0];
int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
int16x8_t* r_buf = (int16x8_t*)pDst[0];
int16x8_t* g_buf = (int16x8_t*)pDst[1];
int16x8_t* b_buf = (int16x8_t*)pDst[2];
int srcbump = srcStep / sizeof(int16x8_t);
int dstbump = dstStep / sizeof(int16x8_t);
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
for (int yp = 0; yp < roi->height; ++yp)
{
for (int i = 0; i < imax; i++)
{
/*
In order to use NEON signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without loosing information.
The result of this multiplication is 32 bit and we have a NEON instruction
that returns the hi word of the saturated double.
Thus we will multiply the factors by the highest possible 2^n, take the
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
shift by 1 to reverse the doubling) and correct this result by multiplying it
by 2^(16-n).
For the given factors in the conversion matrix the best possible n is 14.
Example for calculating r:
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_buf[i] + 4096) >> 2 */
int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
y = vaddq_s16(y, c4096);
y = vshrq_n_s16(y, 2);
/* cb = cb_buf[i]; */
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
/* cr = cr_buf[i]; */
int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
/* (y + HIWORD(cr*22986)) >> 3 */
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
r = vshrq_n_s16(r, 3);
/* r_buf[i] = CLIP(r); */
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*)&r_buf[i], r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
g = vshrq_n_s16(g, 3);
/* g_buf[i] = CLIP(g); */
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*)&g_buf[i], g);
/* (y + HIWORD(cb*28999)) >> 3 */
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
b = vshrq_n_s16(b, 3);
/* b_buf[i] = CLIP(b); */
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*)&b_buf[i], b);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3],
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
UINT32 dstStep,
const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
BYTE* pRGB = pDst;
const INT16* pY = pSrc[0];
const INT16* pCb = pSrc[1];
const INT16* pCr = pSrc[2];
const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
const size_t pad = roi->width % 8;
const int16x4_t c4096 = vdup_n_s16(4096);
for (UINT32 y = 0; y < roi->height; y++)
{
for (UINT32 x = 0; x < roi->width - pad; x += 8)
{
const int16x8_t Y = vld1q_s16(pY);
const int16x4_t Yh = vget_high_s16(Y);
const int16x4_t Yl = vget_low_s16(Y);
const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
const int16x8_t Cr = vld1q_s16(pCr);
const int16x4_t Crh = vget_high_s16(Cr);
const int16x4_t Crl = vget_low_s16(Cr);
const int16x8_t Cb = vld1q_s16(pCb);
const int16x4_t Cbh = vget_high_s16(Cb);
const int16x4_t Cbl = vget_low_s16(Cb);
uint8x8x4_t bgrx;
{
/* R */
const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
bgrx.val[rPos] = vqmovun_s16(Rs);
}
{
/* G */
const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
const uint8x8_t G = vqmovun_s16(Gs);
bgrx.val[gPos] = G;
}
{
/* B */
const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
const uint8x8_t B = vqmovun_s16(Bs);
bgrx.val[bPos] = B;
}
/* A */
{
bgrx.val[aPos] = vdup_n_u8(0xFF);
}
vst4_u8(pRGB, bgrx);
pY += 8;
pCb += 8;
pCr += 8;
pRGB += 32;
}
for (UINT32 x = 0; x < pad; x++)
{
const INT32 divisor = 16;
const INT32 Y = ((*pY++) + 4096) << divisor;
const INT32 Cb = (*pCb++);
const INT32 Cr = (*pCr++);
const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
BYTE bgrx[4];
bgrx[bPos] = CLIP(B);
bgrx[gPos] = CLIP(G);
bgrx[rPos] = CLIP(R);
bgrx[aPos] = 0xFF;
*pRGB++ = bgrx[0];
*pRGB++ = bgrx[1];
*pRGB++ = bgrx[2];
*pRGB++ = bgrx[3];
}
pY += srcPad;
pCb += srcPad;
pCr += srcPad;
pRGB += dstPad;
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi)
{
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
case PIXEL_FORMAT_BGRX32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
case PIXEL_FORMAT_RGBA32:
case PIXEL_FORMAT_RGBX32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
case PIXEL_FORMAT_ARGB32:
case PIXEL_FORMAT_XRGB32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
case PIXEL_FORMAT_ABGR32:
case PIXEL_FORMAT_XBGR32:
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
default:
return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
}
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
UINT32 srcStep, /* bytes between rows in source data */
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
UINT32 dstStep, /* bytes between rows in dest data */
const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
UINT32 pad = roi->width % 8;
for (UINT32 y = 0; y < roi->height; y++)
{
const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
BYTE* dst = pDst + y * dstStep;
for (UINT32 x = 0; x < roi->width - pad; x += 8)
{
int16x8_t r = vld1q_s16(pr);
int16x8_t g = vld1q_s16(pg);
int16x8_t b = vld1q_s16(pb);
uint8x8x4_t bgrx;
bgrx.val[aPos] = vdup_n_u8(0xFF);
bgrx.val[rPos] = vqmovun_s16(r);
bgrx.val[gPos] = vqmovun_s16(g);
bgrx.val[bPos] = vqmovun_s16(b);
vst4_u8(dst, bgrx);
pr += 8;
pg += 8;
pb += 8;
dst += 32;
}
for (UINT32 x = 0; x < pad; x++)
{
BYTE bgrx[4];
bgrx[bPos] = *pb++;
bgrx[gPos] = *pg++;
bgrx[rPos] = *pr++;
bgrx[aPos] = 0xFF;
*dst++ = bgrx[0];
*dst++ = bgrx[1];
*dst++ = bgrx[2];
*dst++ = bgrx[3];
}
}
return PRIMITIVES_SUCCESS;
}
static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
UINT32 srcStep, /* bytes between rows in source data */
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
UINT32 dstStep, /* bytes between rows in dest data */
UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
case PIXEL_FORMAT_BGRX32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
case PIXEL_FORMAT_RGBA32:
case PIXEL_FORMAT_RGBX32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
case PIXEL_FORMAT_ARGB32:
case PIXEL_FORMAT_XRGB32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
case PIXEL_FORMAT_ABGR32:
case PIXEL_FORMAT_XBGR32:
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
default:
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
}
}
#endif /* WITH_NEON */
/* I don't see a direct IPP version of this, since the input is INT16
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
* But that would likely be slower.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_colors_opt(primitives_t* prims)
void primitives_init_colors_sse2(primitives_t* prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_colors(prims);
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
{
@ -1578,14 +1259,7 @@ void primitives_init_colors_opt(primitives_t* prims)
prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
}
#elif defined(WITH_NEON)
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
{
prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
}
#endif /* WITH_SSE2 */
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -268,7 +268,7 @@ static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_copy_sse(primitives_t* prims)
void primitives_init_copy_sse41(primitives_t* prims)
{
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))

View File

@ -26,11 +26,12 @@
#endif /* WITH_SSE2 */
#include "prim_internal.h"
static primitives_t* generic = NULL;
#include "prim_set.h"
/* ========================================================================= */
#ifdef WITH_SSE2
static primitives_t* generic = NULL;
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
{
BYTE byte = 0;
@ -216,14 +217,13 @@ static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_set(prims);
/* Pick tuned versions if possible. */
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
{
prims->set_8u = sse2_set_8u;
@ -231,5 +231,7 @@ void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
prims->set_32u = sse2_set_32u;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -19,6 +19,8 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_shift.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
@ -27,9 +29,9 @@
#include "prim_internal.h"
#include "prim_templates.h"
#ifdef WITH_SSE2
static primitives_t* generic = NULL;
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16,
*dptr++ = (INT16)((UINT16)*sptr++ << val))
@ -142,11 +144,11 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
*/
/* ------------------------------------------------------------------------- */
void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_shift(prims);
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
@ -158,5 +160,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
prims->rShiftC_16u = sse2_rShiftC_16u;
}
#else
WINPR_UNUSED(prims);
#endif
}

View File

@ -19,16 +19,18 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#ifdef WITH_SSE2
#include "prim_sign.h"
#if defined(WITH_SSE2)
#include <emmintrin.h>
#include <tmmintrin.h>
#endif /* WITH_SSE2 */
#endif
#include "prim_internal.h"
#if defined(WITH_SSE2)
static primitives_t* generic = NULL;
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
UINT32 len)
@ -167,13 +169,13 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(WITH_SSE2)
generic = primitives_get_generic();
primitives_init_sign(prims);
/* Pick tuned versions if possible. */
/* I didn't spot an IPP version of this. */
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
@ -181,5 +183,7 @@ void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
prims->sign_16s = ssse3_sign_16s;
}
#else
WINPR_UNUSED(prims);
#endif
}