[primitives] refactor, split sse/neon/opencl
This commit is contained in:
parent
561d2b32c6
commit
3cecd1de06
@ -2,63 +2,86 @@
|
||||
|
||||
set(PRIMITIVES_SRCS
|
||||
prim_add.c
|
||||
prim_add.h
|
||||
prim_andor.c
|
||||
prim_andor.h
|
||||
prim_alphaComp.c
|
||||
prim_alphaComp.h
|
||||
prim_colors.c
|
||||
prim_colors.h
|
||||
prim_copy.c
|
||||
prim_copy.h
|
||||
prim_set.c
|
||||
prim_set.h
|
||||
prim_shift.c
|
||||
prim_shift.h
|
||||
prim_sign.c
|
||||
prim_sign.h
|
||||
prim_YUV.c
|
||||
prim_YUV.h
|
||||
prim_YCoCg.c
|
||||
prim_YCoCg.h
|
||||
primitives.c
|
||||
prim_internal.h)
|
||||
|
||||
if (WITH_SSE2 OR WITH_NEON)
|
||||
set(PRIMITIVES_SSE2_SRCS
|
||||
prim_colors_opt.c
|
||||
prim_copy_sse.c
|
||||
prim_copy_avx2.c
|
||||
prim_set_opt.c)
|
||||
set(PRIMITIVES_SSE2_SRCS
|
||||
sse/prim_colors_sse2.c
|
||||
sse/prim_set_sse2.c
|
||||
)
|
||||
|
||||
set(PRIMITIVES_SSE3_SRCS
|
||||
prim_add_opt.c
|
||||
prim_alphaComp_opt.c
|
||||
prim_andor_opt.c
|
||||
prim_shift_opt.c)
|
||||
set(PRIMITIVES_SSE3_SRCS
|
||||
sse/prim_add_sse3.c
|
||||
sse/prim_alphaComp_sse3.c
|
||||
sse/prim_andor_sse3.c
|
||||
sse/prim_shift_sse3.c
|
||||
)
|
||||
|
||||
set(PRIMITIVES_SSSE3_SRCS
|
||||
prim_sign_opt.c
|
||||
prim_YCoCg_opt.c)
|
||||
set(PRIMITIVES_SSSE3_SRCS
|
||||
sse/prim_YUV_ssse3.c
|
||||
sse/prim_sign_ssse3.c
|
||||
sse/prim_YCoCg_ssse3.c
|
||||
)
|
||||
|
||||
if (WITH_SSE2)
|
||||
set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS}
|
||||
prim_YUV_ssse3.c)
|
||||
endif()
|
||||
set(PRIMITIVES_SSE4_1_SRCS
|
||||
sse/prim_copy_sse4_1.c
|
||||
)
|
||||
|
||||
if (WITH_NEON)
|
||||
set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS}
|
||||
prim_YUV_neon.c)
|
||||
endif()
|
||||
endif()
|
||||
set(PRIMITIVES_SSE4_2_SRCS
|
||||
)
|
||||
|
||||
set(PRIMITIVES_AVX2_SRCS
|
||||
sse/prim_copy_avx2.c
|
||||
)
|
||||
|
||||
set(PRIMITIVES_NEON_SRCS
|
||||
neon/prim_colors_neon.c
|
||||
neon/prim_YCoCg_neon.c
|
||||
neon/prim_YUV_neon.c
|
||||
)
|
||||
|
||||
set(PRIMITIVES_OPENCL_SRCS
|
||||
opencl/prim_YUV_opencl.c
|
||||
)
|
||||
|
||||
if (WITH_OPENCL)
|
||||
set(PRIMITIVES_OPENCL_SRCS prim_YUV_opencl.c)
|
||||
|
||||
freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
|
||||
freerdp_library_add(OpenCL::OpenCL)
|
||||
|
||||
endif()
|
||||
|
||||
set(PRIMITIVES_OPT_SRCS
|
||||
${PRIMITIVES_NEON_SRCS}
|
||||
${PRIMITIVES_SSE2_SRCS}
|
||||
${PRIMITIVES_SSE3_SRCS}
|
||||
${PRIMITIVES_SSSE3_SRCS}
|
||||
${PRIMITIVES_SSE4_1_SRCS}
|
||||
${PRIMITIVES_SSE4_2_SRCS}
|
||||
${PRIMITIVES_AVX2_SRCS}
|
||||
${PRIMITIVES_OPENCL_SRCS})
|
||||
|
||||
set(PRIMITIVES_SRCS ${PRIMITIVES_SRCS} ${PRIMITIVES_OPT_SRCS})
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_library(freerdp-primitives OBJECT
|
||||
${PRIMITIVES_SRCS}
|
||||
)
|
||||
@ -74,8 +97,15 @@ if(WITH_SSE2)
|
||||
if (PRIMITIVES_SSSE3_SRCS)
|
||||
set_source_files_properties(${PRIMITIVES_SSSE3_SRCS} PROPERTIES COMPILE_FLAGS "-mssse3" )
|
||||
endif()
|
||||
set_source_files_properties(prim_copy_sse.c PROPERTIES COMPILE_FLAGS "-msse4.1" )
|
||||
set_source_files_properties(prim_copy_avx2.c PROPERTIES COMPILE_FLAGS "-mavx2" )
|
||||
if (PRIMITIVES_SSE4_1_SRCS)
|
||||
set_source_files_properties(${PRIMITIVES_SSE4_1_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.1" )
|
||||
endif()
|
||||
if (PRIMITIVES_SSE4_2_SRCS)
|
||||
set_source_files_properties(${PRIMITIVES_SSE4_2_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.2" )
|
||||
endif()
|
||||
if (PRIMITIVES_AVX2_SRCS)
|
||||
set_source_files_properties(${PRIMITIVES_AVX2_SRCS} PROPERTIES COMPILE_FLAGS "-mavx2" )
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
|
173
libfreerdp/primitives/neon/prim_YCoCg_neon.c
Normal file
173
libfreerdp/primitives/neon/prim_YCoCg_neon.c
Normal file
@ -0,0 +1,173 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized YCoCg<->RGB conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#if defined(WITH_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
#include "prim_YCoCg.h"
|
||||
|
||||
#if defined(WITH_NEON)
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
|
||||
UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
|
||||
BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
|
||||
{
|
||||
BYTE* dptr = pDst;
|
||||
const BYTE* sptr = pSrc;
|
||||
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
|
||||
const int8_t cll = shift - 1; /* -1 builds in the /2's */
|
||||
const UINT32 srcPad = srcStep - (width * 4);
|
||||
const UINT32 dstPad = dstStep - (width * formatSize);
|
||||
const UINT32 pad = width % 8;
|
||||
const uint8x8_t aVal = vdup_n_u8(0xFF);
|
||||
const int8x8_t cllv = vdup_n_s8(cll);
|
||||
|
||||
for (UINT32 y = 0; y < height; y++)
|
||||
{
|
||||
for (UINT32 x = 0; x < width - pad; x += 8)
|
||||
{
|
||||
/* Note: shifts must be done before sign-conversion. */
|
||||
const uint8x8x4_t raw = vld4_u8(sptr);
|
||||
const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
|
||||
const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
|
||||
const int16x8_t Cg = vmovl_s8(CgRaw);
|
||||
const int16x8_t Co = vmovl_s8(CoRaw);
|
||||
const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
|
||||
const int16x8_t T = vsubq_s16(Y, Cg);
|
||||
const int16x8_t R = vaddq_s16(T, Co);
|
||||
const int16x8_t G = vaddq_s16(Y, Cg);
|
||||
const int16x8_t B = vsubq_s16(T, Co);
|
||||
uint8x8x4_t bgrx;
|
||||
bgrx.val[bPos] = vqmovun_s16(B);
|
||||
bgrx.val[gPos] = vqmovun_s16(G);
|
||||
bgrx.val[rPos] = vqmovun_s16(R);
|
||||
|
||||
if (alpha)
|
||||
bgrx.val[aPos] = raw.val[3];
|
||||
else
|
||||
bgrx.val[aPos] = aVal;
|
||||
|
||||
vst4_u8(dptr, bgrx);
|
||||
sptr += sizeof(raw);
|
||||
dptr += sizeof(bgrx);
|
||||
}
|
||||
|
||||
for (UINT32 x = 0; x < pad; x++)
|
||||
{
|
||||
/* Note: shifts must be done before sign-conversion. */
|
||||
const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
|
||||
const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
|
||||
const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
|
||||
const INT16 T = Y - Cg;
|
||||
const INT16 R = T + Co;
|
||||
const INT16 G = Y + Cg;
|
||||
const INT16 B = T - Co;
|
||||
BYTE bgra[4];
|
||||
bgra[bPos] = CLIP(B);
|
||||
bgra[gPos] = CLIP(G);
|
||||
bgra[rPos] = CLIP(R);
|
||||
bgra[aPos] = *sptr++;
|
||||
|
||||
if (!alpha)
|
||||
bgra[aPos] = 0xFF;
|
||||
|
||||
*dptr++ = bgra[0];
|
||||
*dptr++ = bgra[1];
|
||||
*dptr++ = bgra[2];
|
||||
*dptr++ = bgra[3];
|
||||
}
|
||||
|
||||
sptr += srcPad;
|
||||
dptr += dstPad;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
|
||||
UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 2, 1, 0, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 2, 1, 0, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 0, 1, 2, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 0, 1, 2, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 1, 2, 3, 0, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 1, 2, 3, 0, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 3, 2, 1, 0, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 3, 2, 1, 0, withAlpha);
|
||||
|
||||
default:
|
||||
return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
|
||||
height, shift, withAlpha);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_NEON)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_YCoCg(prims);
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
|
||||
}
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -28,11 +28,9 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_YUV.h"
|
||||
|
||||
#if !defined(WITH_NEON)
|
||||
#error "This file must only be included if WITH_NEON is active!"
|
||||
#endif
|
||||
|
||||
#if defined(WITH_NEON)
|
||||
#include <arm_neon.h>
|
||||
|
||||
static primitives_t* generic = NULL;
|
||||
@ -742,9 +740,11 @@ static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void primitives_init_YUV_opt(primitives_t* prims)
|
||||
void primitives_init_YUV_neon(primitives_t* prims)
|
||||
{
|
||||
#if defined(WITH_NEON)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_YUV(prims);
|
||||
|
||||
@ -754,4 +754,7 @@ void primitives_init_YUV_opt(primitives_t* prims)
|
||||
prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
|
||||
prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
|
||||
}
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
365
libfreerdp/primitives/neon/prim_colors_neon.c
Normal file
365
libfreerdp/primitives/neon/prim_colors_neon.c
Normal file
@ -0,0 +1,365 @@
|
||||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Optimized Color conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* Copyright 2011 Stephen Erisman
|
||||
* Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
|
||||
* Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#if defined(WITH_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
#include "prim_colors.h"
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
#ifdef WITH_NEON
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
static pstatus_t
|
||||
neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
|
||||
INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
/* TODO: If necessary, check alignments and call the general version. */
|
||||
int16x8_t zero = vdupq_n_s16(0);
|
||||
int16x8_t max = vdupq_n_s16(255);
|
||||
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
|
||||
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
|
||||
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
|
||||
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
|
||||
int16x8_t c4096 = vdupq_n_s16(4096);
|
||||
int16x8_t* y_buf = (int16x8_t*)pSrc[0];
|
||||
int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
|
||||
int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
|
||||
int16x8_t* r_buf = (int16x8_t*)pDst[0];
|
||||
int16x8_t* g_buf = (int16x8_t*)pDst[1];
|
||||
int16x8_t* b_buf = (int16x8_t*)pDst[2];
|
||||
int srcbump = srcStep / sizeof(int16x8_t);
|
||||
int dstbump = dstStep / sizeof(int16x8_t);
|
||||
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
|
||||
|
||||
for (int yp = 0; yp < roi->height; ++yp)
|
||||
{
|
||||
for (int i = 0; i < imax; i++)
|
||||
{
|
||||
/*
|
||||
In order to use NEON signed 16-bit integer multiplication we need to convert
|
||||
the floating point factors to signed int without loosing information.
|
||||
The result of this multiplication is 32 bit and we have a NEON instruction
|
||||
that returns the hi word of the saturated double.
|
||||
Thus we will multiply the factors by the highest possible 2^n, take the
|
||||
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
|
||||
shift by 1 to reverse the doubling) and correct this result by multiplying it
|
||||
by 2^(16-n).
|
||||
For the given factors in the conversion matrix the best possible n is 14.
|
||||
|
||||
Example for calculating r:
|
||||
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
||||
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
||||
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
||||
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
||||
*/
|
||||
/* y = (y_buf[i] + 4096) >> 2 */
|
||||
int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
|
||||
y = vaddq_s16(y, c4096);
|
||||
y = vshrq_n_s16(y, 2);
|
||||
/* cb = cb_buf[i]; */
|
||||
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
|
||||
/* cr = cr_buf[i]; */
|
||||
int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
|
||||
/* (y + HIWORD(cr*22986)) >> 3 */
|
||||
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
|
||||
r = vshrq_n_s16(r, 3);
|
||||
/* r_buf[i] = CLIP(r); */
|
||||
r = vminq_s16(vmaxq_s16(r, zero), max);
|
||||
vst1q_s16((INT16*)&r_buf[i], r);
|
||||
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
||||
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
|
||||
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
|
||||
g = vshrq_n_s16(g, 3);
|
||||
/* g_buf[i] = CLIP(g); */
|
||||
g = vminq_s16(vmaxq_s16(g, zero), max);
|
||||
vst1q_s16((INT16*)&g_buf[i], g);
|
||||
/* (y + HIWORD(cb*28999)) >> 3 */
|
||||
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
|
||||
b = vshrq_n_s16(b, 3);
|
||||
/* b_buf[i] = CLIP(b); */
|
||||
b = vminq_s16(vmaxq_s16(b, zero), max);
|
||||
vst1q_s16((INT16*)&b_buf[i], b);
|
||||
}
|
||||
|
||||
y_buf += srcbump;
|
||||
cb_buf += srcbump;
|
||||
cr_buf += srcbump;
|
||||
r_buf += dstbump;
|
||||
g_buf += dstbump;
|
||||
b_buf += dstbump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3],
|
||||
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
|
||||
UINT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
|
||||
uint8_t gPos, uint8_t bPos, uint8_t aPos)
|
||||
{
|
||||
BYTE* pRGB = pDst;
|
||||
const INT16* pY = pSrc[0];
|
||||
const INT16* pCb = pSrc[1];
|
||||
const INT16* pCr = pSrc[2];
|
||||
const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
|
||||
const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
|
||||
const size_t pad = roi->width % 8;
|
||||
const int16x4_t c4096 = vdup_n_s16(4096);
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; y++)
|
||||
{
|
||||
for (UINT32 x = 0; x < roi->width - pad; x += 8)
|
||||
{
|
||||
const int16x8_t Y = vld1q_s16(pY);
|
||||
const int16x4_t Yh = vget_high_s16(Y);
|
||||
const int16x4_t Yl = vget_low_s16(Y);
|
||||
const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
|
||||
const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
|
||||
const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
|
||||
const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
|
||||
const int16x8_t Cr = vld1q_s16(pCr);
|
||||
const int16x4_t Crh = vget_high_s16(Cr);
|
||||
const int16x4_t Crl = vget_low_s16(Cr);
|
||||
const int16x8_t Cb = vld1q_s16(pCb);
|
||||
const int16x4_t Cbh = vget_high_s16(Cb);
|
||||
const int16x4_t Cbl = vget_low_s16(Cb);
|
||||
uint8x8x4_t bgrx;
|
||||
{
|
||||
/* R */
|
||||
const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
|
||||
const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
|
||||
const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
|
||||
const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
|
||||
const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
|
||||
const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
|
||||
const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
|
||||
bgrx.val[rPos] = vqmovun_s16(Rs);
|
||||
}
|
||||
{
|
||||
/* G */
|
||||
const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
|
||||
const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
|
||||
const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
|
||||
const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
|
||||
const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
|
||||
const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
|
||||
const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
|
||||
const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
|
||||
const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
|
||||
const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
|
||||
const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
|
||||
const uint8x8_t G = vqmovun_s16(Gs);
|
||||
bgrx.val[gPos] = G;
|
||||
}
|
||||
{
|
||||
/* B */
|
||||
const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
|
||||
const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
|
||||
const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
|
||||
const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
|
||||
const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
|
||||
const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
|
||||
const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
|
||||
const uint8x8_t B = vqmovun_s16(Bs);
|
||||
bgrx.val[bPos] = B;
|
||||
}
|
||||
/* A */
|
||||
{
|
||||
bgrx.val[aPos] = vdup_n_u8(0xFF);
|
||||
}
|
||||
vst4_u8(pRGB, bgrx);
|
||||
pY += 8;
|
||||
pCb += 8;
|
||||
pCr += 8;
|
||||
pRGB += 32;
|
||||
}
|
||||
|
||||
for (UINT32 x = 0; x < pad; x++)
|
||||
{
|
||||
const INT32 divisor = 16;
|
||||
const INT32 Y = ((*pY++) + 4096) << divisor;
|
||||
const INT32 Cb = (*pCb++);
|
||||
const INT32 Cr = (*pCr++);
|
||||
const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
|
||||
const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
|
||||
const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
|
||||
const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
|
||||
INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
|
||||
INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
|
||||
INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
|
||||
BYTE bgrx[4];
|
||||
bgrx[bPos] = CLIP(B);
|
||||
bgrx[gPos] = CLIP(G);
|
||||
bgrx[rPos] = CLIP(R);
|
||||
bgrx[aPos] = 0xFF;
|
||||
*pRGB++ = bgrx[0];
|
||||
*pRGB++ = bgrx[1];
|
||||
*pRGB++ = bgrx[2];
|
||||
*pRGB++ = bgrx[3];
|
||||
}
|
||||
|
||||
pY += srcPad;
|
||||
pCb += srcPad;
|
||||
pCr += srcPad;
|
||||
pRGB += dstPad;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
|
||||
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
|
||||
UINT32 dstStep, UINT32 DstFormat,
|
||||
const prim_size_t* WINPR_RESTRICT roi)
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
|
||||
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
|
||||
|
||||
default:
|
||||
return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
}
|
||||
}
|
||||
|
||||
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
|
||||
const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
|
||||
uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
|
||||
{
|
||||
UINT32 pad = roi->width % 8;
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; y++)
|
||||
{
|
||||
const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
|
||||
const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
|
||||
const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
|
||||
BYTE* dst = pDst + y * dstStep;
|
||||
|
||||
for (UINT32 x = 0; x < roi->width - pad; x += 8)
|
||||
{
|
||||
int16x8_t r = vld1q_s16(pr);
|
||||
int16x8_t g = vld1q_s16(pg);
|
||||
int16x8_t b = vld1q_s16(pb);
|
||||
uint8x8x4_t bgrx;
|
||||
bgrx.val[aPos] = vdup_n_u8(0xFF);
|
||||
bgrx.val[rPos] = vqmovun_s16(r);
|
||||
bgrx.val[gPos] = vqmovun_s16(g);
|
||||
bgrx.val[bPos] = vqmovun_s16(b);
|
||||
vst4_u8(dst, bgrx);
|
||||
pr += 8;
|
||||
pg += 8;
|
||||
pb += 8;
|
||||
dst += 32;
|
||||
}
|
||||
|
||||
for (UINT32 x = 0; x < pad; x++)
|
||||
{
|
||||
BYTE bgrx[4];
|
||||
bgrx[bPos] = *pb++;
|
||||
bgrx[gPos] = *pg++;
|
||||
bgrx[rPos] = *pr++;
|
||||
bgrx[aPos] = 0xFF;
|
||||
*dst++ = bgrx[0];
|
||||
*dst++ = bgrx[1];
|
||||
*dst++ = bgrx[2];
|
||||
*dst++ = bgrx[3];
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t
|
||||
neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
UINT32 DstFormat,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
|
||||
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
|
||||
|
||||
default:
|
||||
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
}
|
||||
}
|
||||
#endif /* WITH_NEON */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_colors_neon(primitives_t* prims)
|
||||
{
|
||||
#if defined(WITH_NEON)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_colors(prims);
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
|
||||
prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
|
||||
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
|
||||
}
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -30,7 +30,6 @@
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <freerdp/log.h>
|
||||
#define TAG FREERDP_TAG("primitives")
|
||||
@ -481,9 +480,11 @@ static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT p
|
||||
|
||||
return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
|
||||
}
|
||||
#endif
|
||||
|
||||
BOOL primitives_init_opencl(primitives_t* prims)
|
||||
{
|
||||
#if defined(WITH_OPENCL)
|
||||
primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
|
||||
if (!prims || !p)
|
||||
return FALSE;
|
||||
@ -496,5 +497,6 @@ BOOL primitives_init_opencl(primitives_t* prims)
|
||||
prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
|
||||
prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
|
||||
prims->uninit = primitives_uninit_opencl;
|
||||
#endif
|
||||
return TRUE;
|
||||
}
|
@ -23,6 +23,7 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_YCoCg.h"
|
||||
|
||||
/* helper function to convert raw 8 bit values to signed 16bit values.
|
||||
*/
|
||||
@ -71,3 +72,9 @@ void primitives_init_YCoCg(primitives_t* prims)
|
||||
{
|
||||
prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R;
|
||||
}
|
||||
|
||||
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_YCoCg_ssse3(prims);
|
||||
primitives_init_YCoCg_neon(prims);
|
||||
}
|
||||
|
31
libfreerdp/primitives/prim_YCoCg.h
Normal file
31
libfreerdp/primitives/prim_YCoCg.h
Normal file
@ -0,0 +1,31 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_YCoCg_H
|
||||
#define FREERDP_LIB_PRIM_YCoCg_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims);
|
||||
void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims);
|
||||
|
||||
#endif
|
@ -29,6 +29,7 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <freerdp/codec/color.h>
|
||||
#include "prim_internal.h"
|
||||
#include "prim_YUV.h"
|
||||
|
||||
static pstatus_t general_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
|
||||
const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
|
||||
@ -1875,3 +1876,9 @@ void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->RGBToAVC444YUV = general_RGBToAVC444YUV;
|
||||
prims->RGBToAVC444YUVv2 = general_RGBToAVC444YUVv2;
|
||||
}
|
||||
|
||||
void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_YUV_ssse3(prims);
|
||||
primitives_init_YUV_neon(prims);
|
||||
}
|
||||
|
31
libfreerdp/primitives/prim_YUV.h
Normal file
31
libfreerdp/primitives/prim_YUV.h
Normal file
@ -0,0 +1,31 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_YUV_H
|
||||
#define FREERDP_LIB_PRIM_YUV_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_YUV_ssse3(primitives_t* prims);
|
||||
void primitives_init_YUV_neon(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -22,6 +22,7 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_add.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 16-bit signed add with saturation (under and over).
|
||||
@ -74,3 +75,8 @@ void primitives_init_add(primitives_t* prims)
|
||||
prims->add_16s = general_add_16s;
|
||||
prims->add_16s_inplace = general_add_16s_inplace;
|
||||
}
|
||||
|
||||
void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_add_sse3(prims);
|
||||
}
|
||||
|
30
libfreerdp/primitives/prim_add.h
Normal file
30
libfreerdp/primitives/prim_add.h
Normal file
@ -0,0 +1,30 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_ADD_H
|
||||
#define FREERDP_LIB_PRIM_ADD_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_add_sse3(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -25,6 +25,7 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_alphaComp.h"
|
||||
|
||||
#define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24)
|
||||
#define RED(_k_) (((_k_)&0x00FF0000U) >> 16)
|
||||
@ -91,3 +92,8 @@ void primitives_init_alphaComp(primitives_t* prims)
|
||||
{
|
||||
prims->alphaComp_argb = general_alphaComp_argb;
|
||||
}
|
||||
|
||||
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_alphaComp_sse3(prims);
|
||||
}
|
||||
|
30
libfreerdp/primitives/prim_alphaComp.h
Normal file
30
libfreerdp/primitives/prim_alphaComp.h
Normal file
@ -0,0 +1,30 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_ALPHA_COMP_H
|
||||
#define FREERDP_LIB_PRIM_ALPHA_COMP_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_alphaComp_sse3(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -19,6 +19,7 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_andor.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 32-bit AND with a constant.
|
||||
@ -55,3 +56,8 @@ void primitives_init_andor(primitives_t* prims)
|
||||
prims->andC_32u = general_andC_32u;
|
||||
prims->orC_32u = general_orC_32u;
|
||||
}
|
||||
|
||||
void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_andor_sse3(prims);
|
||||
}
|
||||
|
30
libfreerdp/primitives/prim_andor.h
Normal file
30
libfreerdp/primitives/prim_andor.h
Normal file
@ -0,0 +1,30 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_ANDOR_H
|
||||
#define FREERDP_LIB_PRIM_ANDOR_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_andor_sse3(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -24,6 +24,7 @@
|
||||
#include <freerdp/codec/color.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_colors.h"
|
||||
|
||||
#ifndef MINMAX
|
||||
#define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
|
||||
@ -507,3 +508,10 @@ void primitives_init_colors(primitives_t* prims)
|
||||
prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
|
||||
prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_colors_opt(primitives_t* prims)
|
||||
{
|
||||
primitives_init_colors_sse2(prims);
|
||||
primitives_init_colors_neon(prims);
|
||||
}
|
||||
|
31
libfreerdp/primitives/prim_colors.h
Normal file
31
libfreerdp/primitives/prim_colors.h
Normal file
@ -0,0 +1,31 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives colors
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_COLORS_H
|
||||
#define FREERDP_LIB_PRIM_COLORS_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_colors_sse2(primitives_t* prims);
|
||||
void primitives_init_colors_neon(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -392,22 +392,8 @@ void primitives_init_copy(primitives_t* prims)
|
||||
prims->copy_no_overlap = generic_image_copy_no_overlap;
|
||||
}
|
||||
|
||||
#if defined(WITH_SSE2) || defined(WITH_NEON)
|
||||
void primitives_init_copy_opt(primitives_t* prims)
|
||||
{
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_copy(prims);
|
||||
/* Pick tuned versions if possible. */
|
||||
/* Performance with an SSE2 version with no prefetch seemed to be
|
||||
* all over the map vs. memcpy.
|
||||
* Sometimes it was significantly faster, sometimes dreadfully slower,
|
||||
* and it seemed to vary a lot depending on block size and processor.
|
||||
* Hence, no SSE version is used here unless once can be written that
|
||||
* is consistently faster than memcpy.
|
||||
*/
|
||||
/* This is just an alias with void* parameters */
|
||||
prims->copy = (__copy_t)(prims->copy_8u);
|
||||
primitives_init_copy_sse(prims);
|
||||
primitives_init_copy_sse41(prims);
|
||||
primitives_init_copy_avx2(prims);
|
||||
}
|
||||
#endif
|
||||
|
@ -22,6 +22,7 @@
|
||||
#define FREERDP_LIB_PRIM_COPY_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
pstatus_t generic_image_copy_no_overlap_convert(
|
||||
@ -37,6 +38,7 @@ pstatus_t generic_image_copy_no_overlap_memcpy(
|
||||
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset,
|
||||
UINT32 flags);
|
||||
|
||||
extern void primitives_init_copy_sse(primitives_t* prims);
|
||||
extern void primitives_init_copy_avx2(primitives_t* prims);
|
||||
void primitives_init_copy_sse41(primitives_t* prims);
|
||||
void primitives_init_copy_avx2(primitives_t* prims);
|
||||
|
||||
#endif
|
||||
|
@ -275,7 +275,6 @@ FREERDP_LOCAL void primitives_init_colors(primitives_t* prims);
|
||||
FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* prims);
|
||||
FREERDP_LOCAL void primitives_init_YUV(primitives_t* prims);
|
||||
|
||||
#if defined(WITH_SSE2) || defined(WITH_NEON)
|
||||
FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* prims);
|
||||
FREERDP_LOCAL void primitives_init_set_opt(primitives_t* prims);
|
||||
FREERDP_LOCAL void primitives_init_add_opt(primitives_t* prims);
|
||||
@ -286,7 +285,6 @@ FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* prims);
|
||||
FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* prims);
|
||||
FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* prims);
|
||||
FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* prims);
|
||||
#endif
|
||||
|
||||
#if defined(WITH_OPENCL)
|
||||
FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* prims);
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_set.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
static pstatus_t general_set_8u(BYTE val, BYTE* pDst, UINT32 len)
|
||||
@ -120,3 +121,8 @@ void primitives_init_set(primitives_t* prims)
|
||||
prims->set_32u = general_set_32u;
|
||||
prims->zero = general_zero;
|
||||
}
|
||||
|
||||
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_set_sse2(prims);
|
||||
}
|
||||
|
30
libfreerdp/primitives/prim_set.h
Normal file
30
libfreerdp/primitives/prim_set.h
Normal file
@ -0,0 +1,30 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_SET_H
|
||||
#define FREERDP_LIB_PRIM_SET_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_set_sse2(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -19,6 +19,8 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_shift.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static INLINE INT16 shift(INT16 val, UINT32 sh)
|
||||
{
|
||||
@ -133,3 +135,8 @@ void primitives_init_shift(primitives_t* prims)
|
||||
prims->shiftC_16s = general_shiftC_16s;
|
||||
prims->shiftC_16u = general_shiftC_16u;
|
||||
}
|
||||
|
||||
void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_shift_sse3(prims);
|
||||
}
|
||||
|
30
libfreerdp/primitives/prim_shift.h
Normal file
30
libfreerdp/primitives/prim_shift.h
Normal file
@ -0,0 +1,30 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_SHIFT_H
|
||||
#define FREERDP_LIB_PRIM_SHIFT_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
extern void primitives_init_shift_sse3(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -19,6 +19,7 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_sign.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
|
||||
@ -40,3 +41,8 @@ void primitives_init_sign(primitives_t* prims)
|
||||
/* Start with the default. */
|
||||
prims->sign_16s = general_sign_16s;
|
||||
}
|
||||
|
||||
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
primitives_init_sign_ssse3(prims);
|
||||
}
|
||||
|
30
libfreerdp/primitives/prim_sign.h
Normal file
30
libfreerdp/primitives/prim_sign.h
Normal file
@ -0,0 +1,30 @@
|
||||
/**
|
||||
* FreeRDP: A Remote Desktop Protocol Implementation
|
||||
* Primitives copy
|
||||
*
|
||||
* Copyright 2024 Armin Novak <anovak@thincast.com>
|
||||
* Copyright 2024 Thincast Technologies GmbH
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef FREERDP_LIB_PRIM_SIGN_H
|
||||
#define FREERDP_LIB_PRIM_SIGN_H
|
||||
|
||||
#include <winpr/wtypes.h>
|
||||
#include <freerdp/config.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
void primitives_init_sign_ssse3(primitives_t* prims);
|
||||
|
||||
#endif
|
@ -23,19 +23,19 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_YCoCg.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
#elif defined(WITH_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif /* WITH_SSE2 else WITH_NEON */
|
||||
#endif
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
|
||||
@ -411,9 +411,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
|
||||
@ -437,153 +435,22 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT3
|
||||
height, shift, withAlpha);
|
||||
}
|
||||
}
|
||||
#elif defined(WITH_NEON)
|
||||
|
||||
static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
|
||||
UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
|
||||
BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
|
||||
{
|
||||
BYTE* dptr = pDst;
|
||||
const BYTE* sptr = pSrc;
|
||||
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
|
||||
const int8_t cll = shift - 1; /* -1 builds in the /2's */
|
||||
const UINT32 srcPad = srcStep - (width * 4);
|
||||
const UINT32 dstPad = dstStep - (width * formatSize);
|
||||
const UINT32 pad = width % 8;
|
||||
const uint8x8_t aVal = vdup_n_u8(0xFF);
|
||||
const int8x8_t cllv = vdup_n_s8(cll);
|
||||
|
||||
for (UINT32 y = 0; y < height; y++)
|
||||
{
|
||||
for (UINT32 x = 0; x < width - pad; x += 8)
|
||||
{
|
||||
/* Note: shifts must be done before sign-conversion. */
|
||||
const uint8x8x4_t raw = vld4_u8(sptr);
|
||||
const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
|
||||
const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
|
||||
const int16x8_t Cg = vmovl_s8(CgRaw);
|
||||
const int16x8_t Co = vmovl_s8(CoRaw);
|
||||
const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
|
||||
const int16x8_t T = vsubq_s16(Y, Cg);
|
||||
const int16x8_t R = vaddq_s16(T, Co);
|
||||
const int16x8_t G = vaddq_s16(Y, Cg);
|
||||
const int16x8_t B = vsubq_s16(T, Co);
|
||||
uint8x8x4_t bgrx;
|
||||
bgrx.val[bPos] = vqmovun_s16(B);
|
||||
bgrx.val[gPos] = vqmovun_s16(G);
|
||||
bgrx.val[rPos] = vqmovun_s16(R);
|
||||
|
||||
if (alpha)
|
||||
bgrx.val[aPos] = raw.val[3];
|
||||
else
|
||||
bgrx.val[aPos] = aVal;
|
||||
|
||||
vst4_u8(dptr, bgrx);
|
||||
sptr += sizeof(raw);
|
||||
dptr += sizeof(bgrx);
|
||||
}
|
||||
|
||||
for (UINT32 x = 0; x < pad; x++)
|
||||
{
|
||||
/* Note: shifts must be done before sign-conversion. */
|
||||
const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
|
||||
const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
|
||||
const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
|
||||
const INT16 T = Y - Cg;
|
||||
const INT16 R = T + Co;
|
||||
const INT16 G = Y + Cg;
|
||||
const INT16 B = T - Co;
|
||||
BYTE bgra[4];
|
||||
bgra[bPos] = CLIP(B);
|
||||
bgra[gPos] = CLIP(G);
|
||||
bgra[rPos] = CLIP(R);
|
||||
bgra[aPos] = *sptr++;
|
||||
|
||||
if (!alpha)
|
||||
bgra[aPos] = 0xFF;
|
||||
|
||||
*dptr++ = bgra[0];
|
||||
*dptr++ = bgra[1];
|
||||
*dptr++ = bgra[2];
|
||||
*dptr++ = bgra[3];
|
||||
}
|
||||
|
||||
sptr += srcPad;
|
||||
dptr += dstPad;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
|
||||
BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
|
||||
UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 2, 1, 0, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 2, 1, 0, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 0, 1, 2, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 0, 1, 2, 3, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 1, 2, 3, 0, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 1, 2, 3, 0, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 3, 2, 1, 0, withAlpha);
|
||||
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
|
||||
shift, 3, 2, 1, 0, withAlpha);
|
||||
|
||||
default:
|
||||
return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
|
||||
height, shift, withAlpha);
|
||||
}
|
||||
}
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_YCoCg(prims);
|
||||
/* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
|
||||
* include any routines to work with it, especially with variable shift
|
||||
* width.
|
||||
*/
|
||||
#if defined(WITH_SSE2)
|
||||
|
||||
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
|
||||
}
|
||||
|
||||
#elif defined(WITH_NEON)
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
|
||||
}
|
||||
|
||||
#endif /* WITH_SSE2 */
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -29,14 +29,12 @@
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_YUV.h"
|
||||
|
||||
#if defined(WITH_SSE2)
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
#if !defined(WITH_SSE2)
|
||||
#error "This file needs WITH_SSE2 enabled!"
|
||||
#endif
|
||||
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
/****************************************************************************/
|
||||
@ -1496,9 +1494,11 @@ static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_YUV(prims);
|
||||
|
||||
@ -1512,4 +1512,7 @@ void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
|
||||
prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
|
||||
}
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -20,6 +20,8 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_add.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
@ -28,9 +30,9 @@
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
|
||||
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
||||
@ -174,12 +176,12 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_add(prims);
|
||||
|
||||
#if defined(WITH_SSE2)
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
|
||||
{
|
||||
@ -187,5 +189,7 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->add_16s_inplace = sse3_add_16s_inplace;
|
||||
}
|
||||
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -26,6 +26,8 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_alphaComp.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
@ -33,10 +35,9 @@
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#ifdef WITH_SSE2
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
|
||||
const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
|
||||
@ -208,11 +209,11 @@ static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 sr
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_alphaComp(prims);
|
||||
#if defined(WITH_SSE2)
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
|
||||
@ -220,5 +221,7 @@ void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->alphaComp_argb = sse2_alphaComp_argb;
|
||||
}
|
||||
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -19,6 +19,8 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_andor.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
@ -27,9 +29,9 @@
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
|
||||
*dptr++ = *sptr++ & val)
|
||||
@ -37,11 +39,11 @@ SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_andor(prims);
|
||||
#if defined(WITH_SSE2)
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||
@ -50,5 +52,7 @@ void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->orC_32u = sse3_orC_32u;
|
||||
}
|
||||
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -23,18 +23,17 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_colors.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#include <emmintrin.h>
|
||||
#elif defined(WITH_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif /* WITH_SSE2 else WITH_NEON */
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
@ -1246,329 +1245,11 @@ sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit
|
||||
}
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
#ifdef WITH_NEON
|
||||
static pstatus_t
|
||||
neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
|
||||
INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
/* TODO: If necessary, check alignments and call the general version. */
|
||||
int16x8_t zero = vdupq_n_s16(0);
|
||||
int16x8_t max = vdupq_n_s16(255);
|
||||
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
|
||||
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
|
||||
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
|
||||
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
|
||||
int16x8_t c4096 = vdupq_n_s16(4096);
|
||||
int16x8_t* y_buf = (int16x8_t*)pSrc[0];
|
||||
int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
|
||||
int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
|
||||
int16x8_t* r_buf = (int16x8_t*)pDst[0];
|
||||
int16x8_t* g_buf = (int16x8_t*)pDst[1];
|
||||
int16x8_t* b_buf = (int16x8_t*)pDst[2];
|
||||
int srcbump = srcStep / sizeof(int16x8_t);
|
||||
int dstbump = dstStep / sizeof(int16x8_t);
|
||||
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
|
||||
|
||||
for (int yp = 0; yp < roi->height; ++yp)
|
||||
{
|
||||
for (int i = 0; i < imax; i++)
|
||||
{
|
||||
/*
|
||||
In order to use NEON signed 16-bit integer multiplication we need to convert
|
||||
the floating point factors to signed int without loosing information.
|
||||
The result of this multiplication is 32 bit and we have a NEON instruction
|
||||
that returns the hi word of the saturated double.
|
||||
Thus we will multiply the factors by the highest possible 2^n, take the
|
||||
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
|
||||
shift by 1 to reverse the doubling) and correct this result by multiplying it
|
||||
by 2^(16-n).
|
||||
For the given factors in the conversion matrix the best possible n is 14.
|
||||
|
||||
Example for calculating r:
|
||||
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
||||
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
||||
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
||||
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
||||
*/
|
||||
/* y = (y_buf[i] + 4096) >> 2 */
|
||||
int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
|
||||
y = vaddq_s16(y, c4096);
|
||||
y = vshrq_n_s16(y, 2);
|
||||
/* cb = cb_buf[i]; */
|
||||
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
|
||||
/* cr = cr_buf[i]; */
|
||||
int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
|
||||
/* (y + HIWORD(cr*22986)) >> 3 */
|
||||
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
|
||||
r = vshrq_n_s16(r, 3);
|
||||
/* r_buf[i] = CLIP(r); */
|
||||
r = vminq_s16(vmaxq_s16(r, zero), max);
|
||||
vst1q_s16((INT16*)&r_buf[i], r);
|
||||
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
||||
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
|
||||
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
|
||||
g = vshrq_n_s16(g, 3);
|
||||
/* g_buf[i] = CLIP(g); */
|
||||
g = vminq_s16(vmaxq_s16(g, zero), max);
|
||||
vst1q_s16((INT16*)&g_buf[i], g);
|
||||
/* (y + HIWORD(cb*28999)) >> 3 */
|
||||
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
|
||||
b = vshrq_n_s16(b, 3);
|
||||
/* b_buf[i] = CLIP(b); */
|
||||
b = vminq_s16(vmaxq_s16(b, zero), max);
|
||||
vst1q_s16((INT16*)&b_buf[i], b);
|
||||
}
|
||||
|
||||
y_buf += srcbump;
|
||||
cb_buf += srcbump;
|
||||
cr_buf += srcbump;
|
||||
r_buf += dstbump;
|
||||
g_buf += dstbump;
|
||||
b_buf += dstbump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3],
|
||||
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
|
||||
UINT32 dstStep,
|
||||
const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
|
||||
uint8_t gPos, uint8_t bPos, uint8_t aPos)
|
||||
{
|
||||
BYTE* pRGB = pDst;
|
||||
const INT16* pY = pSrc[0];
|
||||
const INT16* pCb = pSrc[1];
|
||||
const INT16* pCr = pSrc[2];
|
||||
const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
|
||||
const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
|
||||
const size_t pad = roi->width % 8;
|
||||
const int16x4_t c4096 = vdup_n_s16(4096);
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; y++)
|
||||
{
|
||||
for (UINT32 x = 0; x < roi->width - pad; x += 8)
|
||||
{
|
||||
const int16x8_t Y = vld1q_s16(pY);
|
||||
const int16x4_t Yh = vget_high_s16(Y);
|
||||
const int16x4_t Yl = vget_low_s16(Y);
|
||||
const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
|
||||
const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
|
||||
const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
|
||||
const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
|
||||
const int16x8_t Cr = vld1q_s16(pCr);
|
||||
const int16x4_t Crh = vget_high_s16(Cr);
|
||||
const int16x4_t Crl = vget_low_s16(Cr);
|
||||
const int16x8_t Cb = vld1q_s16(pCb);
|
||||
const int16x4_t Cbh = vget_high_s16(Cb);
|
||||
const int16x4_t Cbl = vget_low_s16(Cb);
|
||||
uint8x8x4_t bgrx;
|
||||
{
|
||||
/* R */
|
||||
const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
|
||||
const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
|
||||
const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
|
||||
const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
|
||||
const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
|
||||
const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
|
||||
const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
|
||||
bgrx.val[rPos] = vqmovun_s16(Rs);
|
||||
}
|
||||
{
|
||||
/* G */
|
||||
const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
|
||||
const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
|
||||
const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
|
||||
const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
|
||||
const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
|
||||
const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
|
||||
const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
|
||||
const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
|
||||
const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
|
||||
const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
|
||||
const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
|
||||
const uint8x8_t G = vqmovun_s16(Gs);
|
||||
bgrx.val[gPos] = G;
|
||||
}
|
||||
{
|
||||
/* B */
|
||||
const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
|
||||
const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
|
||||
const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
|
||||
const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
|
||||
const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
|
||||
const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
|
||||
const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
|
||||
const uint8x8_t B = vqmovun_s16(Bs);
|
||||
bgrx.val[bPos] = B;
|
||||
}
|
||||
/* A */
|
||||
{
|
||||
bgrx.val[aPos] = vdup_n_u8(0xFF);
|
||||
}
|
||||
vst4_u8(pRGB, bgrx);
|
||||
pY += 8;
|
||||
pCb += 8;
|
||||
pCr += 8;
|
||||
pRGB += 32;
|
||||
}
|
||||
|
||||
for (UINT32 x = 0; x < pad; x++)
|
||||
{
|
||||
const INT32 divisor = 16;
|
||||
const INT32 Y = ((*pY++) + 4096) << divisor;
|
||||
const INT32 Cb = (*pCb++);
|
||||
const INT32 Cr = (*pCr++);
|
||||
const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
|
||||
const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
|
||||
const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
|
||||
const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
|
||||
INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
|
||||
INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
|
||||
INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
|
||||
BYTE bgrx[4];
|
||||
bgrx[bPos] = CLIP(B);
|
||||
bgrx[gPos] = CLIP(G);
|
||||
bgrx[rPos] = CLIP(R);
|
||||
bgrx[aPos] = 0xFF;
|
||||
*pRGB++ = bgrx[0];
|
||||
*pRGB++ = bgrx[1];
|
||||
*pRGB++ = bgrx[2];
|
||||
*pRGB++ = bgrx[3];
|
||||
}
|
||||
|
||||
pY += srcPad;
|
||||
pCb += srcPad;
|
||||
pCr += srcPad;
|
||||
pRGB += dstPad;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
|
||||
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
|
||||
UINT32 dstStep, UINT32 DstFormat,
|
||||
const prim_size_t* WINPR_RESTRICT roi)
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
|
||||
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
|
||||
|
||||
default:
|
||||
return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
}
|
||||
}
|
||||
|
||||
static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
|
||||
const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
const prim_size_t* WINPR_RESTRICT roi, /* region of interest */
|
||||
uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
|
||||
{
|
||||
UINT32 pad = roi->width % 8;
|
||||
|
||||
for (UINT32 y = 0; y < roi->height; y++)
|
||||
{
|
||||
const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
|
||||
const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
|
||||
const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
|
||||
BYTE* dst = pDst + y * dstStep;
|
||||
|
||||
for (UINT32 x = 0; x < roi->width - pad; x += 8)
|
||||
{
|
||||
int16x8_t r = vld1q_s16(pr);
|
||||
int16x8_t g = vld1q_s16(pg);
|
||||
int16x8_t b = vld1q_s16(pb);
|
||||
uint8x8x4_t bgrx;
|
||||
bgrx.val[aPos] = vdup_n_u8(0xFF);
|
||||
bgrx.val[rPos] = vqmovun_s16(r);
|
||||
bgrx.val[gPos] = vqmovun_s16(g);
|
||||
bgrx.val[bPos] = vqmovun_s16(b);
|
||||
vst4_u8(dst, bgrx);
|
||||
pr += 8;
|
||||
pg += 8;
|
||||
pb += 8;
|
||||
dst += 32;
|
||||
}
|
||||
|
||||
for (UINT32 x = 0; x < pad; x++)
|
||||
{
|
||||
BYTE bgrx[4];
|
||||
bgrx[bPos] = *pb++;
|
||||
bgrx[gPos] = *pg++;
|
||||
bgrx[rPos] = *pr++;
|
||||
bgrx[aPos] = 0xFF;
|
||||
*dst++ = bgrx[0];
|
||||
*dst++ = bgrx[1];
|
||||
*dst++ = bgrx[2];
|
||||
*dst++ = bgrx[3];
|
||||
}
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t
|
||||
neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
UINT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
UINT32 dstStep, /* bytes between rows in dest data */
|
||||
UINT32 DstFormat,
|
||||
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
|
||||
{
|
||||
switch (DstFormat)
|
||||
{
|
||||
case PIXEL_FORMAT_BGRA32:
|
||||
case PIXEL_FORMAT_BGRX32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
|
||||
|
||||
case PIXEL_FORMAT_RGBA32:
|
||||
case PIXEL_FORMAT_RGBX32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
|
||||
|
||||
case PIXEL_FORMAT_ARGB32:
|
||||
case PIXEL_FORMAT_XRGB32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
|
||||
|
||||
case PIXEL_FORMAT_ABGR32:
|
||||
case PIXEL_FORMAT_XBGR32:
|
||||
return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
|
||||
|
||||
default:
|
||||
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
|
||||
}
|
||||
}
|
||||
#endif /* WITH_NEON */
|
||||
/* I don't see a direct IPP version of this, since the input is INT16
|
||||
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
|
||||
* But that would likely be slower.
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_colors_opt(primitives_t* prims)
|
||||
void primitives_init_colors_sse2(primitives_t* prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_colors(prims);
|
||||
#if defined(WITH_SSE2)
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
@ -1578,14 +1259,7 @@ void primitives_init_colors_opt(primitives_t* prims)
|
||||
prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
|
||||
}
|
||||
|
||||
#elif defined(WITH_NEON)
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
|
||||
prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
|
||||
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
|
||||
}
|
||||
|
||||
#endif /* WITH_SSE2 */
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -268,7 +268,7 @@ static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_copy_sse(primitives_t* prims)
|
||||
void primitives_init_copy_sse41(primitives_t* prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
|
@ -26,11 +26,12 @@
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
static primitives_t* generic = NULL;
|
||||
#include "prim_set.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
#ifdef WITH_SSE2
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
|
||||
{
|
||||
BYTE byte = 0;
|
||||
@ -216,14 +217,13 @@ static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_set(prims);
|
||||
/* Pick tuned versions if possible. */
|
||||
|
||||
#if defined(WITH_SSE2)
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->set_8u = sse2_set_8u;
|
||||
@ -231,5 +231,7 @@ void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->set_32u = sse2_set_32u;
|
||||
}
|
||||
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -19,6 +19,8 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#include "prim_shift.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#include <emmintrin.h>
|
||||
#include <pmmintrin.h>
|
||||
@ -27,9 +29,9 @@
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16,
|
||||
*dptr++ = (INT16)((UINT16)*sptr++ << val))
|
||||
@ -142,11 +144,11 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_shift(prims);
|
||||
#if defined(WITH_SSE2)
|
||||
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||
@ -158,5 +160,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->rShiftC_16u = sse2_rShiftC_16u;
|
||||
}
|
||||
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
@ -19,16 +19,18 @@
|
||||
#include <freerdp/primitives.h>
|
||||
#include <winpr/sysinfo.h>
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#include "prim_sign.h"
|
||||
|
||||
#if defined(WITH_SSE2)
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
#endif
|
||||
|
||||
#include "prim_internal.h"
|
||||
|
||||
#if defined(WITH_SSE2)
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
|
||||
UINT32 len)
|
||||
@ -167,13 +169,13 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
#if defined(WITH_SSE2)
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_sign(prims);
|
||||
/* Pick tuned versions if possible. */
|
||||
/* I didn't spot an IPP version of this. */
|
||||
#if defined(WITH_SSE2)
|
||||
|
||||
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||
@ -181,5 +183,7 @@ void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
prims->sign_16s = ssse3_sign_16s;
|
||||
}
|
||||
|
||||
#else
|
||||
WINPR_UNUSED(prims);
|
||||
#endif
|
||||
}
|
Loading…
Reference in New Issue
Block a user