diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt index 8f56fe936..7526f7ace 100644 --- a/libfreerdp/primitives/CMakeLists.txt +++ b/libfreerdp/primitives/CMakeLists.txt @@ -2,63 +2,86 @@ set(PRIMITIVES_SRCS prim_add.c + prim_add.h prim_andor.c + prim_andor.h prim_alphaComp.c + prim_alphaComp.h prim_colors.c + prim_colors.h prim_copy.c prim_copy.h prim_set.c + prim_set.h prim_shift.c + prim_shift.h prim_sign.c + prim_sign.h prim_YUV.c + prim_YUV.h prim_YCoCg.c + prim_YCoCg.h primitives.c prim_internal.h) -if (WITH_SSE2 OR WITH_NEON) - set(PRIMITIVES_SSE2_SRCS - prim_colors_opt.c - prim_copy_sse.c - prim_copy_avx2.c - prim_set_opt.c) +set(PRIMITIVES_SSE2_SRCS + sse/prim_colors_sse2.c + sse/prim_set_sse2.c + ) - set(PRIMITIVES_SSE3_SRCS - prim_add_opt.c - prim_alphaComp_opt.c - prim_andor_opt.c - prim_shift_opt.c) +set(PRIMITIVES_SSE3_SRCS + sse/prim_add_sse3.c + sse/prim_alphaComp_sse3.c + sse/prim_andor_sse3.c + sse/prim_shift_sse3.c + ) - set(PRIMITIVES_SSSE3_SRCS - prim_sign_opt.c - prim_YCoCg_opt.c) +set(PRIMITIVES_SSSE3_SRCS + sse/prim_YUV_ssse3.c + sse/prim_sign_ssse3.c + sse/prim_YCoCg_ssse3.c + ) - if (WITH_SSE2) - set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS} - prim_YUV_ssse3.c) - endif() +set(PRIMITIVES_SSE4_1_SRCS + sse/prim_copy_sse4_1.c + ) - if (WITH_NEON) - set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS} - prim_YUV_neon.c) - endif() -endif() +set(PRIMITIVES_SSE4_2_SRCS + ) + +set(PRIMITIVES_AVX2_SRCS + sse/prim_copy_avx2.c + ) + +set(PRIMITIVES_NEON_SRCS + neon/prim_colors_neon.c + neon/prim_YCoCg_neon.c + neon/prim_YUV_neon.c + ) + +set(PRIMITIVES_OPENCL_SRCS + opencl/prim_YUV_opencl.c +) if (WITH_OPENCL) - set(PRIMITIVES_OPENCL_SRCS prim_YUV_opencl.c) - freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS}) freerdp_library_add(OpenCL::OpenCL) endif() set(PRIMITIVES_OPT_SRCS + ${PRIMITIVES_NEON_SRCS} ${PRIMITIVES_SSE2_SRCS} ${PRIMITIVES_SSE3_SRCS} ${PRIMITIVES_SSSE3_SRCS} + ${PRIMITIVES_SSE4_1_SRCS} + ${PRIMITIVES_SSE4_2_SRCS} + ${PRIMITIVES_AVX2_SRCS} ${PRIMITIVES_OPENCL_SRCS}) set(PRIMITIVES_SRCS ${PRIMITIVES_SRCS} ${PRIMITIVES_OPT_SRCS}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_library(freerdp-primitives OBJECT ${PRIMITIVES_SRCS} ) @@ -74,8 +97,15 @@ if(WITH_SSE2) if (PRIMITIVES_SSSE3_SRCS) set_source_files_properties(${PRIMITIVES_SSSE3_SRCS} PROPERTIES COMPILE_FLAGS "-mssse3" ) endif() - set_source_files_properties(prim_copy_sse.c PROPERTIES COMPILE_FLAGS "-msse4.1" ) - set_source_files_properties(prim_copy_avx2.c PROPERTIES COMPILE_FLAGS "-mavx2" ) + if (PRIMITIVES_SSE4_1_SRCS) + set_source_files_properties(${PRIMITIVES_SSE4_1_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.1" ) + endif() + if (PRIMITIVES_SSE4_2_SRCS) + set_source_files_properties(${PRIMITIVES_SSE4_2_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.2" ) + endif() + if (PRIMITIVES_AVX2_SRCS) + set_source_files_properties(${PRIMITIVES_AVX2_SRCS} PROPERTIES COMPILE_FLAGS "-mavx2" ) + endif() endif() if(MSVC) diff --git a/libfreerdp/primitives/neon/prim_YCoCg_neon.c b/libfreerdp/primitives/neon/prim_YCoCg_neon.c new file mode 100644 index 000000000..ff1ff002d --- /dev/null +++ b/libfreerdp/primitives/neon/prim_YCoCg_neon.c @@ -0,0 +1,173 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized YCoCg<->RGB conversion operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2014 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#if defined(WITH_NEON) +#include +#endif + +#include "prim_internal.h" +#include "prim_templates.h" +#include "prim_YCoCg.h" + +#if defined(WITH_NEON) +static primitives_t* generic = NULL; + +static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, + UINT32 width, UINT32 height, UINT8 shift, BYTE bPos, + BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha) +{ + BYTE* dptr = pDst; + const BYTE* sptr = pSrc; + const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); + const int8_t cll = shift - 1; /* -1 builds in the /2's */ + const UINT32 srcPad = srcStep - (width * 4); + const UINT32 dstPad = dstStep - (width * formatSize); + const UINT32 pad = width % 8; + const uint8x8_t aVal = vdup_n_u8(0xFF); + const int8x8_t cllv = vdup_n_s8(cll); + + for (UINT32 y = 0; y < height; y++) + { + for (UINT32 x = 0; x < width - pad; x += 8) + { + /* Note: shifts must be done before sign-conversion. */ + const uint8x8x4_t raw = vld4_u8(sptr); + const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv)); + const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv)); + const int16x8_t Cg = vmovl_s8(CgRaw); + const int16x8_t Co = vmovl_s8(CoRaw); + const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */ + const int16x8_t T = vsubq_s16(Y, Cg); + const int16x8_t R = vaddq_s16(T, Co); + const int16x8_t G = vaddq_s16(Y, Cg); + const int16x8_t B = vsubq_s16(T, Co); + uint8x8x4_t bgrx; + bgrx.val[bPos] = vqmovun_s16(B); + bgrx.val[gPos] = vqmovun_s16(G); + bgrx.val[rPos] = vqmovun_s16(R); + + if (alpha) + bgrx.val[aPos] = raw.val[3]; + else + bgrx.val[aPos] = aVal; + + vst4_u8(dptr, bgrx); + sptr += sizeof(raw); + dptr += sizeof(bgrx); + } + + for (UINT32 x = 0; x < pad; x++) + { + /* Note: shifts must be done before sign-conversion. */ + const INT16 Cg = (INT16)((INT8)((*sptr++) << cll)); + const INT16 Co = (INT16)((INT8)((*sptr++) << cll)); + const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */ + const INT16 T = Y - Cg; + const INT16 R = T + Co; + const INT16 G = Y + Cg; + const INT16 B = T - Co; + BYTE bgra[4]; + bgra[bPos] = CLIP(B); + bgra[gPos] = CLIP(G); + bgra[rPos] = CLIP(R); + bgra[aPos] = *sptr++; + + if (!alpha) + bgra[aPos] = 0xFF; + + *dptr++ = bgra[0]; + *dptr++ = bgra[1]; + *dptr++ = bgra[2]; + *dptr++ = bgra[3]; + } + + sptr += srcPad; + dptr += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, + BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, + UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 2, 1, 0, 3, withAlpha); + + case PIXEL_FORMAT_BGRX32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 2, 1, 0, 3, withAlpha); + + case PIXEL_FORMAT_RGBA32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 0, 1, 2, 3, withAlpha); + + case PIXEL_FORMAT_RGBX32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 0, 1, 2, 3, withAlpha); + + case PIXEL_FORMAT_ARGB32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 1, 2, 3, 0, withAlpha); + + case PIXEL_FORMAT_XRGB32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 1, 2, 3, 0, withAlpha); + + case PIXEL_FORMAT_ABGR32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 3, 2, 1, 0, withAlpha); + + case PIXEL_FORMAT_XBGR32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, + shift, 3, 2, 1, 0, withAlpha); + + default: + return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, + height, shift, withAlpha); + } +} +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims) +{ +#if defined(WITH_NEON) + generic = primitives_get_generic(); + primitives_init_YCoCg(prims); + + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + { + prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R; + } +#else + WINPR_UNUSED(prims); +#endif +} diff --git a/libfreerdp/primitives/prim_YUV_neon.c b/libfreerdp/primitives/neon/prim_YUV_neon.c similarity index 99% rename from libfreerdp/primitives/prim_YUV_neon.c rename to libfreerdp/primitives/neon/prim_YUV_neon.c index 107ced2b2..fd1cafac4 100644 --- a/libfreerdp/primitives/prim_YUV_neon.c +++ b/libfreerdp/primitives/neon/prim_YUV_neon.c @@ -28,11 +28,9 @@ #include #include "prim_internal.h" +#include "prim_YUV.h" -#if !defined(WITH_NEON) -#error "This file must only be included if WITH_NEON is active!" -#endif - +#if defined(WITH_NEON) #include static primitives_t* generic = NULL; @@ -742,9 +740,11 @@ static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type, return -1; } } +#endif -void primitives_init_YUV_opt(primitives_t* prims) +void primitives_init_YUV_neon(primitives_t* prims) { +#if defined(WITH_NEON) generic = primitives_get_generic(); primitives_init_YUV(prims); @@ -754,4 +754,7 @@ void primitives_init_YUV_opt(primitives_t* prims) prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R; prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444; } +#else + WINPR_UNUSED(prims); +#endif } diff --git a/libfreerdp/primitives/neon/prim_colors_neon.c b/libfreerdp/primitives/neon/prim_colors_neon.c new file mode 100644 index 000000000..cf61c4055 --- /dev/null +++ b/libfreerdp/primitives/neon/prim_colors_neon.c @@ -0,0 +1,365 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized Color conversion operations. + * vi:ts=4 sw=4: + * + * Copyright 2011 Stephen Erisman + * Copyright 2011 Norbert Federa + * Copyright 2011 Martin Fleisz + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include + +#include +#include +#include + +#if defined(WITH_NEON) +#include +#endif + +#include "prim_internal.h" +#include "prim_templates.h" +#include "prim_colors.h" + +/*---------------------------------------------------------------------------*/ +#ifdef WITH_NEON +static primitives_t* generic = NULL; + +static pstatus_t +neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep, + INT16* WINPR_RESTRICT pDst[3], INT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + /* TODO: If necessary, check alignments and call the general version. */ + int16x8_t zero = vdupq_n_s16(0); + int16x8_t max = vdupq_n_s16(255); + int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14 + int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14 + int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14 + int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14 + int16x8_t c4096 = vdupq_n_s16(4096); + int16x8_t* y_buf = (int16x8_t*)pSrc[0]; + int16x8_t* cb_buf = (int16x8_t*)pSrc[1]; + int16x8_t* cr_buf = (int16x8_t*)pSrc[2]; + int16x8_t* r_buf = (int16x8_t*)pDst[0]; + int16x8_t* g_buf = (int16x8_t*)pDst[1]; + int16x8_t* b_buf = (int16x8_t*)pDst[2]; + int srcbump = srcStep / sizeof(int16x8_t); + int dstbump = dstStep / sizeof(int16x8_t); + int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t); + + for (int yp = 0; yp < roi->height; ++yp) + { + for (int i = 0; i < imax; i++) + { + /* + In order to use NEON signed 16-bit integer multiplication we need to convert + the floating point factors to signed int without loosing information. + The result of this multiplication is 32 bit and we have a NEON instruction + that returns the hi word of the saturated double. + Thus we will multiply the factors by the highest possible 2^n, take the + upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right + shift by 1 to reverse the doubling) and correct this result by multiplying it + by 2^(16-n). + For the given factors in the conversion matrix the best possible n is 14. + + Example for calculating r: + r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula + r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + /* y = (y_buf[i] + 4096) >> 2 */ + int16x8_t y = vld1q_s16((INT16*)&y_buf[i]); + y = vaddq_s16(y, c4096); + y = vshrq_n_s16(y, 2); + /* cb = cb_buf[i]; */ + int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]); + /* cr = cr_buf[i]; */ + int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]); + /* (y + HIWORD(cr*22986)) >> 3 */ + int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1)); + r = vshrq_n_s16(r, 3); + /* r_buf[i] = CLIP(r); */ + r = vminq_s16(vmaxq_s16(r, zero), max); + vst1q_s16((INT16*)&r_buf[i], r); + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1)); + g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1)); + g = vshrq_n_s16(g, 3); + /* g_buf[i] = CLIP(g); */ + g = vminq_s16(vmaxq_s16(g, zero), max); + vst1q_s16((INT16*)&g_buf[i], g); + /* (y + HIWORD(cb*28999)) >> 3 */ + int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1)); + b = vshrq_n_s16(b, 3); + /* b_buf[i] = CLIP(b); */ + b = vminq_s16(vmaxq_s16(b, zero), max); + vst1q_s16((INT16*)&b_buf[i], b); + } + + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3], + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, + const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos, + uint8_t gPos, uint8_t bPos, uint8_t aPos) +{ + BYTE* pRGB = pDst; + const INT16* pY = pSrc[0]; + const INT16* pCb = pSrc[1]; + const INT16* pCr = pSrc[2]; + const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16); + const size_t dstPad = (dstStep - (roi->width * 4)) / 4; + const size_t pad = roi->width % 8; + const int16x4_t c4096 = vdup_n_s16(4096); + + for (UINT32 y = 0; y < roi->height; y++) + { + for (UINT32 x = 0; x < roi->width - pad; x += 8) + { + const int16x8_t Y = vld1q_s16(pY); + const int16x4_t Yh = vget_high_s16(Y); + const int16x4_t Yl = vget_low_s16(Y); + const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */ + const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */ + const int32x4_t YhW = vshlq_n_s32(YhAdd, 16); + const int32x4_t YlW = vshlq_n_s32(YlAdd, 16); + const int16x8_t Cr = vld1q_s16(pCr); + const int16x4_t Crh = vget_high_s16(Cr); + const int16x4_t Crl = vget_low_s16(Cr); + const int16x8_t Cb = vld1q_s16(pCb); + const int16x4_t Cbh = vget_high_s16(Cb); + const int16x4_t Cbl = vget_low_s16(Cb); + uint8x8x4_t bgrx; + { + /* R */ + const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */ + const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */ + const int32x4_t CrhRa = vaddq_s32(CrhR, YhW); + const int32x4_t CrlRa = vaddq_s32(CrlR, YlW); + const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21)); + const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21)); + const int16x8_t Rs = vcombine_s16(Rsl, Rsh); + bgrx.val[rPos] = vqmovun_s16(Rs); + } + { + /* G */ + const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */ + const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */ + const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */ + const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */ + const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh); + const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl); + const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh); + const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl); + const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21)); + const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21)); + const int16x8_t Gs = vcombine_s16(Gsl, Gsh); + const uint8x8_t G = vqmovun_s16(Gs); + bgrx.val[gPos] = G; + } + { + /* B */ + const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */ + const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */ + const int32x4_t YCbBh = vaddq_s32(CbBh, YhW); + const int32x4_t YCbBl = vaddq_s32(CbBl, YlW); + const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21)); + const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21)); + const int16x8_t Bs = vcombine_s16(Bsl, Bsh); + const uint8x8_t B = vqmovun_s16(Bs); + bgrx.val[bPos] = B; + } + /* A */ + { + bgrx.val[aPos] = vdup_n_u8(0xFF); + } + vst4_u8(pRGB, bgrx); + pY += 8; + pCb += 8; + pCr += 8; + pRGB += 32; + } + + for (UINT32 x = 0; x < pad; x++) + { + const INT32 divisor = 16; + const INT32 Y = ((*pY++) + 4096) << divisor; + const INT32 Cb = (*pCb++); + const INT32 Cr = (*pCr++); + const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor)); + const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor)); + const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor)); + const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor)); + INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5); + INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5); + INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5); + BYTE bgrx[4]; + bgrx[bPos] = CLIP(B); + bgrx[gPos] = CLIP(G); + bgrx[rPos] = CLIP(R); + bgrx[aPos] = 0xFF; + *pRGB++ = bgrx[0]; + *pRGB++ = bgrx[1]; + *pRGB++ = bgrx[2]; + *pRGB++ = bgrx[3]; + } + + pY += srcPad; + pCb += srcPad; + pCr += srcPad; + pRGB += dstPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], + UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, + UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X( + const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + const prim_size_t* WINPR_RESTRICT roi, /* region of interest */ + uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos) +{ + UINT32 pad = roi->width % 8; + + for (UINT32 y = 0; y < roi->height; y++) + { + const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep); + const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep); + const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep); + BYTE* dst = pDst + y * dstStep; + + for (UINT32 x = 0; x < roi->width - pad; x += 8) + { + int16x8_t r = vld1q_s16(pr); + int16x8_t g = vld1q_s16(pg); + int16x8_t b = vld1q_s16(pb); + uint8x8x4_t bgrx; + bgrx.val[aPos] = vdup_n_u8(0xFF); + bgrx.val[rPos] = vqmovun_s16(r); + bgrx.val[gPos] = vqmovun_s16(g); + bgrx.val[bPos] = vqmovun_s16(b); + vst4_u8(dst, bgrx); + pr += 8; + pg += 8; + pb += 8; + dst += 32; + } + + for (UINT32 x = 0; x < pad; x++) + { + BYTE bgrx[4]; + bgrx[bPos] = *pb++; + bgrx[gPos] = *pg++; + bgrx[rPos] = *pr++; + bgrx[aPos] = 0xFF; + *dst++ = bgrx[0]; + *dst++ = bgrx[1]; + *dst++ = bgrx[2]; + *dst++ = bgrx[3]; + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t +neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ + BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + UINT32 dstStep, /* bytes between rows in dest data */ + UINT32 DstFormat, + const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} +#endif /* WITH_NEON */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_colors_neon(primitives_t* prims) +{ +#if defined(WITH_NEON) + generic = primitives_get_generic(); + primitives_init_colors(prims); + + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + { + prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3; + } +#else + WINPR_UNUSED(prims); +#endif +} diff --git a/libfreerdp/primitives/prim_YUV_opencl.c b/libfreerdp/primitives/opencl/prim_YUV_opencl.c similarity index 99% rename from libfreerdp/primitives/prim_YUV_opencl.c rename to libfreerdp/primitives/opencl/prim_YUV_opencl.c index 2ca1b31d8..304d44d96 100644 --- a/libfreerdp/primitives/prim_YUV_opencl.c +++ b/libfreerdp/primitives/opencl/prim_YUV_opencl.c @@ -30,7 +30,6 @@ #else #include #endif -#endif #include #define TAG FREERDP_TAG("primitives") @@ -481,9 +480,11 @@ static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT p return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi); } +#endif BOOL primitives_init_opencl(primitives_t* prims) { +#if defined(WITH_OPENCL) primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU); if (!prims || !p) return FALSE; @@ -496,5 +497,6 @@ BOOL primitives_init_opencl(primitives_t* prims) prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R; prims->flags |= PRIM_FLAGS_HAVE_EXTGPU; prims->uninit = primitives_uninit_opencl; +#endif return TRUE; } diff --git a/libfreerdp/primitives/primitives.cl b/libfreerdp/primitives/opencl/primitives.cl similarity index 100% rename from libfreerdp/primitives/primitives.cl rename to libfreerdp/primitives/opencl/primitives.cl diff --git a/libfreerdp/primitives/prim_YCoCg.c b/libfreerdp/primitives/prim_YCoCg.c index 7c1a429f9..4a36f1e9e 100644 --- a/libfreerdp/primitives/prim_YCoCg.c +++ b/libfreerdp/primitives/prim_YCoCg.c @@ -23,6 +23,7 @@ #include #include "prim_internal.h" +#include "prim_YCoCg.h" /* helper function to convert raw 8 bit values to signed 16bit values. */ @@ -71,3 +72,9 @@ void primitives_init_YCoCg(primitives_t* prims) { prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R; } + +void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_YCoCg_ssse3(prims); + primitives_init_YCoCg_neon(prims); +} diff --git a/libfreerdp/primitives/prim_YCoCg.h b/libfreerdp/primitives/prim_YCoCg.h new file mode 100644 index 000000000..bd878d5b6 --- /dev/null +++ b/libfreerdp/primitives/prim_YCoCg.h @@ -0,0 +1,31 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_YCoCg_H +#define FREERDP_LIB_PRIM_YCoCg_H + +#include +#include +#include + +void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims); +void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims); + +#endif diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c index ec021399a..e0d6d5812 100644 --- a/libfreerdp/primitives/prim_YUV.c +++ b/libfreerdp/primitives/prim_YUV.c @@ -29,6 +29,7 @@ #include #include #include "prim_internal.h" +#include "prim_YUV.h" static pstatus_t general_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3], const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3], @@ -1875,3 +1876,9 @@ void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims) prims->RGBToAVC444YUV = general_RGBToAVC444YUV; prims->RGBToAVC444YUVv2 = general_RGBToAVC444YUVv2; } + +void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_YUV_ssse3(prims); + primitives_init_YUV_neon(prims); +} diff --git a/libfreerdp/primitives/prim_YUV.h b/libfreerdp/primitives/prim_YUV.h new file mode 100644 index 000000000..0f2f12a19 --- /dev/null +++ b/libfreerdp/primitives/prim_YUV.h @@ -0,0 +1,31 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_YUV_H +#define FREERDP_LIB_PRIM_YUV_H + +#include +#include +#include + +void primitives_init_YUV_ssse3(primitives_t* prims); +void primitives_init_YUV_neon(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c index 6a9a9994d..768419c8f 100644 --- a/libfreerdp/primitives/prim_add.c +++ b/libfreerdp/primitives/prim_add.c @@ -22,6 +22,7 @@ #include #include "prim_internal.h" +#include "prim_add.h" /* ---------------------------------------------------------------------------- * 16-bit signed add with saturation (under and over). @@ -74,3 +75,8 @@ void primitives_init_add(primitives_t* prims) prims->add_16s = general_add_16s; prims->add_16s_inplace = general_add_16s_inplace; } + +void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_add_sse3(prims); +} diff --git a/libfreerdp/primitives/prim_add.h b/libfreerdp/primitives/prim_add.h new file mode 100644 index 000000000..1c151f0f1 --- /dev/null +++ b/libfreerdp/primitives/prim_add.h @@ -0,0 +1,30 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_ADD_H +#define FREERDP_LIB_PRIM_ADD_H + +#include +#include +#include + +void primitives_init_add_sse3(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c index ae4917d88..5657d70f2 100644 --- a/libfreerdp/primitives/prim_alphaComp.c +++ b/libfreerdp/primitives/prim_alphaComp.c @@ -25,6 +25,7 @@ #include #include "prim_internal.h" +#include "prim_alphaComp.h" #define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24) #define RED(_k_) (((_k_)&0x00FF0000U) >> 16) @@ -91,3 +92,8 @@ void primitives_init_alphaComp(primitives_t* prims) { prims->alphaComp_argb = general_alphaComp_argb; } + +void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_alphaComp_sse3(prims); +} diff --git a/libfreerdp/primitives/prim_alphaComp.h b/libfreerdp/primitives/prim_alphaComp.h new file mode 100644 index 000000000..efd6a58f0 --- /dev/null +++ b/libfreerdp/primitives/prim_alphaComp.h @@ -0,0 +1,30 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_ALPHA_COMP_H +#define FREERDP_LIB_PRIM_ALPHA_COMP_H + +#include +#include +#include + +void primitives_init_alphaComp_sse3(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_andor.c b/libfreerdp/primitives/prim_andor.c index 92165464b..d4f8d821d 100644 --- a/libfreerdp/primitives/prim_andor.c +++ b/libfreerdp/primitives/prim_andor.c @@ -19,6 +19,7 @@ #include #include "prim_internal.h" +#include "prim_andor.h" /* ---------------------------------------------------------------------------- * 32-bit AND with a constant. @@ -55,3 +56,8 @@ void primitives_init_andor(primitives_t* prims) prims->andC_32u = general_andC_32u; prims->orC_32u = general_orC_32u; } + +void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_andor_sse3(prims); +} diff --git a/libfreerdp/primitives/prim_andor.h b/libfreerdp/primitives/prim_andor.h new file mode 100644 index 000000000..6ec37e4af --- /dev/null +++ b/libfreerdp/primitives/prim_andor.h @@ -0,0 +1,30 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_ANDOR_H +#define FREERDP_LIB_PRIM_ANDOR_H + +#include +#include +#include + +void primitives_init_andor_sse3(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c index 4a23129a6..4448a3319 100644 --- a/libfreerdp/primitives/prim_colors.c +++ b/libfreerdp/primitives/prim_colors.c @@ -24,6 +24,7 @@ #include #include "prim_internal.h" +#include "prim_colors.h" #ifndef MINMAX #define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_))) @@ -507,3 +508,10 @@ void primitives_init_colors(primitives_t* prims) prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3; prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R; } + +/* ------------------------------------------------------------------------- */ +void primitives_init_colors_opt(primitives_t* prims) +{ + primitives_init_colors_sse2(prims); + primitives_init_colors_neon(prims); +} diff --git a/libfreerdp/primitives/prim_colors.h b/libfreerdp/primitives/prim_colors.h new file mode 100644 index 000000000..65bbd43cf --- /dev/null +++ b/libfreerdp/primitives/prim_colors.h @@ -0,0 +1,31 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives colors + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_COLORS_H +#define FREERDP_LIB_PRIM_COLORS_H + +#include +#include +#include + +void primitives_init_colors_sse2(primitives_t* prims); +void primitives_init_colors_neon(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_copy.c b/libfreerdp/primitives/prim_copy.c index 9021399eb..195d289bd 100644 --- a/libfreerdp/primitives/prim_copy.c +++ b/libfreerdp/primitives/prim_copy.c @@ -392,22 +392,8 @@ void primitives_init_copy(primitives_t* prims) prims->copy_no_overlap = generic_image_copy_no_overlap; } -#if defined(WITH_SSE2) || defined(WITH_NEON) void primitives_init_copy_opt(primitives_t* prims) { - generic = primitives_get_generic(); - primitives_init_copy(prims); - /* Pick tuned versions if possible. */ - /* Performance with an SSE2 version with no prefetch seemed to be - * all over the map vs. memcpy. - * Sometimes it was significantly faster, sometimes dreadfully slower, - * and it seemed to vary a lot depending on block size and processor. - * Hence, no SSE version is used here unless once can be written that - * is consistently faster than memcpy. - */ - /* This is just an alias with void* parameters */ - prims->copy = (__copy_t)(prims->copy_8u); - primitives_init_copy_sse(prims); + primitives_init_copy_sse41(prims); primitives_init_copy_avx2(prims); } -#endif diff --git a/libfreerdp/primitives/prim_copy.h b/libfreerdp/primitives/prim_copy.h index 18b927d0d..ace7f925f 100644 --- a/libfreerdp/primitives/prim_copy.h +++ b/libfreerdp/primitives/prim_copy.h @@ -22,6 +22,7 @@ #define FREERDP_LIB_PRIM_COPY_H #include +#include #include pstatus_t generic_image_copy_no_overlap_convert( @@ -37,6 +38,7 @@ pstatus_t generic_image_copy_no_overlap_memcpy( SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset, UINT32 flags); -extern void primitives_init_copy_sse(primitives_t* prims); -extern void primitives_init_copy_avx2(primitives_t* prims); +void primitives_init_copy_sse41(primitives_t* prims); +void primitives_init_copy_avx2(primitives_t* prims); + #endif diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h index cf5c12467..b3a2a5cf0 100644 --- a/libfreerdp/primitives/prim_internal.h +++ b/libfreerdp/primitives/prim_internal.h @@ -275,7 +275,6 @@ FREERDP_LOCAL void primitives_init_colors(primitives_t* prims); FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* prims); FREERDP_LOCAL void primitives_init_YUV(primitives_t* prims); -#if defined(WITH_SSE2) || defined(WITH_NEON) FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* prims); FREERDP_LOCAL void primitives_init_set_opt(primitives_t* prims); FREERDP_LOCAL void primitives_init_add_opt(primitives_t* prims); @@ -286,7 +285,6 @@ FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* prims); FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* prims); FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* prims); FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* prims); -#endif #if defined(WITH_OPENCL) FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* prims); diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c index c4012e645..3735e4836 100644 --- a/libfreerdp/primitives/prim_set.c +++ b/libfreerdp/primitives/prim_set.c @@ -22,6 +22,7 @@ #include #include "prim_internal.h" +#include "prim_set.h" /* ========================================================================= */ static pstatus_t general_set_8u(BYTE val, BYTE* pDst, UINT32 len) @@ -120,3 +121,8 @@ void primitives_init_set(primitives_t* prims) prims->set_32u = general_set_32u; prims->zero = general_zero; } + +void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_set_sse2(prims); +} diff --git a/libfreerdp/primitives/prim_set.h b/libfreerdp/primitives/prim_set.h new file mode 100644 index 000000000..81f81243d --- /dev/null +++ b/libfreerdp/primitives/prim_set.h @@ -0,0 +1,30 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_SET_H +#define FREERDP_LIB_PRIM_SET_H + +#include +#include +#include + +void primitives_init_set_sse2(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c index 3677fd113..9af34fde0 100644 --- a/libfreerdp/primitives/prim_shift.c +++ b/libfreerdp/primitives/prim_shift.c @@ -19,6 +19,8 @@ #include #include "prim_internal.h" +#include "prim_shift.h" + /* ------------------------------------------------------------------------- */ static INLINE INT16 shift(INT16 val, UINT32 sh) { @@ -133,3 +135,8 @@ void primitives_init_shift(primitives_t* prims) prims->shiftC_16s = general_shiftC_16s; prims->shiftC_16u = general_shiftC_16u; } + +void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_shift_sse3(prims); +} diff --git a/libfreerdp/primitives/prim_shift.h b/libfreerdp/primitives/prim_shift.h new file mode 100644 index 000000000..d7cc32324 --- /dev/null +++ b/libfreerdp/primitives/prim_shift.h @@ -0,0 +1,30 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_SHIFT_H +#define FREERDP_LIB_PRIM_SHIFT_H + +#include +#include +#include + +extern void primitives_init_shift_sse3(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c index d89dc474f..efa18850d 100644 --- a/libfreerdp/primitives/prim_sign.c +++ b/libfreerdp/primitives/prim_sign.c @@ -19,6 +19,7 @@ #include #include "prim_internal.h" +#include "prim_sign.h" /* ---------------------------------------------------------------------------- * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1). @@ -40,3 +41,8 @@ void primitives_init_sign(primitives_t* prims) /* Start with the default. */ prims->sign_16s = general_sign_16s; } + +void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims) +{ + primitives_init_sign_ssse3(prims); +} diff --git a/libfreerdp/primitives/prim_sign.h b/libfreerdp/primitives/prim_sign.h new file mode 100644 index 000000000..61ac2dbb3 --- /dev/null +++ b/libfreerdp/primitives/prim_sign.h @@ -0,0 +1,30 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Primitives copy + * + * Copyright 2024 Armin Novak + * Copyright 2024 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FREERDP_LIB_PRIM_SIGN_H +#define FREERDP_LIB_PRIM_SIGN_H + +#include +#include +#include + +void primitives_init_sign_ssse3(primitives_t* prims); + +#endif diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c similarity index 74% rename from libfreerdp/primitives/prim_YCoCg_opt.c rename to libfreerdp/primitives/sse/prim_YCoCg_ssse3.c index bba13fac3..8408d50ef 100644 --- a/libfreerdp/primitives/prim_YCoCg_opt.c +++ b/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c @@ -23,19 +23,19 @@ #include #include +#include "prim_YCoCg.h" + #ifdef WITH_SSE2 #include #include -#elif defined(WITH_NEON) -#include -#endif /* WITH_SSE2 else WITH_NEON */ +#endif #include "prim_internal.h" #include "prim_templates.h" +#ifdef WITH_SSE2 static primitives_t* generic = NULL; -#ifdef WITH_SSE2 /* ------------------------------------------------------------------------- */ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, @@ -411,9 +411,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT return PRIMITIVES_SUCCESS; } -#endif /* WITH_SSE2 */ -#ifdef WITH_SSE2 /* ------------------------------------------------------------------------- */ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, @@ -437,153 +435,22 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT3 height, shift, withAlpha); } } -#elif defined(WITH_NEON) -static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, - BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, - UINT32 width, UINT32 height, UINT8 shift, BYTE bPos, - BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha) -{ - BYTE* dptr = pDst; - const BYTE* sptr = pSrc; - const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat); - const int8_t cll = shift - 1; /* -1 builds in the /2's */ - const UINT32 srcPad = srcStep - (width * 4); - const UINT32 dstPad = dstStep - (width * formatSize); - const UINT32 pad = width % 8; - const uint8x8_t aVal = vdup_n_u8(0xFF); - const int8x8_t cllv = vdup_n_s8(cll); - - for (UINT32 y = 0; y < height; y++) - { - for (UINT32 x = 0; x < width - pad; x += 8) - { - /* Note: shifts must be done before sign-conversion. */ - const uint8x8x4_t raw = vld4_u8(sptr); - const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv)); - const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv)); - const int16x8_t Cg = vmovl_s8(CgRaw); - const int16x8_t Co = vmovl_s8(CoRaw); - const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */ - const int16x8_t T = vsubq_s16(Y, Cg); - const int16x8_t R = vaddq_s16(T, Co); - const int16x8_t G = vaddq_s16(Y, Cg); - const int16x8_t B = vsubq_s16(T, Co); - uint8x8x4_t bgrx; - bgrx.val[bPos] = vqmovun_s16(B); - bgrx.val[gPos] = vqmovun_s16(G); - bgrx.val[rPos] = vqmovun_s16(R); - - if (alpha) - bgrx.val[aPos] = raw.val[3]; - else - bgrx.val[aPos] = aVal; - - vst4_u8(dptr, bgrx); - sptr += sizeof(raw); - dptr += sizeof(bgrx); - } - - for (UINT32 x = 0; x < pad; x++) - { - /* Note: shifts must be done before sign-conversion. */ - const INT16 Cg = (INT16)((INT8)((*sptr++) << cll)); - const INT16 Co = (INT16)((INT8)((*sptr++) << cll)); - const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */ - const INT16 T = Y - Cg; - const INT16 R = T + Co; - const INT16 G = Y + Cg; - const INT16 B = T - Co; - BYTE bgra[4]; - bgra[bPos] = CLIP(B); - bgra[gPos] = CLIP(G); - bgra[rPos] = CLIP(R); - bgra[aPos] = *sptr++; - - if (!alpha) - bgra[aPos] = 0xFF; - - *dptr++ = bgra[0]; - *dptr++ = bgra[1]; - *dptr++ = bgra[2]; - *dptr++ = bgra[3]; - } - - sptr += srcPad; - dptr += dstPad; - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep, - BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep, - UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha) -{ - switch (DstFormat) - { - case PIXEL_FORMAT_BGRA32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 2, 1, 0, 3, withAlpha); - - case PIXEL_FORMAT_BGRX32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 2, 1, 0, 3, withAlpha); - - case PIXEL_FORMAT_RGBA32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 0, 1, 2, 3, withAlpha); - - case PIXEL_FORMAT_RGBX32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 0, 1, 2, 3, withAlpha); - - case PIXEL_FORMAT_ARGB32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 1, 2, 3, 0, withAlpha); - - case PIXEL_FORMAT_XRGB32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 1, 2, 3, 0, withAlpha); - - case PIXEL_FORMAT_ABGR32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 3, 2, 1, 0, withAlpha); - - case PIXEL_FORMAT_XBGR32: - return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, - shift, 3, 2, 1, 0, withAlpha); - - default: - return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, - height, shift, withAlpha); - } -} #endif /* WITH_SSE2 */ /* ------------------------------------------------------------------------- */ -void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_YCoCg(prims); - /* While IPP acknowledges the existence of YCoCg-R, it doesn't currently - * include any routines to work with it, especially with variable shift - * width. - */ -#if defined(WITH_SSE2) if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) { prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; } - -#elif defined(WITH_NEON) - - if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) - { - prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R; - } - -#endif /* WITH_SSE2 */ +#else + WINPR_UNUSED(prims); +#endif } diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/sse/prim_YUV_ssse3.c similarity index 99% rename from libfreerdp/primitives/prim_YUV_ssse3.c rename to libfreerdp/primitives/sse/prim_YUV_ssse3.c index 2fbef3e94..c204b74e5 100644 --- a/libfreerdp/primitives/prim_YUV_ssse3.c +++ b/libfreerdp/primitives/sse/prim_YUV_ssse3.c @@ -29,14 +29,12 @@ #include #include "prim_internal.h" +#include "prim_YUV.h" +#if defined(WITH_SSE2) #include #include -#if !defined(WITH_SSE2) -#error "This file needs WITH_SSE2 enabled!" -#endif - static primitives_t* generic = NULL; /****************************************************************************/ @@ -1496,9 +1494,11 @@ static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type, return -1; } } +#endif -void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_YUV(prims); @@ -1512,4 +1512,7 @@ void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims) prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R; prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444; } +#else + WINPR_UNUSED(prims); +#endif } diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/sse/prim_add_sse3.c similarity index 97% rename from libfreerdp/primitives/prim_add_opt.c rename to libfreerdp/primitives/sse/prim_add_sse3.c index 7274683a4..0a97440a2 100644 --- a/libfreerdp/primitives/prim_add_opt.c +++ b/libfreerdp/primitives/sse/prim_add_sse3.c @@ -20,6 +20,8 @@ #include #include +#include "prim_add.h" + #ifdef WITH_SSE2 #include #include @@ -28,9 +30,9 @@ #include "prim_internal.h" #include "prim_templates.h" +#ifdef WITH_SSE2 static primitives_t* generic = NULL; -#ifdef WITH_SSE2 /* ------------------------------------------------------------------------- */ SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1)) @@ -174,12 +176,12 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1, #endif /* ------------------------------------------------------------------------- */ -void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_add(prims); -#if defined(WITH_SSE2) if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */ { @@ -187,5 +189,7 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims) prims->add_16s_inplace = sse3_add_16s_inplace; } +#else + WINPR_UNUSED(prims); #endif } diff --git a/libfreerdp/primitives/prim_alphaComp_opt.c b/libfreerdp/primitives/sse/prim_alphaComp_sse3.c similarity index 97% rename from libfreerdp/primitives/prim_alphaComp_opt.c rename to libfreerdp/primitives/sse/prim_alphaComp_sse3.c index 5e3ec6fc2..392f9d31b 100644 --- a/libfreerdp/primitives/prim_alphaComp_opt.c +++ b/libfreerdp/primitives/sse/prim_alphaComp_sse3.c @@ -26,6 +26,8 @@ #include #include +#include "prim_alphaComp.h" + #ifdef WITH_SSE2 #include #include @@ -33,10 +35,9 @@ #include "prim_internal.h" -static primitives_t* generic = NULL; - /* ------------------------------------------------------------------------- */ #ifdef WITH_SSE2 +static primitives_t* generic = NULL; static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step, const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step, @@ -208,11 +209,11 @@ static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 sr #endif /* ------------------------------------------------------------------------- */ -void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_alphaComp(prims); -#if defined(WITH_SSE2) if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */ @@ -220,5 +221,7 @@ void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims) prims->alphaComp_argb = sse2_alphaComp_argb; } +#else + WINPR_UNUSED(prims); #endif } diff --git a/libfreerdp/primitives/prim_andor_opt.c b/libfreerdp/primitives/sse/prim_andor_sse3.c similarity index 93% rename from libfreerdp/primitives/prim_andor_opt.c rename to libfreerdp/primitives/sse/prim_andor_sse3.c index 4d8b5916b..57809b2ad 100644 --- a/libfreerdp/primitives/prim_andor_opt.c +++ b/libfreerdp/primitives/sse/prim_andor_sse3.c @@ -19,6 +19,8 @@ #include #include +#include "prim_andor.h" + #ifdef WITH_SSE2 #include #include @@ -27,9 +29,9 @@ #include "prim_internal.h" #include "prim_templates.h" +#ifdef WITH_SSE2 static primitives_t* generic = NULL; -#ifdef WITH_SSE2 /* ------------------------------------------------------------------------- */ SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128, *dptr++ = *sptr++ & val) @@ -37,11 +39,11 @@ SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr #endif /* ------------------------------------------------------------------------- */ -void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_andor(prims); -#if defined(WITH_SSE2) if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) @@ -50,5 +52,7 @@ void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims) prims->orC_32u = sse3_orC_32u; } +#else + WINPR_UNUSED(prims); #endif } diff --git a/libfreerdp/primitives/prim_colors_opt.c b/libfreerdp/primitives/sse/prim_colors_sse2.c similarity index 77% rename from libfreerdp/primitives/prim_colors_opt.c rename to libfreerdp/primitives/sse/prim_colors_sse2.c index 60debc3c6..40eea9ddc 100644 --- a/libfreerdp/primitives/prim_colors_opt.c +++ b/libfreerdp/primitives/sse/prim_colors_sse2.c @@ -23,18 +23,17 @@ #include #include +#include "prim_colors.h" + #ifdef WITH_SSE2 #include -#elif defined(WITH_NEON) -#include -#endif /* WITH_SSE2 else WITH_NEON */ +#endif /* WITH_SSE2 */ #include "prim_internal.h" #include "prim_templates.h" -static primitives_t* generic = NULL; - #ifdef WITH_SSE2 +static primitives_t* generic = NULL; #ifdef __GNUC__ #define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1246,329 +1245,11 @@ sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit } #endif /* WITH_SSE2 */ -/*---------------------------------------------------------------------------*/ -#ifdef WITH_NEON -static pstatus_t -neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep, - INT16* WINPR_RESTRICT pDst[3], INT32 dstStep, - const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ -{ - /* TODO: If necessary, check alignments and call the general version. */ - int16x8_t zero = vdupq_n_s16(0); - int16x8_t max = vdupq_n_s16(255); - int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14 - int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14 - int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14 - int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14 - int16x8_t c4096 = vdupq_n_s16(4096); - int16x8_t* y_buf = (int16x8_t*)pSrc[0]; - int16x8_t* cb_buf = (int16x8_t*)pSrc[1]; - int16x8_t* cr_buf = (int16x8_t*)pSrc[2]; - int16x8_t* r_buf = (int16x8_t*)pDst[0]; - int16x8_t* g_buf = (int16x8_t*)pDst[1]; - int16x8_t* b_buf = (int16x8_t*)pDst[2]; - int srcbump = srcStep / sizeof(int16x8_t); - int dstbump = dstStep / sizeof(int16x8_t); - int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t); - - for (int yp = 0; yp < roi->height; ++yp) - { - for (int i = 0; i < imax; i++) - { - /* - In order to use NEON signed 16-bit integer multiplication we need to convert - the floating point factors to signed int without loosing information. - The result of this multiplication is 32 bit and we have a NEON instruction - that returns the hi word of the saturated double. - Thus we will multiply the factors by the highest possible 2^n, take the - upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right - shift by 1 to reverse the doubling) and correct this result by multiplying it - by 2^(16-n). - For the given factors in the conversion matrix the best possible n is 14. - - Example for calculating r: - r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula - r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above - r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification - r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 - */ - /* y = (y_buf[i] + 4096) >> 2 */ - int16x8_t y = vld1q_s16((INT16*)&y_buf[i]); - y = vaddq_s16(y, c4096); - y = vshrq_n_s16(y, 2); - /* cb = cb_buf[i]; */ - int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]); - /* cr = cr_buf[i]; */ - int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]); - /* (y + HIWORD(cr*22986)) >> 3 */ - int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1)); - r = vshrq_n_s16(r, 3); - /* r_buf[i] = CLIP(r); */ - r = vminq_s16(vmaxq_s16(r, zero), max); - vst1q_s16((INT16*)&r_buf[i], r); - /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ - int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1)); - g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1)); - g = vshrq_n_s16(g, 3); - /* g_buf[i] = CLIP(g); */ - g = vminq_s16(vmaxq_s16(g, zero), max); - vst1q_s16((INT16*)&g_buf[i], g); - /* (y + HIWORD(cb*28999)) >> 3 */ - int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1)); - b = vshrq_n_s16(b, 3); - /* b_buf[i] = CLIP(b); */ - b = vminq_s16(vmaxq_s16(b, zero), max); - vst1q_s16((INT16*)&b_buf[i], b); - } - - y_buf += srcbump; - cb_buf += srcbump; - cr_buf += srcbump; - r_buf += dstbump; - g_buf += dstbump; - b_buf += dstbump; - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3], - UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, - UINT32 dstStep, - const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos, - uint8_t gPos, uint8_t bPos, uint8_t aPos) -{ - BYTE* pRGB = pDst; - const INT16* pY = pSrc[0]; - const INT16* pCb = pSrc[1]; - const INT16* pCr = pSrc[2]; - const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16); - const size_t dstPad = (dstStep - (roi->width * 4)) / 4; - const size_t pad = roi->width % 8; - const int16x4_t c4096 = vdup_n_s16(4096); - - for (UINT32 y = 0; y < roi->height; y++) - { - for (UINT32 x = 0; x < roi->width - pad; x += 8) - { - const int16x8_t Y = vld1q_s16(pY); - const int16x4_t Yh = vget_high_s16(Y); - const int16x4_t Yl = vget_low_s16(Y); - const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */ - const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */ - const int32x4_t YhW = vshlq_n_s32(YhAdd, 16); - const int32x4_t YlW = vshlq_n_s32(YlAdd, 16); - const int16x8_t Cr = vld1q_s16(pCr); - const int16x4_t Crh = vget_high_s16(Cr); - const int16x4_t Crl = vget_low_s16(Cr); - const int16x8_t Cb = vld1q_s16(pCb); - const int16x4_t Cbh = vget_high_s16(Cb); - const int16x4_t Cbl = vget_low_s16(Cb); - uint8x8x4_t bgrx; - { - /* R */ - const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */ - const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */ - const int32x4_t CrhRa = vaddq_s32(CrhR, YhW); - const int32x4_t CrlRa = vaddq_s32(CrlR, YlW); - const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21)); - const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21)); - const int16x8_t Rs = vcombine_s16(Rsl, Rsh); - bgrx.val[rPos] = vqmovun_s16(Rs); - } - { - /* G */ - const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */ - const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */ - const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */ - const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */ - const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh); - const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl); - const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh); - const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl); - const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21)); - const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21)); - const int16x8_t Gs = vcombine_s16(Gsl, Gsh); - const uint8x8_t G = vqmovun_s16(Gs); - bgrx.val[gPos] = G; - } - { - /* B */ - const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */ - const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */ - const int32x4_t YCbBh = vaddq_s32(CbBh, YhW); - const int32x4_t YCbBl = vaddq_s32(CbBl, YlW); - const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21)); - const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21)); - const int16x8_t Bs = vcombine_s16(Bsl, Bsh); - const uint8x8_t B = vqmovun_s16(Bs); - bgrx.val[bPos] = B; - } - /* A */ - { - bgrx.val[aPos] = vdup_n_u8(0xFF); - } - vst4_u8(pRGB, bgrx); - pY += 8; - pCb += 8; - pCr += 8; - pRGB += 32; - } - - for (UINT32 x = 0; x < pad; x++) - { - const INT32 divisor = 16; - const INT32 Y = ((*pY++) + 4096) << divisor; - const INT32 Cb = (*pCb++); - const INT32 Cr = (*pCr++); - const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor)); - const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor)); - const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor)); - const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor)); - INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5); - INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5); - INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5); - BYTE bgrx[4]; - bgrx[bPos] = CLIP(B); - bgrx[gPos] = CLIP(G); - bgrx[rPos] = CLIP(R); - bgrx[aPos] = 0xFF; - *pRGB++ = bgrx[0]; - *pRGB++ = bgrx[1]; - *pRGB++ = bgrx[2]; - *pRGB++ = bgrx[3]; - } - - pY += srcPad; - pCb += srcPad; - pCr += srcPad; - pRGB += dstPad; - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], - UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, - UINT32 dstStep, UINT32 DstFormat, - const prim_size_t* WINPR_RESTRICT roi) -{ - switch (DstFormat) - { - case PIXEL_FORMAT_BGRA32: - case PIXEL_FORMAT_BGRX32: - return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); - - case PIXEL_FORMAT_RGBA32: - case PIXEL_FORMAT_RGBX32: - return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); - - case PIXEL_FORMAT_ARGB32: - case PIXEL_FORMAT_XRGB32: - return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); - - case PIXEL_FORMAT_ABGR32: - case PIXEL_FORMAT_XBGR32: - return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); - - default: - return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); - } -} - -static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X( - const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ - UINT32 srcStep, /* bytes between rows in source data */ - BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ - UINT32 dstStep, /* bytes between rows in dest data */ - const prim_size_t* WINPR_RESTRICT roi, /* region of interest */ - uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos) -{ - UINT32 pad = roi->width % 8; - - for (UINT32 y = 0; y < roi->height; y++) - { - const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep); - const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep); - const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep); - BYTE* dst = pDst + y * dstStep; - - for (UINT32 x = 0; x < roi->width - pad; x += 8) - { - int16x8_t r = vld1q_s16(pr); - int16x8_t g = vld1q_s16(pg); - int16x8_t b = vld1q_s16(pb); - uint8x8x4_t bgrx; - bgrx.val[aPos] = vdup_n_u8(0xFF); - bgrx.val[rPos] = vqmovun_s16(r); - bgrx.val[gPos] = vqmovun_s16(g); - bgrx.val[bPos] = vqmovun_s16(b); - vst4_u8(dst, bgrx); - pr += 8; - pg += 8; - pb += 8; - dst += 32; - } - - for (UINT32 x = 0; x < pad; x++) - { - BYTE bgrx[4]; - bgrx[bPos] = *pb++; - bgrx[gPos] = *pg++; - bgrx[rPos] = *pr++; - bgrx[aPos] = 0xFF; - *dst++ = bgrx[0]; - *dst++ = bgrx[1]; - *dst++ = bgrx[2]; - *dst++ = bgrx[3]; - } - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t -neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */ - UINT32 srcStep, /* bytes between rows in source data */ - BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */ - UINT32 dstStep, /* bytes between rows in dest data */ - UINT32 DstFormat, - const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ -{ - switch (DstFormat) - { - case PIXEL_FORMAT_BGRA32: - case PIXEL_FORMAT_BGRX32: - return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); - - case PIXEL_FORMAT_RGBA32: - case PIXEL_FORMAT_RGBX32: - return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); - - case PIXEL_FORMAT_ARGB32: - case PIXEL_FORMAT_XRGB32: - return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); - - case PIXEL_FORMAT_ABGR32: - case PIXEL_FORMAT_XBGR32: - return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); - - default: - return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); - } -} -#endif /* WITH_NEON */ -/* I don't see a direct IPP version of this, since the input is INT16 - * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_. - * But that would likely be slower. - */ - -/* ------------------------------------------------------------------------- */ -void primitives_init_colors_opt(primitives_t* prims) +void primitives_init_colors_sse2(primitives_t* prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_colors(prims); -#if defined(WITH_SSE2) if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) { @@ -1578,14 +1259,7 @@ void primitives_init_colors_opt(primitives_t* prims) prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3; } -#elif defined(WITH_NEON) - - if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) - { - prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R; - prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R; - prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3; - } - -#endif /* WITH_SSE2 */ +#else + WINPR_UNUSED(prims); +#endif } diff --git a/libfreerdp/primitives/prim_copy_avx2.c b/libfreerdp/primitives/sse/prim_copy_avx2.c similarity index 100% rename from libfreerdp/primitives/prim_copy_avx2.c rename to libfreerdp/primitives/sse/prim_copy_avx2.c diff --git a/libfreerdp/primitives/prim_copy_sse.c b/libfreerdp/primitives/sse/prim_copy_sse4_1.c similarity index 99% rename from libfreerdp/primitives/prim_copy_sse.c rename to libfreerdp/primitives/sse/prim_copy_sse4_1.c index c2d102e78..d073928a3 100644 --- a/libfreerdp/primitives/prim_copy_sse.c +++ b/libfreerdp/primitives/sse/prim_copy_sse4_1.c @@ -268,7 +268,7 @@ static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD #endif /* ------------------------------------------------------------------------- */ -void primitives_init_copy_sse(primitives_t* prims) +void primitives_init_copy_sse41(primitives_t* prims) { #if defined(WITH_SSE2) if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE)) diff --git a/libfreerdp/primitives/prim_set_opt.c b/libfreerdp/primitives/sse/prim_set_sse2.c similarity index 97% rename from libfreerdp/primitives/prim_set_opt.c rename to libfreerdp/primitives/sse/prim_set_sse2.c index f77cb310d..b4c1949f0 100644 --- a/libfreerdp/primitives/prim_set_opt.c +++ b/libfreerdp/primitives/sse/prim_set_sse2.c @@ -26,11 +26,12 @@ #endif /* WITH_SSE2 */ #include "prim_internal.h" - -static primitives_t* generic = NULL; +#include "prim_set.h" /* ========================================================================= */ #ifdef WITH_SSE2 +static primitives_t* generic = NULL; + static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len) { BYTE byte = 0; @@ -216,14 +217,13 @@ static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len) #endif /* WITH_SSE2 */ /* ------------------------------------------------------------------------- */ -void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_set(prims); /* Pick tuned versions if possible. */ -#if defined(WITH_SSE2) - if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) { prims->set_8u = sse2_set_8u; @@ -231,5 +231,7 @@ void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims) prims->set_32u = sse2_set_32u; } +#else + WINPR_UNUSED(prims); #endif } diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/sse/prim_shift_sse3.c similarity index 97% rename from libfreerdp/primitives/prim_shift_opt.c rename to libfreerdp/primitives/sse/prim_shift_sse3.c index a4dd3c6f0..ea50eb6a4 100644 --- a/libfreerdp/primitives/prim_shift_opt.c +++ b/libfreerdp/primitives/sse/prim_shift_sse3.c @@ -19,6 +19,8 @@ #include #include +#include "prim_shift.h" + #ifdef WITH_SSE2 #include #include @@ -27,9 +29,9 @@ #include "prim_internal.h" #include "prim_templates.h" +#ifdef WITH_SSE2 static primitives_t* generic = NULL; -#ifdef WITH_SSE2 /* ------------------------------------------------------------------------- */ SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, *dptr++ = (INT16)((UINT16)*sptr++ << val)) @@ -142,11 +144,11 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 */ /* ------------------------------------------------------------------------- */ -void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_shift(prims); -#if defined(WITH_SSE2) if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) @@ -158,5 +160,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims) prims->rShiftC_16u = sse2_rShiftC_16u; } +#else + WINPR_UNUSED(prims); #endif } diff --git a/libfreerdp/primitives/prim_sign_opt.c b/libfreerdp/primitives/sse/prim_sign_ssse3.c similarity index 96% rename from libfreerdp/primitives/prim_sign_opt.c rename to libfreerdp/primitives/sse/prim_sign_ssse3.c index dae76a6c9..c430c827d 100644 --- a/libfreerdp/primitives/prim_sign_opt.c +++ b/libfreerdp/primitives/sse/prim_sign_ssse3.c @@ -19,16 +19,18 @@ #include #include -#ifdef WITH_SSE2 +#include "prim_sign.h" + +#if defined(WITH_SSE2) #include #include -#endif /* WITH_SSE2 */ +#endif #include "prim_internal.h" +#if defined(WITH_SSE2) static primitives_t* generic = NULL; -#ifdef WITH_SSE2 /* ------------------------------------------------------------------------- */ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst, UINT32 len) @@ -167,13 +169,13 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R #endif /* WITH_SSE2 */ /* ------------------------------------------------------------------------- */ -void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims) +void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims) { +#if defined(WITH_SSE2) generic = primitives_get_generic(); primitives_init_sign(prims); /* Pick tuned versions if possible. */ /* I didn't spot an IPP version of this. */ -#if defined(WITH_SSE2) if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) @@ -181,5 +183,7 @@ void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims) prims->sign_16s = ssse3_sign_16s; } +#else + WINPR_UNUSED(prims); #endif }