diff --git a/CMakeLists.txt b/CMakeLists.txt index aa541170c..3e6098432 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,6 +231,10 @@ set(GSTREAMER_FEATURE_TYPE "RECOMMENDED") set(GSTREAMER_FEATURE_PURPOSE "multimedia") set(GSTREAMER_FEATURE_DESCRIPTION "multimedia redirection, audio and video playback") +set(IPP_FEATURE_TYPE "OPTIONAL") +set(IPP_FEATURE_PURPOSE "performance") +set(IPP_FEATURE_DESCRIPTION "Intel Integrated Performance Primitives library") + if(WIN32) set(X11_FEATURE_TYPE "DISABLED") set(ZLIB_FEATURE_TYPE "DISABLED") @@ -285,6 +289,9 @@ find_feature(PCSC ${PCSC_FEATURE_TYPE} ${PCSC_FEATURE_PURPOSE} ${PCSC_FEATURE_DE find_feature(FFmpeg ${FFMPEG_FEATURE_TYPE} ${FFMPEG_FEATURE_PURPOSE} ${FFMPEG_FEATURE_DESCRIPTION}) find_feature(Gstreamer ${GSTREAMER_FEATURE_TYPE} ${GSTREAMER_FEATURE_PURPOSE} ${GSTREAMER_FEATURE_DESCRIPTION}) +# Intel Performance Primitives +find_feature(IPP ${IPP_FEATURE_TYPE} ${IPP_FEATURE_PURPOSE} ${IPP_FEATURE_DESCRIPTION}) + # Installation Paths if(WIN32) set(CMAKE_INSTALL_BINDIR ".") diff --git a/client/DirectFB/CMakeLists.txt b/client/DirectFB/CMakeLists.txt index bd3637de4..52c07ef69 100644 --- a/client/DirectFB/CMakeLists.txt +++ b/client/DirectFB/CMakeLists.txt @@ -36,7 +36,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-utils) + MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-primitives freerdp-utils) target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS}) install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR}) diff --git a/client/Mac/CMakeLists.txt b/client/Mac/CMakeLists.txt index 8e7a1a0ec..734b66933 100644 --- a/client/Mac/CMakeLists.txt +++ b/client/Mac/CMakeLists.txt @@ -78,7 +78,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-rail freerdp-utils) + MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-primitives freerdp-rail freerdp-utils) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE winpr diff --git a/client/Windows/CMakeLists.txt b/client/Windows/CMakeLists.txt index 312f75db4..92e75a532 100644 --- a/client/Windows/CMakeLists.txt +++ b/client/Windows/CMakeLists.txt @@ -46,7 +46,7 @@ set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-utils) + MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-primitives freerdp-utils) target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS}) diff --git a/client/X11/CMakeLists.txt b/client/X11/CMakeLists.txt index ad93f63cd..735ebeb00 100644 --- a/client/X11/CMakeLists.txt +++ b/client/X11/CMakeLists.txt @@ -120,10 +120,14 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-rail freerdp-utils) + MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-primitives freerdp-rail freerdp-utils) target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS}) +if(WITH_IPP) + target_link_libraries(xfreerdp ${IPP_LIBRARY_LIST}) +endif() + install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR}) set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "Client/X11") diff --git a/cmake/ConfigOptions.cmake b/cmake/ConfigOptions.cmake index 7f7c09be9..c207611a5 100644 --- a/cmake/ConfigOptions.cmake +++ b/cmake/ConfigOptions.cmake @@ -9,6 +9,7 @@ endif() option(WITH_MANPAGES "Generate manpages." ON) option(WITH_PROFILER "Compile profiler." OFF) +option(WITH_IPP "Use Intel Performance Primitives." OFF) if((TARGET_ARCH MATCHES "x86|x64") AND (NOT DEFINED WITH_SSE2)) option(WITH_SSE2 "Enable SSE2 optimization." ON) diff --git a/cmake/FindIPP.cmake b/cmake/FindIPP.cmake index 22e01b979..d202f1af9 100644 --- a/cmake/FindIPP.cmake +++ b/cmake/FindIPP.cmake @@ -265,7 +265,7 @@ if(NOT IPP_FOUND) # Note, if several IPP installations found the newest version will be # selected # ------------------------------------------------------------------------ - foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH}) + foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt) set(curdir ${curdir}/intel) file(TO_CMAKE_PATH ${curdir} CURDIR) @@ -336,3 +336,33 @@ if(WIN32 AND MINGW AND NOT IPP_LATEST_VERSION_MAJOR LESS 7) set(MSV_NTDLL "ntdll") set(IPP_LIBRARIES ${IPP_LIBRARIES} ${MSV_NTDLL}${IPP_LIB_SUFFIX}) endif() + +# ------------------------------------------------------------------------ +# This section will look for the IPP "compiler" dependent library +# libiomp5. +# ------------------------------------------------------------------------ +foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt) + set(curdir ${curdir}/intel) + + if(EXISTS ${curdir}) + file(GLOB_RECURSE liblist FOLLOW_SYMLINKS ${curdir}/libiomp5.*) + foreach(lib ${liblist}) + get_filename_component(libdir ${lib} REALPATH) + get_filename_component(libdir ${libdir} PATH) + set(IPP_COMPILER_LIBRARY_DIRS ${libdir}) + set(IPP_COMPILER_LIBRARIES iomp5) + endforeach(lib) + endif() +endforeach(curdir) + +# ------------------------------------------------------------------------ +# Build fullpath library list. +# ------------------------------------------------------------------------ +find_library(LIB_IPPI ippi PATHS ${IPP_LIBRARY_DIRS}) +set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPI}) +find_library(LIB_IPPS ipps PATHS ${IPP_LIBRARY_DIRS}) +set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPS}) +find_library(LIB_IPPCORE ippcore PATHS ${IPP_LIBRARY_DIRS}) +set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPCORE}) +find_library(LIB_IOMP5 iomp5 PATHS ${IPP_COMPILER_LIBRARY_DIRS}) +set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IOMP5}) diff --git a/config.h.in b/config.h.in index 71b69fba2..1bb2b647f 100644 --- a/config.h.in +++ b/config.h.in @@ -36,6 +36,7 @@ #cmakedefine WITH_PROFILER #cmakedefine WITH_SSE2 #cmakedefine WITH_NEON +#cmakedefine WITH_IPP #cmakedefine WITH_NATIVE_SSPI #cmakedefine WITH_JPEG #cmakedefine WITH_WIN8 diff --git a/include/freerdp/codec/rfx.h b/include/freerdp/codec/rfx.h index 266cb065c..e2e9b6c85 100644 --- a/include/freerdp/codec/rfx.h +++ b/include/freerdp/codec/rfx.h @@ -101,8 +101,6 @@ struct _RFX_CONTEXT BYTE quant_idx_cr; /* routines */ - void (*decode_ycbcr_to_rgb)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf); - void (*encode_rgb_to_ycbcr)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf); void (*quantization_decode)(INT16* buffer, const UINT32* quantization_values); void (*quantization_encode)(INT16* buffer, const UINT32* quantization_values); void (*dwt_2d_decode)(INT16* buffer, INT16* dwt_buffer); diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h new file mode 100644 index 000000000..8da2155d1 --- /dev/null +++ b/include/freerdp/primitives.h @@ -0,0 +1,207 @@ +/* primitives.h + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIMITIVES_H_INCLUDED__ +#define __PRIMITIVES_H_INCLUDED__ + +#include +#include + +typedef INT32 pstatus_t; /* match IppStatus. */ +#define PRIMITIVES_SUCCESS (0) /* match ippStsNoErr */ + +/* Simple macro for address of an x,y location in 2d 4-byte memory block */ +#define PIXMAP4_ADDR(_dst_, _x_, _y_, _span_) \ + ((void *) (((BYTE *) (_dst_)) + (((_x_) + (_y_)*(_span_)) << 2))) + +#define PRIM_X86_MMX_AVAILABLE (1U<<0) +#define PRIM_X86_3DNOW_AVAILABLE (1U<<1) +#define PRIM_X86_3DNOW_PREFETCH_AVAILABLE (1U<<2) +#define PRIM_X86_SSE_AVAILABLE (1U<<3) +#define PRIM_X86_SSE2_AVAILABLE (1U<<4) +#define PRIM_X86_SSE3_AVAILABLE (1U<<5) +#define PRIM_X86_SSSE3_AVAILABLE (1U<<6) +#define PRIM_X86_SSE41_AVAILABLE (1U<<7) +#define PRIM_X86_SSE42_AVAILABLE (1U<<8) +#define PRIM_X86_AVX_AVAILABLE (1U<<9) +#define PRIM_X86_FMA_AVAILABLE (1U<<10) +#define PRIM_X86_AVX_AES_AVAILABLE (1U<<11) +#define PRIM_X86_AVX2_AVAILABLE (1U<<12) + +#define PRIM_ARM_VFP1_AVAILABLE (1U<<0) +#define PRIM_ARM_VFP2_AVAILABLE (1U<<1) +#define PRIM_ARM_VFP3_AVAILABLE (1U<<2) +#define PRIM_ARM_VFP4_AVAILABLE (1U<<3) +#define PRIM_ARM_FPA_AVAILABLE (1U<<4) +#define PRIM_ARM_FPE_AVAILABLE (1U<<5) +#define PRIM_ARM_IWMMXT_AVAILABLE (1U<<6) +#define PRIM_ARM_NEON_AVAILABLE (1U<<7) + +/* Structures compatible with IPP */ +typedef struct +{ + INT32 width; + INT32 height; +} prim_size_t; /* like IppiSize */ + +/* Function prototypes for all of the supported primitives. */ +typedef pstatus_t (*__copy_t)( + const void *pSrc, + void *pDst, + INT32 bytes); +typedef pstatus_t (*__copy_8u_t)( + const BYTE *pSrc, + BYTE *pDst, + INT32 len); +typedef pstatus_t (*__copy_8u_AC4r_t)( + const BYTE *pSrc, + INT32 srcStep, /* bytes */ + BYTE *pDst, + INT32 dstStep, /* bytes */ + INT32 width, INT32 height); /* pixels */ +typedef pstatus_t (*__set_8u_t)( + BYTE val, + BYTE *pDst, + INT32 len); +typedef pstatus_t (*__set_32s_t)( + INT32 val, + INT32 *pDst, + INT32 len); +typedef pstatus_t (*__set_32u_t)( + UINT32 val, + UINT32 *pDst, + INT32 len); +typedef pstatus_t (*__zero_t)( + void *pDst, + size_t bytes); +typedef pstatus_t (*__alphaComp_argb_t)( + const BYTE *pSrc1, INT32 src1Step, + const BYTE *pSrc2, INT32 src2Step, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height); +typedef pstatus_t (*__add_16s_t)( + const INT16 *pSrc1, + const INT16 *pSrc2, + INT16 *pDst, + INT32 len); +typedef pstatus_t (*__lShiftC_16s_t)( + const INT16 *pSrc, + INT32 val, + INT16 *pSrcDst, + INT32 len); +typedef pstatus_t (*__lShiftC_16u_t)( + const UINT16 *pSrc, + INT32 val, + UINT16 *pSrcDst, + INT32 len); +typedef pstatus_t (*__rShiftC_16s_t)( + const INT16 *pSrc, + INT32 val, + INT16 *pSrcDst, + INT32 len); +typedef pstatus_t (*__rShiftC_16u_t)( + const UINT16 *pSrc, + INT32 val, + UINT16 *pSrcDst, + INT32 len); +typedef pstatus_t (*__shiftC_16s_t)( + const INT16 *pSrc, + INT32 val, + INT16 *pSrcDst, + INT32 len); +typedef pstatus_t (*__shiftC_16u_t)( + const UINT16 *pSrc, + INT32 val, + UINT16 *pSrcDst, + INT32 len); +typedef pstatus_t (*__sign_16s_t)( + const INT16 *pSrc, + INT16 *pDst, + INT32 len); +typedef pstatus_t (*__yCbCrToRGB_16s16s_P3P3_t)( + const INT16 *pSrc[3], INT32 srcStep, + INT16 *pDst[3], INT32 dstStep, + const prim_size_t *roi); +typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)( + const INT16 *pSrc[3], INT32 srcStep, + INT16 *pDst[3], INT32 dstStep, + const prim_size_t *roi); +typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)( + const INT16 *pSrc[3], INT32 srcStep, + BYTE *pDst, INT32 dstStep, + const prim_size_t *roi); +typedef pstatus_t (*__andC_32u_t)( + const UINT32 *pSrc, + UINT32 val, + UINT32 *pDst, + INT32 len); +typedef pstatus_t (*__orC_32u_t)( + const UINT32 *pSrc, + UINT32 val, + UINT32 *pDst, + INT32 len); + +typedef struct +{ + /* Memory-to-memory copy routines */ + __copy_t copy; /* memcpy/memmove, basically */ + __copy_8u_t copy_8u; /* more strongly typed */ + __copy_8u_AC4r_t copy_8u_AC4r; /* pixel copy function */ + /* Memory setting routines */ + __set_8u_t set_8u; /* memset, basically */ + __set_32s_t set_32s; + __set_32u_t set_32u; + __zero_t zero; /* bzero or faster */ + /* Arithmetic functions */ + __add_16s_t add_16s; + /* And/or */ + __andC_32u_t andC_32u; + __orC_32u_t orC_32u; + /* Shifts */ + __lShiftC_16s_t lShiftC_16s; + __lShiftC_16u_t lShiftC_16u; + __rShiftC_16s_t rShiftC_16s; + __rShiftC_16u_t rShiftC_16u; + __shiftC_16s_t shiftC_16s; + __shiftC_16u_t shiftC_16u; + /* Alpha Composition */ + __alphaComp_argb_t alphaComp_argb; + /* Sign */ + __sign_16s_t sign_16s; + /* Color conversions */ + __yCbCrToRGB_16s16s_P3P3_t yCbCrToRGB_16s16s_P3P3; + __RGBToYCbCr_16s16s_P3P3_t RGBToYCbCr_16s16s_P3P3; + __RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R; + + /* internal use for CPU flags and such. */ + void *hints; +} primitives_t; + +/* Prototypes for the externally-visible entrypoints. */ +FREERDP_API void primitives_init(void); +FREERDP_API primitives_t *primitives_get(void); +FREERDP_API UINT32 primitives_get_flags( + const primitives_t *prims); +FREERDP_API void primitives_flags_str( + const primitives_t *prims, + char *str, + size_t len); +FREERDP_API void primitives_deinit(void); + +#endif /* !__PRIMITIVES_H_INCLUDED__ */ diff --git a/libfreerdp/CMakeLists.txt b/libfreerdp/CMakeLists.txt index ed6592a27..b8b6adb28 100644 --- a/libfreerdp/CMakeLists.txt +++ b/libfreerdp/CMakeLists.txt @@ -31,6 +31,7 @@ set(${MODULE_PREFIX}_SUBMODULES codec crypto locale + primitives core) foreach(${MODULE_PREFIX}_SUBMODULE ${${MODULE_PREFIX}_SUBMODULES}) diff --git a/libfreerdp/codec/rfx.c b/libfreerdp/codec/rfx.c index 6acf280f8..e30f7f3a3 100644 --- a/libfreerdp/codec/rfx.c +++ b/libfreerdp/codec/rfx.c @@ -79,7 +79,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context) PROFILER_CREATE(context->priv->prof_rfx_differential_decode, "rfx_differential_decode"); PROFILER_CREATE(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode"); PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode"); - PROFILER_CREATE(context->priv->prof_rfx_decode_ycbcr_to_rgb, "rfx_decode_ycbcr_to_rgb"); + PROFILER_CREATE(context->priv->prof_rfx_ycbcr_to_rgb, "prims->yCbCrToRGB"); PROFILER_CREATE(context->priv->prof_rfx_decode_format_rgb, "rfx_decode_format_rgb"); PROFILER_CREATE(context->priv->prof_rfx_encode_rgb, "rfx_encode_rgb"); @@ -88,7 +88,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context) PROFILER_CREATE(context->priv->prof_rfx_differential_encode, "rfx_differential_encode"); PROFILER_CREATE(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode"); PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode"); - PROFILER_CREATE(context->priv->prof_rfx_encode_rgb_to_ycbcr, "rfx_encode_rgb_to_ycbcr"); + PROFILER_CREATE(context->priv->prof_rfx_rgb_to_ycbcr, "prims->RGBToYCbCr"); PROFILER_CREATE(context->priv->prof_rfx_encode_format_rgb, "rfx_encode_format_rgb"); } @@ -100,7 +100,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context) PROFILER_FREE(context->priv->prof_rfx_differential_decode); PROFILER_FREE(context->priv->prof_rfx_quantization_decode); PROFILER_FREE(context->priv->prof_rfx_dwt_2d_decode); - PROFILER_FREE(context->priv->prof_rfx_decode_ycbcr_to_rgb); + PROFILER_FREE(context->priv->prof_rfx_ycbcr_to_rgb); PROFILER_FREE(context->priv->prof_rfx_decode_format_rgb); PROFILER_FREE(context->priv->prof_rfx_encode_rgb); @@ -109,7 +109,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context) PROFILER_FREE(context->priv->prof_rfx_differential_encode); PROFILER_FREE(context->priv->prof_rfx_quantization_encode); PROFILER_FREE(context->priv->prof_rfx_dwt_2d_encode); - PROFILER_FREE(context->priv->prof_rfx_encode_rgb_to_ycbcr); + PROFILER_FREE(context->priv->prof_rfx_rgb_to_ycbcr); PROFILER_FREE(context->priv->prof_rfx_encode_format_rgb); } @@ -123,7 +123,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context) PROFILER_PRINT(context->priv->prof_rfx_differential_decode); PROFILER_PRINT(context->priv->prof_rfx_quantization_decode); PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_decode); - PROFILER_PRINT(context->priv->prof_rfx_decode_ycbcr_to_rgb); + PROFILER_PRINT(context->priv->prof_rfx_ycbcr_to_rgb); PROFILER_PRINT(context->priv->prof_rfx_decode_format_rgb); PROFILER_PRINT(context->priv->prof_rfx_encode_rgb); @@ -132,7 +132,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context) PROFILER_PRINT(context->priv->prof_rfx_differential_encode); PROFILER_PRINT(context->priv->prof_rfx_quantization_encode); PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_encode); - PROFILER_PRINT(context->priv->prof_rfx_encode_rgb_to_ycbcr); + PROFILER_PRINT(context->priv->prof_rfx_rgb_to_ycbcr); PROFILER_PRINT(context->priv->prof_rfx_encode_format_rgb); PROFILER_PRINT_FOOTER; @@ -164,8 +164,6 @@ RFX_CONTEXT* rfx_context_new(void) rfx_profiler_create(context); /* set up default routines */ - context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb; - context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr; context->quantization_decode = rfx_quantization_decode; context->quantization_encode = rfx_quantization_encode; context->dwt_2d_decode = rfx_dwt_2d_decode; @@ -414,7 +412,7 @@ static void rfx_process_message_tile(RFX_CONTEXT* context, RFX_TILE* tile, STREA YLen, context->quants + (quantIdxY * 10), CbLen, context->quants + (quantIdxCb * 10), CrLen, context->quants + (quantIdxCr * 10), - tile->data); + tile->data, 64*sizeof(UINT32)); } static void rfx_process_message_tileset(RFX_CONTEXT* context, RFX_MESSAGE* message, STREAM* s) diff --git a/libfreerdp/codec/rfx_decode.c b/libfreerdp/codec/rfx_decode.c index 2d7774590..02d87d8d4 100644 --- a/libfreerdp/codec/rfx_decode.c +++ b/libfreerdp/codec/rfx_decode.c @@ -27,6 +27,7 @@ #include #include +#include #include "rfx_types.h" #include "rfx_rlgr.h" @@ -36,49 +37,55 @@ #include "rfx_decode.h" +/* stride is bytes between rows in the output buffer. */ static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf, - RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf) + RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf, int stride) { + primitives_t *prims = primitives_get(); INT16* r = r_buf; INT16* g = g_buf; INT16* b = b_buf; + INT16* pSrc[3]; + static const prim_size_t roi_64x64 = { 64, 64 }; BYTE* dst = dst_buf; - int i; + int x, y; switch (pixel_format) { case RDP_PIXEL_FORMAT_B8G8R8A8: - for (i = 0; i < 4096; i++) - { - *dst++ = (BYTE) (*b++); - *dst++ = (BYTE) (*g++); - *dst++ = (BYTE) (*r++); - *dst++ = 0xFF; - } + pSrc[0] = r; pSrc[1] = g; pSrc[2] = b; + prims->RGBToRGB_16s8u_P3AC4R( + (const INT16 **) pSrc, 64*sizeof(INT16), + dst, stride, &roi_64x64); break; case RDP_PIXEL_FORMAT_R8G8B8A8: - for (i = 0; i < 4096; i++) - { - *dst++ = (BYTE) (*r++); - *dst++ = (BYTE) (*g++); - *dst++ = (BYTE) (*b++); - *dst++ = 0xFF; - } + pSrc[0] = b; pSrc[1] = g; pSrc[2] = r; + prims->RGBToRGB_16s8u_P3AC4R( + (const INT16 **) pSrc, 64*sizeof(INT16), + dst, stride, &roi_64x64); break; case RDP_PIXEL_FORMAT_B8G8R8: - for (i = 0; i < 4096; i++) + for (y=0; y<64; y++) { - *dst++ = (BYTE) (*b++); - *dst++ = (BYTE) (*g++); - *dst++ = (BYTE) (*r++); + for (x=0; x<64; x++) + { + *dst++ = (BYTE) (*b++); + *dst++ = (BYTE) (*g++); + *dst++ = (BYTE) (*r++); + } + dst += stride - (64*3); } break; case RDP_PIXEL_FORMAT_R8G8B8: - for (i = 0; i < 4096; i++) + for (y=0; y<64; y++) { - *dst++ = (BYTE) (*r++); - *dst++ = (BYTE) (*g++); - *dst++ = (BYTE) (*b++); + for (x=0; x<64; x++) + { + *dst++ = (BYTE) (*r++); + *dst++ = (BYTE) (*g++); + *dst++ = (BYTE) (*b++); + } + dst += stride - (64*3); } break; default: @@ -86,69 +93,6 @@ static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf, } } -#define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v))) - -void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf) -{ - /* INT32 is used intentionally because we calculate with shifted factors! */ - INT32 y, cb, cr; - INT32 r, g, b; - int i; - - /** - * The decoded YCbCr coeffectients are represented as 11.5 fixed-point numbers: - * - * 1 sign bit + 10 integer bits + 5 fractional bits - * - * However only 7 integer bits will be actually used since the value range is [-128.0, 127.0]. - * In other words, the decoded coeffectients is scaled by << 5 when intepreted as INT16. - * It was scaled in the quantization phase, so we must scale it back here. - */ - for (i = 0; i < 4096; i++) - { - y = y_r_buf[i]; - cb = cb_g_buf[i]; - cr = cr_b_buf[i]; - -#if 0 - /** - * This is the slow floating point version kept here for reference - */ - - y = y + 4096; /* 128<<5=4096 so that we can scale the sum by >> 5 */ - - r = y + cr*1.403f; - g = y - cb*0.344f - cr*0.714f; - b = y + cb*1.770f; - - y_r_buf[i] = MINMAX(r>>5, 0, 255); - cb_g_buf[i] = MINMAX(g>>5, 0, 255); - cr_b_buf[i] = MINMAX(b>>5, 0, 255); -#else - /** - * We scale the factors by << 16 into 32-bit integers in order to avoid slower - * floating point multiplications. Since the final result needs to be scaled - * by >> 5 we will extract only the upper 11 bits (>> 21) from the final sum. - * Hence we also have to scale the other terms of the sum by << 16. - * - * R: 1.403 << 16 = 91947 - * G: 0.344 << 16 = 22544, 0.714 << 16 = 46792 - * B: 1.770 << 16 = 115998 - */ - - y = (y+4096)<<16; - - r = y + cr*91947; - g = y - cb*22544 - cr*46792; - b = y + cb*115998; - - y_r_buf[i] = MINMAX(r>>21, 0, 255); - cb_g_buf[i] = MINMAX(g>>21, 0, 255); - cr_b_buf[i] = MINMAX(b>>21, 0, 255); -#endif - } -} - static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantization_values, const BYTE* data, int size, INT16* buffer) { @@ -173,11 +117,18 @@ static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantizatio PROFILER_EXIT(context->priv->prof_rfx_decode_component); } +/* rfx_decode_ycbcr_to_rgb code now resides in the primitives library. */ + +/* stride is bytes between rows in the output buffer. */ void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in, int y_size, const UINT32 * y_quants, int cb_size, const UINT32 * cb_quants, - int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer) + int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer, int stride) { + static const prim_size_t roi_64x64 = { 64, 64 }; + const primitives_t *prims = primitives_get(); + INT16 *pSrcDst[3]; + PROFILER_ENTER(context->priv->prof_rfx_decode_rgb); rfx_decode_component(context, y_quants, stream_get_tail(data_in), y_size, context->priv->y_r_buffer); /* YData */ @@ -188,12 +139,16 @@ void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in, stream_seek(data_in, cr_size); PROFILER_ENTER(context->priv->prof_rfx_decode_ycbcr_to_rgb); - context->decode_ycbcr_to_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer); + pSrcDst[0] = context->priv->y_r_buffer; + pSrcDst[1] = context->priv->cb_g_buffer; + pSrcDst[2] = context->priv->cr_b_buffer; + prims->yCbCrToRGB_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16), + pSrcDst, 64*sizeof(INT16), &roi_64x64); PROFILER_EXIT(context->priv->prof_rfx_decode_ycbcr_to_rgb); PROFILER_ENTER(context->priv->prof_rfx_decode_format_rgb); rfx_decode_format_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer, - context->pixel_format, rgb_buffer); + context->pixel_format, rgb_buffer, stride); PROFILER_EXIT(context->priv->prof_rfx_decode_format_rgb); PROFILER_EXIT(context->priv->prof_rfx_decode_rgb); diff --git a/libfreerdp/codec/rfx_decode.h b/libfreerdp/codec/rfx_decode.h index 7a19b7bb2..e96eb66e2 100644 --- a/libfreerdp/codec/rfx_decode.h +++ b/libfreerdp/codec/rfx_decode.h @@ -22,12 +22,12 @@ #include -void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf); - +/* stride is bytes between rows in the output buffer. */ void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in, int y_size, const UINT32 * y_quants, int cb_size, const UINT32 * cb_quants, - int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer); + int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer, + int stride); #endif /* __RFX_DECODE_H */ diff --git a/libfreerdp/codec/rfx_encode.c b/libfreerdp/codec/rfx_encode.c index 8e511eac0..cedf85469 100644 --- a/libfreerdp/codec/rfx_encode.c +++ b/libfreerdp/codec/rfx_encode.c @@ -26,6 +26,8 @@ #include #include +#include + #include "rfx_types.h" #include "rfx_rlgr.h" #include "rfx_differential.h" @@ -180,47 +182,7 @@ static void rfx_encode_format_rgb(const BYTE* rgb_data, int width, int height, i } } -void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf) -{ - /* INT32 is used intentionally because we calculate with shifted factors! */ - int i; - INT32 r, g, b; - INT32 y, cb, cr; - - /** - * The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers: - * - * 1 sign bit + 10 integer bits + 5 fractional bits - * - * However only 7 integer bits will be actually used since the value range is [-128.0, 127.0]. - * In other words, the encoded coefficients is scaled by << 5 when interpreted as INT16. - * It will be scaled down to original during the quantization phase. - */ - for (i = 0; i < 4096; i++) - { - r = y_r_buf[i]; - g = cb_g_buf[i]; - b = cr_b_buf[i]; - - /* - * We scale the factors by << 15 into 32-bit integers in order to avoid slower - * floating point multiplications. Since the terms need to be scaled by << 5 we - * simply scale the final sum by >> 10 - * - * Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235, 0.114000 << 15 = 3735 - * Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868, 0.500590 << 15 = 16403 - * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, 0.081282 << 15 = 2663 - */ - - y = (r * 9798 + g * 19235 + b * 3735) >> 10; - cb = (r * -5535 + g * -10868 + b * 16403) >> 10; - cr = (r * 16377 + g * -13714 + b * -2663) >> 10; - - y_r_buf[i] = MINMAX(y - 4096, -4096, 4095); - cb_g_buf[i] = MINMAX(cb, -4096, 4095); - cr_b_buf[i] = MINMAX(cr, -4096, 4095); - } -} +/* rfx_encode_rgb_to_ycbcr code now resides in the primitives library. */ static void rfx_encode_component(RFX_CONTEXT* context, const UINT32* quantization_values, INT16* data, BYTE* buffer, int buffer_size, int* size) @@ -250,6 +212,9 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants, STREAM* data_out, int* y_size, int* cb_size, int* cr_size) { + primitives_t *prims = primitives_get(); + INT16* pSrcDst[3]; + static const prim_size_t roi_64x64 = { 64, 64 }; INT16* y_r_buffer = context->priv->y_r_buffer; INT16* cb_g_buffer = context->priv->cb_g_buffer; INT16* cr_b_buffer = context->priv->cr_b_buffer; @@ -261,9 +226,13 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h context->pixel_format, context->palette, y_r_buffer, cb_g_buffer, cr_b_buffer); PROFILER_EXIT(context->priv->prof_rfx_encode_format_rgb); - PROFILER_ENTER(context->priv->prof_rfx_encode_rgb_to_ycbcr); - context->encode_rgb_to_ycbcr(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer); - PROFILER_EXIT(context->priv->prof_rfx_encode_rgb_to_ycbcr); + PROFILER_ENTER(context->priv->prof_rfx_rgb_to_ycbcr); + pSrcDst[0] = context->priv->y_r_buffer; + pSrcDst[1] = context->priv->cb_g_buffer; + pSrcDst[2] = context->priv->cr_b_buffer; + prims->RGBToYCbCr_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16), + pSrcDst, 64*sizeof(INT16), &roi_64x64); + PROFILER_EXIT(context->priv->prof_rfx_rgb_to_ycbcr); /* Ensure the buffer is reasonably large enough */ stream_check_size(data_out, 4096); diff --git a/libfreerdp/codec/rfx_encode.h b/libfreerdp/codec/rfx_encode.h index 56220e838..94dfc706d 100644 --- a/libfreerdp/codec/rfx_encode.h +++ b/libfreerdp/codec/rfx_encode.h @@ -22,8 +22,6 @@ #include -void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf); - void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int height, int rowstride, const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants, STREAM* data_out, int* y_size, int* cb_size, int* cr_size); diff --git a/libfreerdp/codec/rfx_neon.c b/libfreerdp/codec/rfx_neon.c index 11965e4b1..7331ea18b 100644 --- a/libfreerdp/codec/rfx_neon.c +++ b/libfreerdp/codec/rfx_neon.c @@ -35,56 +35,7 @@ #include "cpu-features.h" #endif -void rfx_decode_YCbCr_to_RGB_NEON(INT16 * y_r_buffer, INT16 * cb_g_buffer, INT16 * cr_b_buffer) -{ - int16x8_t zero = vdupq_n_s16(0); - int16x8_t max = vdupq_n_s16(255); - int16x8_t y_add = vdupq_n_s16(128); - - int16x8_t* y_r_buf = (int16x8_t*) y_r_buffer; - int16x8_t* cb_g_buf = (int16x8_t*) cb_g_buffer; - int16x8_t* cr_b_buf = (int16x8_t*) cr_b_buffer; - - int i; - for (i = 0; i < 4096 / 8; i++) - { - int16x8_t y = vld1q_s16((INT16*) &y_r_buf[i]); - y = vaddq_s16(y, y_add); - - int16x8_t cr = vld1q_s16((INT16*) &cr_b_buf[i]); - - // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255); - int16x8_t r = vaddq_s16(y, cr); - r = vaddq_s16(r, vshrq_n_s16(cr, 2)); - r = vaddq_s16(r, vshrq_n_s16(cr, 3)); - r = vaddq_s16(r, vshrq_n_s16(cr, 5)); - r = vminq_s16(vmaxq_s16(r, zero), max); - vst1q_s16((INT16*)&y_r_buf[i], r); - - // cb = cb_g_buf[i]; - int16x8_t cb = vld1q_s16((INT16*)&cb_g_buf[i]); - - // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); - int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2)); - g = vsubq_s16(g, vshrq_n_s16(cb, 4)); - g = vsubq_s16(g, vshrq_n_s16(cb, 5)); - g = vsubq_s16(g, vshrq_n_s16(cr, 1)); - g = vsubq_s16(g, vshrq_n_s16(cr, 3)); - g = vsubq_s16(g, vshrq_n_s16(cr, 4)); - g = vsubq_s16(g, vshrq_n_s16(cr, 5)); - g = vminq_s16(vmaxq_s16(g, zero), max); - vst1q_s16((INT16*)&cb_g_buf[i], g); - - // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255); - int16x8_t b = vaddq_s16(y, cb); - b = vaddq_s16(b, vshrq_n_s16(cb, 1)); - b = vaddq_s16(b, vshrq_n_s16(cb, 2)); - b = vaddq_s16(b, vshrq_n_s16(cb, 6)); - b = vminq_s16(vmaxq_s16(b, zero), max); - vst1q_s16((INT16*)&cr_b_buf[i], b); - } - -} +/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */ static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const UINT32 factor) @@ -338,11 +289,10 @@ void rfx_init_neon(RFX_CONTEXT * context) { DEBUG_RFX("Using NEON optimizations"); - IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON"); + IF_PROFILER(context->priv->prof_rfx_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON"); IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON"); IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON"); - context->decode_ycbcr_to_rgb = rfx_decode_YCbCr_to_RGB_NEON; context->quantization_decode = rfx_quantization_decode_NEON; context->dwt_2d_decode = rfx_dwt_2d_decode_NEON; } diff --git a/libfreerdp/codec/rfx_quantization.c b/libfreerdp/codec/rfx_quantization.c index a25a8dabe..2cedfd91d 100644 --- a/libfreerdp/codec/rfx_quantization.c +++ b/libfreerdp/codec/rfx_quantization.c @@ -21,36 +21,34 @@ #include "config.h" #endif +#include #include "rfx_quantization.h" -static void rfx_quantization_decode_block(INT16* buffer, int buffer_size, UINT32 factor) +static void rfx_quantization_decode_block(const primitives_t *prims, INT16* buffer, int buffer_size, UINT32 factor) { - INT16* dst; - if (factor == 0) return; - for (dst = buffer; buffer_size > 0; dst++, buffer_size--) - { - *dst <<= factor; - } + prims->lShiftC_16s(buffer, factor, buffer, buffer_size); } void rfx_quantization_decode(INT16* buffer, const UINT32* quantization_values) { - /* Scale the values so that they are represented as 11.5 fixed-point number */ - rfx_quantization_decode_block(buffer, 4096, 5); + const primitives_t *prims = primitives_get(); - rfx_quantization_decode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */ - rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */ - rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */ - rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */ - rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */ - rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */ - rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */ - rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */ - rfx_quantization_decode_block(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */ - rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */ + /* Scale the values so that they are represented as 11.5 fixed-point number */ + rfx_quantization_decode_block(prims, buffer, 4096, 5); + + rfx_quantization_decode_block(prims, buffer, 1024, quantization_values[8] - 6); /* HL1 */ + rfx_quantization_decode_block(prims, buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */ + rfx_quantization_decode_block(prims, buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */ + rfx_quantization_decode_block(prims, buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */ + rfx_quantization_decode_block(prims, buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */ + rfx_quantization_decode_block(prims, buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */ + rfx_quantization_decode_block(prims, buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */ + rfx_quantization_decode_block(prims, buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */ + rfx_quantization_decode_block(prims, buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */ + rfx_quantization_decode_block(prims, buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */ } static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32 factor) @@ -62,6 +60,7 @@ static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32 return; half = (1 << (factor - 1)); + /* Could probably use prims->rShiftC_16s(dst+half, factor, dst, buffer_size); */ for (dst = buffer; buffer_size > 0; dst++, buffer_size--) { *dst = (*dst + half) >> factor; diff --git a/libfreerdp/codec/rfx_sse2.c b/libfreerdp/codec/rfx_sse2.c index ac2376d28..72b0570a1 100644 --- a/libfreerdp/codec/rfx_sse2.c +++ b/libfreerdp/codec/rfx_sse2.c @@ -52,177 +52,8 @@ _mm_prefetch_buffer(char * buffer, int num_bytes) } } -static void rfx_decode_ycbcr_to_rgb_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer) -{ - __m128i zero = _mm_setzero_si128(); - __m128i max = _mm_set1_epi16(255); - - __m128i* y_r_buf = (__m128i*) y_r_buffer; - __m128i* cb_g_buf = (__m128i*) cb_g_buffer; - __m128i* cr_b_buf = (__m128i*) cr_b_buffer; - - __m128i y; - __m128i cr; - __m128i cb; - __m128i r; - __m128i g; - __m128i b; - - int i; - - __m128i r_cr = _mm_set1_epi16(22986); // 1.403 << 14 - __m128i g_cb = _mm_set1_epi16(-5636); // -0.344 << 14 - __m128i g_cr = _mm_set1_epi16(-11698); // -0.714 << 14 - __m128i b_cb = _mm_set1_epi16(28999); // 1.770 << 14 - __m128i c4096 = _mm_set1_epi16(4096); - - for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) - { - _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); - } - for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++) - { - /* - In order to use SSE2 signed 16-bit integer multiplication we need to convert - the floating point factors to signed int without loosing information. - The result of this multiplication is 32 bit and we have two SSE instructions - that return either the hi or lo word. - Thus we will multiply the factors by the highest possible 2^n, take the - upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this - result by multiplying it by 2^(16-n). - For the given factors in the conversion matrix the best possible n is 14. - - Example for calculating r: - r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula - r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above - r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification - r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 - */ - - /* y = (y_r_buf[i] + 4096) >> 2 */ - y = _mm_load_si128(&y_r_buf[i]); - y = _mm_add_epi16(y, c4096); - y = _mm_srai_epi16(y, 2); - /* cb = cb_g_buf[i]; */ - cb = _mm_load_si128(&cb_g_buf[i]); - /* cr = cr_b_buf[i]; */ - cr = _mm_load_si128(&cr_b_buf[i]); - - /* (y + HIWORD(cr*22986)) >> 3 */ - r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); - r = _mm_srai_epi16(r, 3); - /* y_r_buf[i] = MINMAX(r, 0, 255); */ - _mm_between_epi16(r, zero, max); - _mm_store_si128(&y_r_buf[i], r); - - /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ - g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); - g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); - g = _mm_srai_epi16(g, 3); - /* cb_g_buf[i] = MINMAX(g, 0, 255); */ - _mm_between_epi16(g, zero, max); - _mm_store_si128(&cb_g_buf[i], g); - - /* (y + HIWORD(cb*28999)) >> 3 */ - b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); - b = _mm_srai_epi16(b, 3); - /* cr_b_buf[i] = MINMAX(b, 0, 255); */ - _mm_between_epi16(b, zero, max); - _mm_store_si128(&cr_b_buf[i], b); - } -} - -/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */ -static void rfx_encode_rgb_to_ycbcr_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer) -{ - __m128i min = _mm_set1_epi16(-128 << 5); - __m128i max = _mm_set1_epi16(127 << 5); - - __m128i* y_r_buf = (__m128i*) y_r_buffer; - __m128i* cb_g_buf = (__m128i*) cb_g_buffer; - __m128i* cr_b_buf = (__m128i*) cr_b_buffer; - - __m128i y; - __m128i cr; - __m128i cb; - __m128i r; - __m128i g; - __m128i b; - - __m128i y_r = _mm_set1_epi16(9798); // 0.299000 << 15 - __m128i y_g = _mm_set1_epi16(19235); // 0.587000 << 15 - __m128i y_b = _mm_set1_epi16(3735); // 0.114000 << 15 - __m128i cb_r = _mm_set1_epi16(-5535); // -0.168935 << 15 - __m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15 - __m128i cb_b = _mm_set1_epi16(16403); // 0.500590 << 15 - __m128i cr_r = _mm_set1_epi16(16377); // 0.499813 << 15 - __m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15 - __m128i cr_b = _mm_set1_epi16(-2663); // -0.081282 << 15 - - int i; - - for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) - { - _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA); - } - for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++) - { - /* - In order to use SSE2 signed 16-bit integer multiplication we need to convert - the floating point factors to signed int without loosing information. - The result of this multiplication is 32 bit and using SSE2 we get either the - product's hi or lo word. - Thus we will multiply the factors by the highest possible 2^n and take the - upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16). - Since the final result needs to be scaled by << 5 and also in in order to keep - the precision within the upper 16 bits we will also have to scale the RGB - values used in the multiplication by << 5+(16-n). - */ - - /* r = y_r_buf[i]; */ - r = _mm_load_si128(&y_r_buf[i]); - - /* g = cb_g_buf[i]; */ - g = _mm_load_si128(&cb_g_buf[i]); - - /* b = cr_b_buf[i]; */ - b = _mm_load_si128(&cr_b_buf[i]); - - /* r<<6; g<<6; b<<6 */ - r = _mm_slli_epi16(r, 6); - g = _mm_slli_epi16(g, 6); - b = _mm_slli_epi16(b, 6); - - /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */ - y = _mm_mulhi_epi16(r, y_r); - y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g)); - y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b)); - y = _mm_add_epi16(y, min); - /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */ - _mm_between_epi16(y, min, max); - _mm_store_si128(&y_r_buf[i], y); - - /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */ - cb = _mm_mulhi_epi16(r, cb_r); - cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g)); - cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b)); - /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */ - _mm_between_epi16(cb, min, max); - _mm_store_si128(&cb_g_buf[i], cb); - - /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */ - cr = _mm_mulhi_epi16(r, cr_r); - cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g)); - cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b)); - /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */ - _mm_between_epi16(cr, min, max); - _mm_store_si128(&cr_b_buf[i], cr); - } -} +/* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */ +/* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */ static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) rfx_quantization_decode_block_sse2(INT16* buffer, const int buffer_size, const UINT32 factor) @@ -658,15 +489,11 @@ void rfx_init_sse2(RFX_CONTEXT* context) { DEBUG_RFX("Using SSE2 optimizations"); - IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2"); - IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2"); IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2"); IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2"); IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2"); IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2"); - context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2; - context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2; context->quantization_decode = rfx_quantization_decode_sse2; context->quantization_encode = rfx_quantization_encode_sse2; context->dwt_2d_decode = rfx_dwt_2d_decode_sse2; diff --git a/libfreerdp/codec/rfx_types.h b/libfreerdp/codec/rfx_types.h index 223cbf9cd..0d8340877 100644 --- a/libfreerdp/codec/rfx_types.h +++ b/libfreerdp/codec/rfx_types.h @@ -60,7 +60,7 @@ struct _RFX_CONTEXT_PRIV PROFILER_DEFINE(prof_rfx_differential_decode); PROFILER_DEFINE(prof_rfx_quantization_decode); PROFILER_DEFINE(prof_rfx_dwt_2d_decode); - PROFILER_DEFINE(prof_rfx_decode_ycbcr_to_rgb); + PROFILER_DEFINE(prof_rfx_ycbcr_to_rgb); PROFILER_DEFINE(prof_rfx_decode_format_rgb); PROFILER_DEFINE(prof_rfx_encode_rgb); @@ -69,7 +69,7 @@ struct _RFX_CONTEXT_PRIV PROFILER_DEFINE(prof_rfx_differential_encode); PROFILER_DEFINE(prof_rfx_quantization_encode); PROFILER_DEFINE(prof_rfx_dwt_2d_encode); - PROFILER_DEFINE(prof_rfx_encode_rgb_to_ycbcr); + PROFILER_DEFINE(prof_rfx_rgb_to_ycbcr); PROFILER_DEFINE(prof_rfx_encode_format_rgb); }; diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt new file mode 100644 index 000000000..92f495e72 --- /dev/null +++ b/libfreerdp/primitives/CMakeLists.txt @@ -0,0 +1,81 @@ +# FreeRDP: A Remote Desktop Protocol Client +# libfreerdp-primitives cmake build script +# vi:ts=4 sw=4: +# +# (c) Copyright 2012 Hewlett-Packard Development Company, L.P. +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing permissions +# and limitations under the License. +# + +set(FREERDP_PRIMITIVE_CFILES + prim_add.c + prim_andor.c + prim_alphaComp.c + prim_colors.c + prim_copy.c + prim_set.c + prim_shift.c + prim_sign.c + primitives.c + ) +set(FREERDP_PRIMITIVE_HEADERS + prim_internal.h + ) +set(FREERDP_PRIMITIVE_SRCS + ${FREERDP_PRIMITIVE_CFILES} + ${FREERDP_PRIMITIVE_HEADERS} + ) + +add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}) + +### IPP Variable debugging +if(WITH_IPP) + if(CMAKE_COMPILER_IS_GNUCC) + foreach(INCLDIR ${IPP_INCLUDE_DIRS}) + set(OPTIMIZATION "${OPTIMIZATION} -I${INCLDIR}") + endforeach(INCLDIR) + endif() +endif() + +if(WITH_SSE2) + if(CMAKE_COMPILER_IS_GNUCC) + set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3 -Wdeclaration-after-statement") + endif() + + if(MSVC) + set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2") + endif() +elseif(WITH_NEON) + if(CMAKE_COMPILER_IS_GNUCC) + set(OPTIMZATION "${OPTIMIZATION} -mfpu=neon -mfloat-abi=softfp") + endif() + # TODO: Add MSVC equivalent +endif() +set_property(SOURCE ${FREERDP_PRIMITIVE_CFILES} PROPERTY COMPILE_FLAGS ${OPTIMIZATION}) + +if(WITH_MONOLITHIC_BUILD) + add_library(freerdp-primitives OBJECT ${FREERDP_PRIMITIVE_SRCS}) +else() + add_library(freerdp-primitives ${FREERDP_PRIMITIVE_SRCS}) +endif() + +set_target_properties(freerdp-primitives PROPERTIES VERSION ${FREERDP_VERSION_FULL} SOVERSION ${FREERDP_VERSION} PREFIX "lib") + +#set(FREERDP_PRIMITIVE_LIBS, rt) + +if(WITH_MONOLITHIC_BUILD) + set(FREERDP_LIBS ${FREERDP_LIBS} ${FREERDP_PRIMITIVE_LIBS} PARENT_SCOPE) +else() + target_link_libraries(freerdp-primitives ${FREERDP_PRIMITIVE_LIBS}) + install(TARGETS freerdp-primitives DESTINATION ${CMAKE_INSTALL_LIBDIR}) +endif() + +if(BUILD_TESTING) + add_subdirectory(test) +endif(BUILD_TESTING) diff --git a/libfreerdp/primitives/README.txt b/libfreerdp/primitives/README.txt new file mode 100644 index 000000000..369102c0d --- /dev/null +++ b/libfreerdp/primitives/README.txt @@ -0,0 +1,113 @@ +The Primitives Library + +Introduction +------------ +The purpose of the primitives library is to give the freerdp code easy +access to *run-time* optimization via SIMD operations. When the library +is initialized, dynamic checks of processor features are run (such as +the support of SSE3 or Neon), and entrypoints are linked to through +function pointers to provide the fastest possible operations. All +routines offer generic C alternatives as fallbacks. + +Run-time optimization has the advantage of allowing a single executable +to run fast on multiple platforms with different SIMD capabilities. + + +Use In Code +----------- +A singleton pointing to a structure containing the function pointers +is accessed through primitives_get(). The function pointers can then +be used from that structure, e.g. + + primitives_t *prims = primitives_get(); + prims->shiftC_16s(buffer, shifts, buffer, 256); + +Of course, there is some overhead in calling through the function pointer +and setting up the SIMD operations, so it would be counterproductive to +call the primitives library for very small operation, e.g. initializing an +array of eight values to a constant. The primitives library is intended +for larger-scale operations, e.g. arrays of size 64 and larger. + + +Initialization and Cleanup +-------------------------- +Library initialization is done the first time primitives_init() is called +or the first time primitives_get() is used. Cleanup (if any) is done by +primitives_deinit(). + + +Intel Integrated Performance Primitives (IPP) +--------------------------------------------- +If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function +calls will be used (where available) to fill the function pointers. +Where possible, function names and parameter lists match IPP format so +that the IPP functions can be plugged into the function pointers without +a wrapper layer. Use of IPP is completely optional, and in many cases +the SSE operations in the primitives library itself are faster or similar +in performance. + + +Coverage +-------- +The primitives library is not meant to be comprehensive, offering +entrypoints for every operation and operand type. Instead, the coverage +is focused on operations known to be performance bottlenecks in the code. +For instance, 16-bit signed operations are used widely in the RemoteFX +software, so you'll find 16s versions of several operations, but there +is no attempt to provide (unused) copies of the same code for 8u, 16u, +32s, etc. + + +New Optimizations +----------------- +As the need arises, new optimizations can be added to the library, +including NEON, AVX, and perhaps OpenCL or other SIMD implementations. +The initialization routine is free to do any quick run-time test to +determine which features are available before hooking the operation's +function pointer, or it can simply look at the processor features list +from the hints passed to the initialization routine. + + +Adding Entrypoints +------------------ +As the need for new operations or operands arises, new entrypoints can +be added. + 1) Function prototypes and pointers are added to + include/freerdp/primitives.h + 2) New module initialization and cleanup function prototypes are added + to prim_internal.h and called in primitives.c (primitives_init() + and primitives_deinit()). + 3) Operation names and parameter lists should be compatible with the IPP. + IPP manuals are available online at software.intel.com. + 4) A generic C entrypoint must be available as a fallback. + 5) prim_templates.h contains macro-based templates for simple operations, + such as applying a single SSE operation to arrays of data. + The template functions can frequently be used to extend the + operations without writing a lot of new code. + + +Flags +----- +The entrypoint primitives_get_flags() returns a bitfield of processor flags +(as defined in primitives.h) and primitives_flag_str() returns a string +related to those processor flags, for debugging and information. The +bitfield can be used elsewhere in the code as needed. + + +Cache Management +---------------- +I haven't found a lot of speed improvement by attempting prefetch, and +in fact it seems to have a negative impact in some cases. Done correctly +perhaps the routines could be further accelerated by proper use of prefetch, +fences, etc. + + +Testing +------- +In the test subdirectory is an executable (prim_test) that tests both +functionality and speed of primitives library operations. Any new +modules should be added to that test, following the conventions already +established in that directory. The program can be executed on various +target hardware to compare generic C, optimized, and IPP performance +with various array sizes. + diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c new file mode 100644 index 000000000..82e2eb9bb --- /dev/null +++ b/libfreerdp/primitives/prim_add.c @@ -0,0 +1,81 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Add operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +#include +#include +#include +#include +#ifdef WITH_SSE2 +# include +# include +#endif /* WITH_SSE2 */ +#ifdef WITH_IPP +# include +#endif /* WITH_IPP */ +#include "prim_internal.h" +#include "prim_templates.h" + +/* ---------------------------------------------------------------------------- + * 16-bit signed add with saturation (under and over). + */ +PRIM_STATIC pstatus_t general_add_16s( + const INT16 *pSrc1, + const INT16 *pSrc2, + INT16 *pDst, + INT32 len) +{ + while (len--) + { + INT32 k = (INT32) (*pSrc1++) + (INT32) (*pSrc2++); + if (k > 32767) *pDst++ = ((INT16) 32767); + else if (k < -32768) *pDst++ = ((INT16) -32768); + else *pDst++ = (INT16) k; + } + + return PRIMITIVES_SUCCESS; +} + +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s, + _mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1)) +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_add( + const primitives_hints_t *hints, + primitives_t *prims) +{ + prims->add_16s = general_add_16s; +#ifdef WITH_IPP + prims->add_16s = (__add_16s_t) ippsAdd_16s; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */ + { + prims->add_16s = sse3_add_16s; + } +#endif +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_add( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c new file mode 100644 index 000000000..82a844c52 --- /dev/null +++ b/libfreerdp/primitives/prim_alphaComp.c @@ -0,0 +1,298 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Alpha blending routines. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * Note: this code assumes the second operand is fully opaque, + * e.g. + * newval = alpha1*val1 + (1-alpha1)*val2 + * rather than + * newval = alpha1*val1 + (1-alpha1)*alpha2*val2 + * The IPP gives other options. + */ + +#include +#include +#include +#include +#include "prim_internal.h" +#ifdef WITH_SSE2 +# include +# include +#endif /* WITH_SSE2 */ +#ifdef WITH_IPP +# include +#endif /* WITH_IPP */ + +#define ALPHA(_k_) (((_k_) & 0xFF000000U) >> 24) +#define RED(_k_) (((_k_) & 0x00FF0000U) >> 16) +#define GRN(_k_) (((_k_) & 0x0000FF00U) >> 8) +#define BLU(_k_) (((_k_) & 0x000000FFU)) + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_alphaComp_argb( + const BYTE *pSrc1, INT32 src1Step, + const BYTE *pSrc2, INT32 src2Step, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + const UINT32 *sptr1 = (const UINT32 *) pSrc1; + const UINT32 *sptr2 = (const UINT32 *) pSrc2; + UINT32 *dptr = (UINT32 *) pDst; + int linebytes = width * sizeof(UINT32); + int src1Jump = (src1Step - linebytes) / sizeof(UINT32); + int src2Jump = (src2Step - linebytes) / sizeof(UINT32); + int dstJump = (dstStep - linebytes) / sizeof(UINT32); + + int y; + for (y=0; y> 8) & 0x00FF00FFU; + UINT32 s1rb = src1 & 0x00FF00FFU; + UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU; + + UINT32 drb = s1rb - s2rb; + UINT32 dag = s1ag - s2ag; + drb *= alpha; + dag *= alpha; + + rb = ((drb >> 8) + s2rb) & 0x00FF00FFU; + ag = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U; + *dptr++ = rb | ag; + } + } + sptr1 += src1Jump; + sptr2 += src2Jump; + dptr += dstJump; + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) + +PRIM_STATIC pstatus_t sse2_alphaComp_argb( + const BYTE *pSrc1, INT32 src1Step, + const BYTE *pSrc2, INT32 src2Step, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + const UINT32 *sptr1 = (const UINT32 *) pSrc1; + const UINT32 *sptr2 = (const UINT32 *) pSrc2; + UINT32 *dptr; + int linebytes, src1Jump, src2Jump, dstJump, y; + __m128i xmm0, xmm1; + + if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS; + + if (width < 4) /* pointless if too small */ + { + return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, + pDst, dstStep, width, height); + } + dptr = (UINT32 *) pDst; + linebytes = width * sizeof(UINT32); + src1Jump = (src1Step - linebytes) / sizeof(UINT32); + src2Jump = (src2Step - linebytes) / sizeof(UINT32); + dstJump = (dstStep - linebytes) / sizeof(UINT32); + + xmm0 = _mm_set1_epi32(0); + xmm1 = _mm_set1_epi16(1); + + for (y=0; y> 2; + pixels -= count << 2; + while (count--) + { + __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ + xmm2 = LOAD_SI128(sptr1); sptr1 += 4; + /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ + xmm3 = LOAD_SI128(sptr2); sptr2 += 4; + /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ + xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); + /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ + xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); + /* subtract */ + xmm6 = _mm_subs_epi16(xmm4, xmm5); + /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ + xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); + /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ + xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); + /* Add one to alphas */ + xmm4 = _mm_adds_epi16(xmm4, xmm1); + /* Multiply and take low word */ + xmm4 = _mm_mullo_epi16(xmm4, xmm6); + /* Shift 8 right */ + xmm4 = _mm_srai_epi16(xmm4, 8); + /* Add xmm5 */ + xmm4 = _mm_adds_epi16(xmm4, xmm5); + /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ + + /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ + xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); + /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ + xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); + /* subtract */ + xmm7 = _mm_subs_epi16(xmm5, xmm6); + /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ + xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); + /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ + xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); + /* Add one to alphas */ + xmm5 = _mm_adds_epi16(xmm5, xmm1); + /* Multiply and take low word */ + xmm5 = _mm_mullo_epi16(xmm5, xmm7); + /* Shift 8 right */ + xmm5 = _mm_srai_epi16(xmm5, 8); + /* Add xmm6 */ + xmm5 = _mm_adds_epi16(xmm5, xmm6); + /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */ + + /* Must mask off remainders or pack gets confused */ + xmm3 = _mm_set1_epi16(0x00ffU); + xmm4 = _mm_and_si128(xmm4, xmm3); + xmm5 = _mm_and_si128(xmm5, xmm3); + + /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ + xmm5 = _mm_packus_epi16(xmm5, xmm4); + _mm_store_si128((__m128i *) dptr, xmm5); dptr += 4; + } + + /* Finish off the remainder. */ + if (pixels) + { + general_alphaComp_argb((const BYTE *) sptr1, src1Step, + (const BYTE *) sptr2, src2Step, + (BYTE *) dptr, dstStep, pixels, 1); + sptr1 += pixels; + sptr2 += pixels; + dptr += pixels; + } + + /* Jump to next row. */ + sptr1 += src1Jump; + sptr2 += src2Jump; + dptr += dstJump; + } + + return PRIMITIVES_SUCCESS; +} +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t ipp_alphaComp_argb( + const BYTE *pSrc1, INT32 src1Step, + const BYTE *pSrc2, INT32 src2Step, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + IppiSize sz; + sz.width = width; + sz.height = height; + return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, + pDst, dstStep, sz, ippAlphaOver); +} +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_alphaComp( + const primitives_hints_t *hints, + primitives_t *prims) +{ + prims->alphaComp_argb = general_alphaComp_argb; +#ifdef WITH_IPP + prims->alphaComp_argb = ipp_alphaComp_argb; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */ + { + prims->alphaComp_argb = sse2_alphaComp_argb; + } +#endif +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_alphaComp( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_andor.c b/libfreerdp/primitives/prim_andor.c new file mode 100644 index 000000000..4fe2ba5b3 --- /dev/null +++ b/libfreerdp/primitives/prim_andor.c @@ -0,0 +1,94 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Logical operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include +#include +#include +#include +#ifdef WITH_SSE2 +# include +# include +#endif /* WITH_SSE2 */ +#ifdef WITH_IPP +# include +#endif /* WITH_IPP */ +#include "prim_internal.h" +#include "prim_templates.h" + +/* ---------------------------------------------------------------------------- + * 32-bit AND with a constant. + */ +PRIM_STATIC pstatus_t general_andC_32u( + const UINT32 *pSrc, + UINT32 val, + UINT32 *pDst, + INT32 len) +{ + if (val == 0) return PRIMITIVES_SUCCESS; + while (len--) *pDst++ = *pSrc++ & val; + return PRIMITIVES_SUCCESS; +} + +/* ---------------------------------------------------------------------------- + * 32-bit OR with a constant. + */ +PRIM_STATIC pstatus_t general_orC_32u( + const UINT32 *pSrc, + UINT32 val, + UINT32 *pDst, + INT32 len) +{ + if (val == 0) return PRIMITIVES_SUCCESS; + while (len--) *pDst++ = *pSrc++ | val; + return PRIMITIVES_SUCCESS; +} + +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u, + _mm_and_si128, *dptr++ = *sptr++ & val) +SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u, + _mm_or_si128, *dptr++ = *sptr++ | val) +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_andor( + const primitives_hints_t *hints, + primitives_t *prims) +{ + /* Start with the default. */ + prims->andC_32u = general_andC_32u; + prims->orC_32u = general_orC_32u; +#if defined(WITH_IPP) + prims->andC_32u = (__andC_32u_t) ippsAndC_32u; + prims->orC_32u = (__orC_32u_t) ippsOrC_32u; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) + { + prims->andC_32u = sse3_andC_32u; + prims->orC_32u = sse3_orC_32u; + } +#endif +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_andor( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c new file mode 100644 index 000000000..5a1a099c6 --- /dev/null +++ b/libfreerdp/primitives/prim_colors.c @@ -0,0 +1,742 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Color conversion operations. + * vi:ts=4 sw=4: + * + * Copyright 2011 Stephen Erisman + * Copyright 2011 Norbert Federa + * Copyright 2011 Martin Fleisz + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#ifdef WITH_SSE2 +#include +#elif WITH_NEON +#include +#endif /* WITH_SSE2 else WITH_NEON */ +#include "prim_internal.h" +#include "prim_templates.h" + +#ifndef MINMAX +#define MINMAX(_v_, _l_, _h_) \ + ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_))) +#endif /* !MINMAX */ + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3( + const INT16 *pSrc[3], INT32 srcStep, + INT16 *pDst[3], INT32 dstStep, + const prim_size_t *roi) /* region of interest */ +{ + /** + * The decoded YCbCr coeffectients are represented as 11.5 fixed-point + * numbers: + * + * 1 sign bit + 10 integer bits + 5 fractional bits + * + * However only 7 integer bits will be actually used since the value range + * is [-128.0, 127.0]. In other words, the decoded coefficients are scaled + * by << 5 when interpreted as INT16. + * It was scaled in the quantization phase, so we must scale it back here. + */ + const INT16 *yptr = pSrc[0]; + const INT16 *cbptr = pSrc[1]; + const INT16 *crptr = pSrc[2]; + INT16 *rptr = pDst[0]; + INT16 *gptr = pDst[1]; + INT16 *bptr = pDst[2]; + int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + int y; + + for (y=0; yheight; y++) + { + int x; + for (x=0; xwidth; ++x) + { + /* INT32 is used intentionally because we calculate + * with shifted factors! + */ + INT32 y = (INT32) (*yptr++); + INT32 cb = (INT32) (*cbptr++); + INT32 cr = (INT32) (*crptr++); + INT32 r,g,b; + + /* + * This is the slow floating point version kept here for reference. + * y = y + 4096; // 128<<5=4096 so that we can scale the sum by>>5 + * r = y + cr*1.403f; + * g = y - cb*0.344f - cr*0.714f; + * b = y + cb*1.770f; + * y_r_buf[i] = MINMAX(r>>5, 0, 255); + * cb_g_buf[i] = MINMAX(g>>5, 0, 255); + * cr_b_buf[i] = MINMAX(b>>5, 0, 255); + */ + + /* + * We scale the factors by << 16 into 32-bit integers in order to + * avoid slower floating point multiplications. Since the final + * result needs to be scaled by >> 5 we will extract only the + * upper 11 bits (>> 21) from the final sum. + * Hence we also have to scale the other terms of the sum by << 16. + * R: 1.403 << 16 = 91947 + * G: 0.344 << 16 = 22544, 0.714 << 16 = 46792 + * B: 1.770 << 16 = 115998 + */ + y = (y+4096)<<16; + + r = y + cr*91947; + g = y - cb*22544 - cr*46792; + b = y + cb*115998; + + *rptr++ = MINMAX(r>>21, 0, 255); + *gptr++ = MINMAX(g>>21, 0, 255); + *bptr++ = MINMAX(b>>21, 0, 255); + } + yptr += srcbump; + cbptr += srcbump; + crptr += srcbump; + rptr += dstbump; + gptr += dstbump; + bptr += dstbump; + } + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3( + const INT16 *pSrc[3], INT32 srcStep, + INT16 *pDst[3], INT32 dstStep, + const prim_size_t *roi) /* region of interest */ +{ + /* The encoded YCbCr coefficients are represented as 11.5 fixed-point + * numbers: + * + * 1 sign bit + 10 integer bits + 5 fractional bits + * + * However only 7 integer bits will be actually used since the value + * range is [-128.0, 127.0]. In other words, the encoded coefficients + * is scaled by << 5 when interpreted as INT16. + * It will be scaled down to original during the quantization phase. + */ + const INT16 *rptr = pSrc[0]; + const INT16 *gptr = pSrc[1]; + const INT16 *bptr = pSrc[2]; + INT16 *yptr = pDst[0]; + INT16 *cbptr = pDst[1]; + INT16 *crptr = pDst[2]; + int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + int y; + + for (y=0; yheight; y++) + { + int x; + for (x=0; xwidth; ++x) + { + /* INT32 is used intentionally because we calculate with + * shifted factors! + */ + INT32 r = (INT32) (*rptr++); + INT32 g = (INT32) (*gptr++); + INT32 b = (INT32) (*bptr++); + + /* We scale the factors by << 15 into 32-bit integers in order + * to avoid slower floating point multiplications. Since the + * terms need to be scaled by << 5 we simply scale the final + * sum by >> 10 + * + * Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235, + * 0.114000 << 15 = 3735 + * Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868, + * 0.500590 << 15 = 16403 + * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, + * 0.081282 << 15 = 2663 + */ + INT32 y = (r * 9798 + g * 19235 + b * 3735) >> 10; + INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10; + INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10; + + *yptr++ = (INT16) MINMAX(y - 4096, -4096, 4095); + *cbptr++ = (INT16) MINMAX(cb, -4096, 4095); + *crptr++ = (INT16) MINMAX(cr, -4096, 4095); + } + yptr += srcbump; + cbptr += srcbump; + crptr += srcbump; + rptr += dstbump; + gptr += dstbump; + bptr += dstbump; + } + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R( + const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */ + int srcStep, /* bytes between rows in source data */ + BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */ + int dstStep, /* bytes between rows in dest data */ + const prim_size_t *roi) /* region of interest */ +{ + const INT16 *r = pSrc[0]; + const INT16 *g = pSrc[1]; + const INT16 *b = pSrc[2]; + BYTE *dst = pDst; + int x,y; + int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + int dstbump = (dstStep - (roi->width * sizeof(UINT32))); + + for (y=0; yheight; ++y) + { + for (x=0; xwidth; ++x) + { + *dst++ = (BYTE) (*b++); + *dst++ = (BYTE) (*g++); + *dst++ = (BYTE) (*r++); + *dst++ = ((BYTE) (0xFFU)); + } + dst += dstbump; + r += srcbump; + g += srcbump; + b += srcbump; + } + return PRIMITIVES_SUCCESS; +} + + +#ifdef WITH_SSE2 + +#ifdef __GNUC__ +# define GNU_INLINE \ + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +#else +# define GNU_INLINE +#endif + +#define CACHE_LINE_BYTES 64 + +#define _mm_between_epi16(_val, _min, _max) \ + do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0) + +#ifdef DO_PREFETCH +/*---------------------------------------------------------------------------*/ +static inline void GNU_INLINE _mm_prefetch_buffer( + char * buffer, + int num_bytes) +{ + __m128i * buf = (__m128i*) buffer; + unsigned int i; + for (i = 0; i < (num_bytes / sizeof(__m128i)); + i+=(CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA); + } +} +#endif /* DO_PREFETCH */ + +/*---------------------------------------------------------------------------*/ +PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3( + const INT16 *pSrc[3], + int srcStep, + INT16 *pDst[3], + int dstStep, + const prim_size_t *roi) /* region of interest */ +{ + __m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096; + __m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf; + int srcbump, dstbump, yp, imax; + + if (((ULONG_PTR) (pSrc[0]) & 0x0f) + || ((ULONG_PTR) (pSrc[1]) & 0x0f) + || ((ULONG_PTR) (pSrc[2]) & 0x0f) + || ((ULONG_PTR) (pDst[0]) & 0x0f) + || ((ULONG_PTR) (pDst[1]) & 0x0f) + || ((ULONG_PTR) (pDst[2]) & 0x0f) + || (roi->width & 0x07) + || (srcStep & 127) + || (dstStep & 127)) + { + /* We can't maintain 16-byte alignment. */ + return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, + pDst, dstStep, roi); + } + + zero = _mm_setzero_si128(); + max = _mm_set1_epi16(255); + + y_buf = (__m128i*) (pSrc[0]); + cb_buf = (__m128i*) (pSrc[1]); + cr_buf = (__m128i*) (pSrc[2]); + r_buf = (__m128i*) (pDst[0]); + g_buf = (__m128i*) (pDst[1]); + b_buf = (__m128i*) (pDst[2]); + + r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ + g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ + g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ + b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ + c4096 = _mm_set1_epi16(4096); + srcbump = srcStep / sizeof(__m128i); + dstbump = dstStep / sizeof(__m128i); + +#ifdef DO_PREFETCH + /* Prefetch Y's, Cb's, and Cr's. */ + for (yp=0; ypheight; yp++) + { + int i; + for (i=0; iwidth * sizeof(INT16) / sizeof(__m128i); + i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA); + } + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + } + y_buf = (__m128i*) (pSrc[0]); + cb_buf = (__m128i*) (pSrc[1]); + cr_buf = (__m128i*) (pSrc[2]); +#endif /* DO_PREFETCH */ + + imax = roi->width * sizeof(INT16) / sizeof(__m128i); + for (yp=0; ypheight; ++yp) + { + int i; + for (i=0; i>5) + 128 + (cr*1.403)>>5 // our base formula + * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + + /* y = (y_r_buf[i] + 4096) >> 2 */ + __m128i y, cb, cr, r, g, b; + y = _mm_load_si128(y_buf + i); + y = _mm_add_epi16(y, c4096); + y = _mm_srai_epi16(y, 2); + /* cb = cb_g_buf[i]; */ + cb = _mm_load_si128(cb_buf + i); + /* cr = cr_b_buf[i]; */ + cr = _mm_load_si128(cr_buf + i); + + /* (y + HIWORD(cr*22986)) >> 3 */ + r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); + r = _mm_srai_epi16(r, 3); + + /* r_buf[i] = MINMAX(r, 0, 255); */ + _mm_between_epi16(r, zero, max); + _mm_store_si128(r_buf + i, r); + + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); + g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); + g = _mm_srai_epi16(g, 3); + + /* g_buf[i] = MINMAX(g, 0, 255); */ + _mm_between_epi16(g, zero, max); + _mm_store_si128(g_buf + i, g); + + /* (y + HIWORD(cb*28999)) >> 3 */ + b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); + b = _mm_srai_epi16(b, 3); + /* b_buf[i] = MINMAX(b, 0, 255); */ + _mm_between_epi16(b, zero, max); + _mm_store_si128(b_buf + i, b); + } + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point + * numbers. See the general code above. + */ +PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3( + const INT16 *pSrc[3], + int srcStep, + INT16 *pDst[3], + int dstStep, + const prim_size_t *roi) /* region of interest */ +{ + __m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b; + __m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf; + int srcbump, dstbump, yp, imax; + + if (((ULONG_PTR) (pSrc[0]) & 0x0f) + || ((ULONG_PTR) (pSrc[1]) & 0x0f) + || ((ULONG_PTR) (pSrc[2]) & 0x0f) + || ((ULONG_PTR) (pDst[0]) & 0x0f) + || ((ULONG_PTR) (pDst[1]) & 0x0f) + || ((ULONG_PTR) (pDst[2]) & 0x0f) + || (roi->width & 0x07) + || (srcStep & 127) + || (dstStep & 127)) + { + /* We can't maintain 16-byte alignment. */ + return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, + pDst, dstStep, roi); + } + + min = _mm_set1_epi16(-128 << 5); + max = _mm_set1_epi16(127 << 5); + + r_buf = (__m128i*) (pSrc[0]); + g_buf = (__m128i*) (pSrc[1]); + b_buf = (__m128i*) (pSrc[2]); + y_buf = (__m128i*) (pDst[0]); + cb_buf = (__m128i*) (pDst[1]); + cr_buf = (__m128i*) (pDst[2]); + + y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */ + y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */ + y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */ + cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */ + cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */ + cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */ + cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */ + cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */ + cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */ + + srcbump = srcStep / sizeof(__m128i); + dstbump = dstStep / sizeof(__m128i); + +#ifdef DO_PREFETCH + /* Prefetch RGB's. */ + for (yp=0; ypheight; yp++) + { + int i; + for (i=0; iwidth * sizeof(INT16) / sizeof(__m128i); + i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA); + } + r_buf += srcbump; + g_buf += srcbump; + b_buf += srcbump; + } + r_buf = (__m128i*) (pSrc[0]); + g_buf = (__m128i*) (pSrc[1]); + b_buf = (__m128i*) (pSrc[2]); +#endif /* DO_PREFETCH */ + + imax = roi->width * sizeof(INT16) / sizeof(__m128i); + for (yp=0; ypheight; ++yp) + { + int i; + for (i=0; iwidth & 0x0f) + || (srcStep & 0x0f) + || (dstStep & 0x0f)) + { + return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi); + } + + out = (BYTE *) pDst; + srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + dstbump = (dstStep - (roi->width * sizeof(UINT32))); + + for (y=0; yheight; ++y) + { + int width = roi->width; + do { + __m128i R0, R1, R2, R3, R4; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. + */ + R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */ + R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */ + PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */ + R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */ + R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */ + PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */ + R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ + PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */ + PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */ + R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */ + R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */ + PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */ + R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */ + R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ + PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */ + PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */ + R0 = R4; /* R0 = R4 */ + PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */ + PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */ + R2 = R3; /* R2 = R3 */ + PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */ + PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */ + STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */ + STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */ + STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */ + STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */ + } while (width -= 16); + /* Jump to next row. */ + r += srcbump; + g += srcbump; + b += srcbump; + out += dstbump; + } + return PRIMITIVES_SUCCESS; +} +#endif /* WITH_SSE2 */ + +/*---------------------------------------------------------------------------*/ +#ifdef WITH_NEON +PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3( + const INT16 *pSrc[3], + int srcStep, + INT16 *pDst[3], + int dstStep, + const prim_size_t *roi) /* region of interest */ +{ + /* TODO: If necessary, check alignments and call the general version. */ + + int16x8_t zero = vdupq_n_s16(0); + int16x8_t max = vdupq_n_s16(255); + int16x8_t y_add = vdupq_n_s16(128); + + int16x8_t* y_buf = (int16x8_t*) pSrc[0]; + int16x8_t* cb_buf = (int16x8_t*) pSrc[1]; + int16x8_t* cr_buf = (int16x8_t*) pSrc[2]; + int16x8_t* r_buf = (int16x8_t*) pDst[0]; + int16x8_t* g_buf = (int16x8_t*) pDst[1]; + int16x8_t* b_buf = (int16x8_t*) pDst[2]; + + int srcbump = srcStep / sizeof(int16x8_t); + int dstbump = dstStep / sizeof(int16x8_t); + int yp; + + int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t); + for (yp=0; ypheight; ++yp) + { + int i; + for (i=0; i> 2) + (cr >> 3) + (cr >> 5)), + * 0, 255); + */ + int16x8_t r = vaddq_s16(y, cr); + r = vaddq_s16(r, vshrq_n_s16(cr, 2)); + r = vaddq_s16(r, vshrq_n_s16(cr, 3)); + r = vaddq_s16(r, vshrq_n_s16(cr, 5)); + r = vminq_s16(vmaxq_s16(r, zero), max); + vst1q_s16((INT16*) (r_buf+i), r); + + /* cb = cb_g_buf[i]; */ + int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i)); + + /* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) + * - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); + */ + int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2)); + g = vsubq_s16(g, vshrq_n_s16(cb, 4)); + g = vsubq_s16(g, vshrq_n_s16(cb, 5)); + g = vsubq_s16(g, vshrq_n_s16(cr, 1)); + g = vsubq_s16(g, vshrq_n_s16(cr, 3)); + g = vsubq_s16(g, vshrq_n_s16(cr, 4)); + g = vsubq_s16(g, vshrq_n_s16(cr, 5)); + g = vminq_s16(vmaxq_s16(g, zero), max); + vst1q_s16((INT16*) (g_buf+i), g); + + /* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), + * 0, 255); + */ + int16x8_t b = vaddq_s16(y, cb); + b = vaddq_s16(b, vshrq_n_s16(cb, 1)); + b = vaddq_s16(b, vshrq_n_s16(cb, 2)); + b = vaddq_s16(b, vshrq_n_s16(cb, 6)); + b = vminq_s16(vmaxq_s16(b, zero), max); + vst1q_s16((INT16*) (b_buf+i), b); + } + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } +} +#endif /* WITH_NEON */ + + +/* I don't see a direct IPP version of this, since the input is INT16 + * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_. + * But that would likely be slower. + */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_colors( + const primitives_hints_t *hints, + primitives_t *prims) +{ + prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3; + prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3; +#if defined(WITH_SSE2) + if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + { + prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3; + prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3; + } +#elif defined(WITH_NEON) + if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE) + { + prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3; + } +#endif /* WITH_SSE2 */ +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_colors( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_copy.c b/libfreerdp/primitives/prim_copy.c new file mode 100644 index 000000000..826852597 --- /dev/null +++ b/libfreerdp/primitives/prim_copy.c @@ -0,0 +1,177 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Copy operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include +#include +#include +#include +#ifdef WITH_IPP +# include +# include +#endif /* WITH_IPP */ +#include "prim_internal.h" + +/* ------------------------------------------------------------------------- */ +/*static inline BOOL memory_regions_overlap_1d(*/ +static BOOL memory_regions_overlap_1d( + const BYTE *p1, + const BYTE *p2, + size_t bytes) +{ + const ULONG_PTR p1m = (const ULONG_PTR) p1; + const ULONG_PTR p2m = (const ULONG_PTR) p2; + if (p1m <= p2m) + { + if (p1m + bytes > p2m) return TRUE; + } + else + { + if (p2m + bytes > p1m) return TRUE; + } + /* else */ + return FALSE; +} + +/* ------------------------------------------------------------------------- */ +/*static inline BOOL memory_regions_overlap_2d( */ +static BOOL memory_regions_overlap_2d( + const BYTE *p1, int p1Step, int p1Size, + const BYTE *p2, int p2Step, int p2Size, + int width, int height) +{ + ULONG_PTR p1m = (ULONG_PTR) p1; + ULONG_PTR p2m = (ULONG_PTR) p2; + + if (p1m <= p2m) + { + ULONG_PTR p1mEnd = p1m + (height-1)*p1Step + width*p1Size; + if (p1mEnd > p2m) return TRUE; + } + else + { + ULONG_PTR p2mEnd = p2m + (height-1)*p2Step + width*p2Size; + if (p2mEnd > p1m) return TRUE; + } + /* else */ + return FALSE; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_copy_8u( + const BYTE *pSrc, + BYTE *pDst, + INT32 len) +{ + if (memory_regions_overlap_1d(pSrc, pDst, (size_t) len)) + { + memmove((void *) pDst, (const void *) pSrc, (size_t) len); + } + else + { + memcpy((void *) pDst, (const void *) pSrc, (size_t) len); + } + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +/* Copy a block of pixels from one buffer to another. + * The addresses are assumed to have been already offset to the upper-left + * corners of the source and destination region of interest. + */ +PRIM_STATIC pstatus_t general_copy_8u_AC4r( + const BYTE *pSrc, INT32 srcStep, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + primitives_t *prims = primitives_get(); + const BYTE *src = (const BYTE *) pSrc; + BYTE *dst = (BYTE *) pDst; + int rowbytes = width * sizeof(UINT32); + + if ((width == 0) || (height == 0)) return PRIMITIVES_SUCCESS; + + if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32), + pDst, dstStep, sizeof(UINT32), width, height)) + { + do { + prims->copy(src, dst, rowbytes); + src += srcStep; + dst += dstStep; + } while (--height); + } + else + { + /* TODO: do it in one operation when the rowdata is adjacent. */ + do { + /* If we find a replacement for memcpy that is consistently + * faster, this could be replaced with that. + */ + memcpy(dst, src, rowbytes); + src += srcStep; + dst += dstStep; + } while (--height); + } + + return PRIMITIVES_SUCCESS; +} + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +/* This is just ippiCopy_8u_AC4R without the IppiSize structure parameter. */ +static pstatus_t ippiCopy_8u_AC4r( + const BYTE *pSrc, INT32 srcStep, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + IppiSize roi; + roi.width = width; + roi.height = height; + return (pstatus_t) ippiCopy_8u_AC4R(pSrc, srcStep, pDst, dstStep, roi); +} +#endif /* WITH_IPP */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_copy( + const primitives_hints_t *hints, + primitives_t *prims) +{ + /* Start with the default. */ + prims->copy_8u = general_copy_8u; + prims->copy_8u_AC4r = general_copy_8u_AC4r; + + /* Pick tuned versions if possible. */ +#ifdef WITH_IPP + prims->copy_8u = (__copy_8u_t) ippsCopy_8u; + prims->copy_8u_AC4r = (__copy_8u_AC4r_t) ippiCopy_8u_AC4r; +#endif + /* Performance with an SSE2 version with no prefetch seemed to be + * all over the map vs. memcpy. + * Sometimes it was significantly faster, sometimes dreadfully slower, + * and it seemed to vary a lot depending on block size and processor. + * Hence, no SSE version is used here unless once can be written that + * is consistently faster than memcpy. + */ + + /* This is just an alias with void* parameters */ + prims->copy = (__copy_t) (prims->copy_8u); +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_copy( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h new file mode 100644 index 000000000..21df8cae0 --- /dev/null +++ b/libfreerdp/primitives/prim_internal.h @@ -0,0 +1,105 @@ +/* prim_internal.h + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_INTERNAL_H_INCLUDED__ +#define __PRIM_INTERNAL_H_INCLUDED__ + +#ifndef CMAKE_BUILD_TYPE +#define CMAKE_BUILD_TYPE Release +#endif + +#include + +/* Normally the internal entrypoints should be static, but a benchmark + * program may want to access them directly and turn this off. + */ +#ifndef PRIM_STATIC +# define PRIM_STATIC static +#else +# undef PRIM_STATIC +# define PRIM_STATIC +#endif /* !PRIM_STATIC */ + +/* Use lddqu for unaligned; load for 16-byte aligned. */ +#define LOAD_SI128(_ptr_) \ + (((ULONG_PTR) (_ptr_) & 0x0f) \ + ? _mm_lddqu_si128((__m128i *) (_ptr_)) \ + : _mm_load_si128((__m128i *) (_ptr_))) + +/* This structure can (eventually) be used to provide hints to the + * initialization routines, e.g. whether SSE2 or NEON or IPP instructions + * or calls are available. + */ +typedef struct +{ + UINT32 x86_flags; + UINT32 arm_flags; +} primitives_hints_t; + +/* Function prototypes for all the init/deinit routines. */ +extern void primitives_init_copy( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_copy( + primitives_t *prims); + +extern void primitives_init_set( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_set( + primitives_t *prims); + +extern void primitives_init_add( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_add( + primitives_t *prims); + +extern void primitives_init_andor( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_andor( + primitives_t *prims); + +extern void primitives_init_shift( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_shift( + primitives_t *prims); + +extern void primitives_init_sign( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_sign( + primitives_t *prims); + +extern void primitives_init_alphaComp( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_alphaComp( + primitives_t *prims); + +extern void primitives_init_colors( + const primitives_hints_t *hints, + primitives_t *prims); +extern void primitives_deinit_colors( + primitives_t *prims); + +#endif /* !__PRIM_INTERNAL_H_INCLUDED__ */ diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c new file mode 100644 index 000000000..650a32eff --- /dev/null +++ b/libfreerdp/primitives/prim_set.c @@ -0,0 +1,309 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Routines to set a chunk of memory to a constant. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +#include +#include +#include +#include +#ifdef WITH_SSE2 +# include +#endif /* WITH_SSE2 */ +#ifdef WITH_IPP +# include +#endif /* WITH_IPP */ +#include "prim_internal.h" + +/* ========================================================================= */ +PRIM_STATIC pstatus_t general_set_8u( + BYTE val, + BYTE *pDst, + INT32 len) +{ + memset((void *) pDst, (int) val, (size_t) len); + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_zero( + void *pDst, + size_t len) +{ + memset(pDst, 0, len); + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +PRIM_STATIC pstatus_t sse2_set_8u( + BYTE val, + BYTE *pDst, + INT32 len) +{ + BYTE byte, *dptr; + __m128i xmm0; + size_t count; + + if (len < 16) return general_set_8u(val, pDst, len); + + byte = val; + dptr = (BYTE *) pDst; + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR) dptr & 0x0f) + { + *dptr++ = byte; + if (--len == 0) return PRIMITIVES_SUCCESS; + } + + xmm0 = _mm_set1_epi8(byte); + + /* Cover 256-byte chunks via SSE register stores. */ + count = len >> 8; + len -= count << 8; + /* Do 256-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + } + + /* Cover 16-byte chunks via SSE register stores. */ + count = len >> 4; + len -= count << 4; + /* Do 16-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + } + + /* Do leftover bytes. */ + while (len--) *dptr++ = byte; + + return PRIMITIVES_SUCCESS; +} +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif /* WITH_SSE2 */ + +/* ========================================================================= */ +PRIM_STATIC pstatus_t general_set_32s( + INT32 val, + INT32 *pDst, + INT32 len) +{ + INT32 *dptr = (INT32 *) pDst; + size_t span, remaining; + primitives_t *prims; + + if (len < 256) + { + while (len--) *dptr++ = val; + return PRIMITIVES_SUCCESS; + } + + /* else quadratic growth memcpy algorithm */ + span = 1; + *dptr = val; + remaining = len - 1; + prims = primitives_get(); + while (remaining) + { + size_t thiswidth = span; + if (thiswidth > remaining) thiswidth = remaining; + prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2); + remaining -= thiswidth; + span <<= 1; + } + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_set_32u( + UINT32 val, + UINT32 *pDst, + INT32 len) +{ + UINT32 *dptr = (UINT32 *) pDst; + size_t span, remaining; + primitives_t *prims; + + if (len < 256) + { + while (len--) *dptr++ = val; + return PRIMITIVES_SUCCESS; + } + + /* else quadratic growth memcpy algorithm */ + span = 1; + *dptr = val; + remaining = len - 1; + prims = primitives_get(); + while (remaining) + { + size_t thiswidth = span; + if (thiswidth > remaining) thiswidth = remaining; + prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2); + remaining -= thiswidth; + span <<= 1; + } + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +PRIM_STATIC pstatus_t sse2_set_32u( + UINT32 val, + UINT32 *pDst, + INT32 len) +{ + UINT32 *dptr = (UINT32 *) pDst; + __m128i xmm0; + size_t count; + + /* If really short, just do it here. */ + if (len < 32) + { + while (len--) *dptr++ = val; + return PRIMITIVES_SUCCESS; + } + + /* Assure we can reach 16-byte alignment. */ + if (((ULONG_PTR) dptr & 0x03) != 0) + { + return general_set_32u(val, pDst, len); + } + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR) dptr & 0x0f) + { + *dptr++ = val; + if (--len == 0) return PRIMITIVES_SUCCESS; + } + + xmm0 = _mm_set1_epi32(val); + + /* Cover 256-byte chunks via SSE register stores. */ + count = len >> 6; + len -= count << 6; + /* Do 256-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + } + + /* Cover 16-byte chunks via SSE register stores. */ + count = len >> 2; + len -= count << 2; + /* Do 16-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + } + + /* Do leftover bytes. */ + while (len--) *dptr++ = val; + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t sse2_set_32s( + INT32 val, + INT32 *pDst, + INT32 len) +{ + UINT32 uval = *((UINT32 *) &val); + return sse2_set_32u(uval, (UINT32 *) pDst, len); +} +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t ipp_wrapper_set_32u( + UINT32 val, + UINT32 *pDst, + INT32 len) +{ + /* A little type conversion, then use the signed version. */ + INT32 sval = *((INT32 *) &val); + return ippsSet_32s(sval, (INT32 *) pDst, len); +} +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_set( + const primitives_hints_t *hints, + primitives_t *prims) +{ + /* Start with the default. */ + prims->set_8u = general_set_8u; + prims->set_32s = general_set_32s; + prims->set_32u = general_set_32u; + prims->zero = general_zero; + + /* Pick tuned versions if possible. */ +#ifdef WITH_IPP + prims->set_8u = (__set_8u_t) ippsSet_8u; + prims->set_32s = (__set_32s_t) ippsSet_32s; + prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u; + prims->zero = (__zero_t) ippsZero_8u; +#elif defined(WITH_SSE2) + if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + { + prims->set_8u = sse2_set_8u; + prims->set_32s = sse2_set_32s; + prims->set_32u = sse2_set_32u; + } +#endif +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_set( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c new file mode 100644 index 000000000..327eed073 --- /dev/null +++ b/libfreerdp/primitives/prim_shift.c @@ -0,0 +1,165 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Shift operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include +#include +#include +#include +#ifdef WITH_SSE2 +# include +# include +#endif /* WITH_SSE2 */ +#ifdef WITH_IPP +# include +#endif /* WITH_IPP */ +#include "prim_internal.h" +#include "prim_templates.h" + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_lShiftC_16s( + const INT16 *pSrc, + INT32 val, + INT16 *pDst, + INT32 len) +{ + if (val == 0) return PRIMITIVES_SUCCESS; + while (len--) *pDst++ = *pSrc++ << val; + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_rShiftC_16s( + const INT16 *pSrc, + INT32 val, + INT16 *pDst, + INT32 len) +{ + if (val == 0) return PRIMITIVES_SUCCESS; + while (len--) *pDst++ = *pSrc++ >> val; + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_lShiftC_16u( + const UINT16 *pSrc, + INT32 val, + UINT16 *pDst, + INT32 len) +{ + if (val == 0) return PRIMITIVES_SUCCESS; + while (len--) *pDst++ = *pSrc++ << val; + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_rShiftC_16u( + const UINT16 *pSrc, + INT32 val, + UINT16 *pDst, + INT32 len) +{ + if (val == 0) return PRIMITIVES_SUCCESS; + while (len--) *pDst++ = *pSrc++ >> val; + return PRIMITIVES_SUCCESS; +} + +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s, + _mm_slli_epi16, *dptr++ = *sptr++ << val) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s, + _mm_srai_epi16, *dptr++ = *sptr++ >> val) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u, + _mm_slli_epi16, *dptr++ = *sptr++ << val) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u, + _mm_srli_epi16, *dptr++ = *sptr++ >> val) +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_shiftC_16s( + const INT16 *pSrc, + INT32 val, + INT16 *pDst, + INT32 len) +{ + primitives_t *prims; + + if (val == 0) return PRIMITIVES_SUCCESS; + prims = primitives_get(); + if (val < 0) return prims->rShiftC_16s(pSrc, -val, pDst, len); + else return prims->lShiftC_16s(pSrc, val, pDst, len); +} + +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t general_shiftC_16u( + const UINT16 *pSrc, + INT32 val, + UINT16 *pDst, + INT32 len) +{ + primitives_t *prims; + + if (val == 0) return PRIMITIVES_SUCCESS; + prims = primitives_get(); + if (val < 0) return prims->rShiftC_16u(pSrc, -val, pDst, len); + else return prims->lShiftC_16u(pSrc, val, pDst, len); +} + +/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s + * depending on the sign of val. To avoid using the deprecated inplace + * routines, a wrapper can use the src for the dest. + */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_shift( + const primitives_hints_t *hints, + primitives_t *prims) +{ + /* Start with the default. */ + prims->lShiftC_16s = general_lShiftC_16s; + prims->rShiftC_16s = general_rShiftC_16s; + prims->lShiftC_16u = general_lShiftC_16u; + prims->rShiftC_16u = general_rShiftC_16u; +#if defined(WITH_IPP) + prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s; + prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s; + prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u; + prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) + { + prims->lShiftC_16s = sse2_lShiftC_16s; + prims->rShiftC_16s = sse2_rShiftC_16s; + prims->lShiftC_16u = sse2_lShiftC_16u; + prims->rShiftC_16u = sse2_rShiftC_16u; + } +#endif + /* Wrappers */ + prims->shiftC_16s = general_shiftC_16s; + prims->shiftC_16u = general_shiftC_16u; +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_shift( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c new file mode 100644 index 000000000..c669e938a --- /dev/null +++ b/libfreerdp/primitives/prim_sign.c @@ -0,0 +1,170 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Sign operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include +#include +#include +#include +#ifdef WITH_SSE2 +# include +# include +#endif /* WITH_SSE2 */ +#include "prim_internal.h" + +/* ---------------------------------------------------------------------------- + * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1). + */ +PRIM_STATIC pstatus_t general_sign_16s( + const INT16 *pSrc, + INT16 *pDst, + INT32 len) +{ + while (len--) + { + INT16 src = *pSrc++; + *pDst++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); + } + + return PRIMITIVES_SUCCESS; +} + +#ifdef WITH_SSE2 +/* ------------------------------------------------------------------------- */ +PRIM_STATIC pstatus_t ssse3_sign_16s( + const INT16 *pSrc, + INT16 *pDst, + INT32 len) +{ + const INT16 *sptr = (const INT16 *) pSrc; + INT16 *dptr = (INT16 *) pDst; + size_t count; + + if (len < 16) + { + return general_sign_16s(pSrc, pDst, len); + } + + /* Check for 16-byte alignment (eventually). */ + if ((ULONG_PTR) pDst & 0x01) + { + return general_sign_16s(pSrc, pDst, len); + } + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR) dptr & 0x0f) + { + INT16 src = *sptr++; + *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); + if (--len == 0) return PRIMITIVES_SUCCESS; + } + + /* Do 32-short chunks using 8 XMM registers. */ + count = len >> 5; /* / 32 */ + len -= count << 5; /* * 32 */ + if ((ULONG_PTR) sptr & 0x0f) + { + /* Unaligned */ + while (count--) + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + xmm0 = _mm_set1_epi16(0x0001U); + xmm1 = _mm_set1_epi16(0x0001U); + xmm2 = _mm_set1_epi16(0x0001U); + xmm3 = _mm_set1_epi16(0x0001U); + xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm4); + xmm1 = _mm_sign_epi16(xmm1, xmm5); + xmm2 = _mm_sign_epi16(xmm2, xmm6); + xmm3 = _mm_sign_epi16(xmm3, xmm7); + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; + } + } + else + { + /* Aligned */ + while (count--) + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + xmm0 = _mm_set1_epi16(0x0001U); + xmm1 = _mm_set1_epi16(0x0001U); + xmm2 = _mm_set1_epi16(0x0001U); + xmm3 = _mm_set1_epi16(0x0001U); + xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm4); + xmm1 = _mm_sign_epi16(xmm1, xmm5); + xmm2 = _mm_sign_epi16(xmm2, xmm6); + xmm3 = _mm_sign_epi16(xmm3, xmm7); + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; + } + } + + /* Do 8-short chunks using two XMM registers. */ + count = len >> 3; + len -= count << 3; + while (count--) + { + __m128i xmm0 = _mm_set1_epi16(0x0001U); + __m128i xmm1 = LOAD_SI128(sptr); sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm1); + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; + } + + /* Do leftovers. */ + while (len--) + { + INT16 src = *sptr++; + *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0); + } + + return PRIMITIVES_SUCCESS; +} +#endif /* WITH_SSE2 */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_sign( + const primitives_hints_t *hints, + primitives_t *prims) +{ + /* Start with the default. */ + prims->sign_16s = general_sign_16s; + /* Pick tuned versions if possible. */ + /* I didn't spot an IPP version of this. */ +#if defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) + { + prims->sign_16s = ssse3_sign_16s; + } +#endif +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit_sign( + primitives_t *prims) +{ + /* Nothing to do. */ +} diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h new file mode 100644 index 000000000..c0b6ac10d --- /dev/null +++ b/libfreerdp/primitives/prim_templates.h @@ -0,0 +1,416 @@ +/* prim_templates.h + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_TEMPLATES_H_INCLUDED__ +#define __PRIM_TEMPLATES_H_INCLUDED__ + +/* These are prototypes for SSE (potentially NEON) routines that do a + * simple SSE operation over an array of data. Since so much of this + * code is shared except for the operation itself, these prototypes are + * used rather than duplicating code. The naming convention depends on + * the parameters: S=Source param; C=Constant; D=Destination. + * All the macros have parameters for a fallback procedure if the data + * is too small and an operation "the slow way" for use at 16-byte edges. + */ + +/* SSE3 note: If someone needs to support an SSE2 version of these without + * SSE3 support, an alternative version could be added that merely checks + * that 16-byte alignment on both destination and source(s) can be + * achieved, rather than use LDDQU for unaligned reads. + */ + +/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant. + * It easily can't do that if the value is stored in a variable. + * So don't save it as an intermediate value. + */ + +/* ---------------------------------------------------------------------------- + * SCD = Source, Constant, Destination + */ +#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ +PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \ +{ \ + int shifts; \ + UINT32 offBeatMask; \ + const _type_ *sptr = pSrc; \ + _type_ *dptr = pDst; \ + size_t count; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + if (sizeof(_type_) == 1) shifts = 1; \ + else if (sizeof(_type_) == 2) shifts = 2; \ + else if (sizeof(_type_) == 4) shifts = 3; \ + else if (sizeof(_type_) == 8) shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR) pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR) dptr & 0x0f) \ + { \ + _slowWay_; \ + if (--len == 0) return PRIMITIVES_SUCCESS; \ + } \ + /* Use 8 128-bit SSE registers. */ \ + count = len >> (8-shifts); \ + len -= count << (8-shifts); \ + if ((ULONG_PTR) sptr & 0x0f) \ + { \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm5 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm6 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm7 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + xmm1 = _op_(xmm1, val); \ + xmm2 = _op_(xmm2, val); \ + xmm3 = _op_(xmm3, val); \ + xmm4 = _op_(xmm4, val); \ + xmm5 = _op_(xmm5, val); \ + xmm6 = _op_(xmm6, val); \ + xmm7 = _op_(xmm7, val); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm5); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm6); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm7); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + else \ + { \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm5 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm6 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm7 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + xmm1 = _op_(xmm1, val); \ + xmm2 = _op_(xmm2, val); \ + xmm3 = _op_(xmm3, val); \ + xmm4 = _op_(xmm4, val); \ + xmm5 = _op_(xmm5, val); \ + xmm6 = _op_(xmm6, val); \ + xmm7 = _op_(xmm7, val); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm5); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm6); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm7); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. */ \ + count = len >> (5-shifts); \ + len -= count << (5-shifts); \ + while (count--) \ + { \ + __m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) { _slowWay_; } \ + return PRIMITIVES_SUCCESS; \ +} + +/* ---------------------------------------------------------------------------- + * SCD = Source, Constant, Destination + * PRE = preload xmm0 with the constant. + */ +#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ +PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \ +{ \ + int shifts; \ + UINT32 offBeatMask; \ + const _type_ *sptr = pSrc; \ + _type_ *dptr = pDst; \ + size_t count; \ + __m128i xmm0; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + if (sizeof(_type_) == 1) shifts = 1; \ + else if (sizeof(_type_) == 2) shifts = 2; \ + else if (sizeof(_type_) == 4) shifts = 3; \ + else if (sizeof(_type_) == 8) shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR) pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR) dptr & 0x0f) \ + { \ + _slowWay_; \ + if (--len == 0) return PRIMITIVES_SUCCESS; \ + } \ + /* Use 4 128-bit SSE registers. */ \ + count = len >> (7-shifts); \ + len -= count << (7-shifts); \ + xmm0 = _mm_set1_epi32(val); \ + if ((ULONG_PTR) sptr & 0x0f) \ + { \ + while (count--) \ + { \ + __m128i xmm1, xmm2, xmm3, xmm4; \ + xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + xmm2 = _op_(xmm2, xmm0); \ + xmm3 = _op_(xmm3, xmm0); \ + xmm4 = _op_(xmm4, xmm0); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + else \ + { \ + while (count--) \ + { \ + __m128i xmm1, xmm2, xmm3, xmm4; \ + xmm1 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + xmm2 = _op_(xmm2, xmm0); \ + xmm3 = _op_(xmm3, xmm0); \ + xmm4 = _op_(xmm4, xmm0); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. */ \ + count = len >> (5-shifts); \ + len -= count << (5-shifts); \ + while (count--) \ + { \ + __m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) { _slowWay_; } \ + return PRIMITIVES_SUCCESS; \ +} + +/* ---------------------------------------------------------------------------- + * SSD = Source1, Source2, Destination + */ +#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ +PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \ +{ \ + int shifts; \ + UINT32 offBeatMask; \ + const _type_ *sptr1 = pSrc1; \ + const _type_ *sptr2 = pSrc2; \ + _type_ *dptr = pDst; \ + size_t count; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc1, pSrc2, pDst, len); \ + } \ + if (sizeof(_type_) == 1) shifts = 1; \ + else if (sizeof(_type_) == 2) shifts = 2; \ + else if (sizeof(_type_) == 4) shifts = 3; \ + else if (sizeof(_type_) == 8) shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR) pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc1, pSrc2, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR) dptr & 0x0f) \ + { \ + _slowWay_; \ + if (--len == 0) return PRIMITIVES_SUCCESS; \ + } \ + /* Use 4 128-bit SSE registers. */ \ + count = len >> (7-shifts); \ + len -= count << (7-shifts); \ + if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \ + { \ + /* Unaligned loads */ \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm4); \ + xmm1 = _op_(xmm1, xmm5); \ + xmm2 = _op_(xmm2, xmm6); \ + xmm3 = _op_(xmm3, xmm7); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + else \ + { \ + /* Aligned loads */ \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm1 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm2 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm3 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm4 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm5 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm6 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm7 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm4); \ + xmm1 = _op_(xmm1, xmm5); \ + xmm2 = _op_(xmm2, xmm6); \ + xmm3 = _op_(xmm3, xmm7); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. */ \ + count = len >> (5-shifts); \ + len -= count << (5-shifts); \ + while (count--) \ + { \ + __m128i xmm0, xmm1; \ + xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \ + xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm1); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) { _slowWay_; } \ + return PRIMITIVES_SUCCESS; \ +} + +#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */ diff --git a/libfreerdp/primitives/primitives.c b/libfreerdp/primitives/primitives.c new file mode 100644 index 000000000..a7213b4a4 --- /dev/null +++ b/libfreerdp/primitives/primitives.c @@ -0,0 +1,297 @@ +/* primitives.c + * This code queries processor features and calls the init/deinit routines. + * vi:ts=4 sw=4 + * + * Copyright 2011 Martin Fleisz + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include +#include +#include +#include "prim_internal.h" +#ifdef ANDROID +# include "cpu-features.h" +#endif + +/* Singleton pointer used throughout the program when requested. */ +static primitives_t *pPrimitives = NULL; + +#define D_BIT_MMX (1<<23) +#define D_BIT_SSE (1<<25) +#define D_BIT_SSE2 (1<<26) +#define D_BIT_3DN (1<<30) +#define C_BIT_SSE3 (1<<0) +#define C_BIT_3DNP (1<<8) +#define C_BIT_SSSE3 (1<<9) +#define C_BIT_SSE41 (1<<19) +#define C_BIT_SSE42 (1<<20) +#define C_BIT_XGETBV (1<<27) +#define C_BIT_AVX (1<<28) +#define C_BITS_AVX (C_BIT_XGETBV|C_BIT_AVX) +#define E_BIT_XMM (1<<1) +#define E_BIT_YMM (1<<2) +#define E_BITS_AVX (E_BIT_XMM|E_BIT_YMM) +#define C_BIT_FMA (1<<11) +#define C_BIT_AVX_AES (1<<24) + +/* If x86 */ +#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) \ + || defined(__amd64__) || defined(_M_AMD64) || defined(_M_X64) \ + || defined(i386) || defined(__i386) || defined(__i386__) \ + || defined(_M_IX86) || defined(_X86_) +#ifndef i386 +#define i386 +#endif + +/* If GCC */ +# ifdef __GNUC__ +# define xgetbv(_func_, _lo_, _hi_) \ + __asm__ __volatile__ ("xgetbv" : "=a" (_lo_), "=d" (_hi_) : "c" (_func_)) + +static void cpuid( + unsigned info, + unsigned *eax, + unsigned *ebx, + unsigned *ecx, + unsigned *edx) +{ + *eax = *ebx = *ecx = *edx = 0; + __asm volatile + ( + /* The EBX (or RBX register on x86_64) is used for the PIC base address + * and must not be corrupted by our inline assembly. + */ +# if defined(__i386__) + "mov %%ebx, %%esi;" + "cpuid;" + "xchg %%ebx, %%esi;" +#else + "mov %%rbx, %%rsi;" + "cpuid;" + "xchg %%rbx, %%rsi;" +#endif + : "=a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx) + : "0" (info) + ); +} + +static void set_hints( + primitives_hints_t *hints) +{ + unsigned a, b, c, d; + cpuid(1, &a, &b, &c, &d); + if (d & D_BIT_MMX) hints->x86_flags |= PRIM_X86_MMX_AVAILABLE; + if (d & D_BIT_SSE) hints->x86_flags |= PRIM_X86_SSE_AVAILABLE; + if (d & D_BIT_SSE2) hints->x86_flags |= PRIM_X86_SSE2_AVAILABLE; + if (d & D_BIT_3DN) hints->x86_flags |= PRIM_X86_3DNOW_AVAILABLE; + if (c & C_BIT_3DNP) hints->x86_flags |= PRIM_X86_3DNOW_PREFETCH_AVAILABLE; + if (c & C_BIT_SSE3) hints->x86_flags |= PRIM_X86_SSE3_AVAILABLE; + if (c & C_BIT_SSSE3) hints->x86_flags |= PRIM_X86_SSSE3_AVAILABLE; + if (c & C_BIT_SSE41) hints->x86_flags |= PRIM_X86_SSE41_AVAILABLE; + if (c & C_BIT_SSE42) hints->x86_flags |= PRIM_X86_SSE42_AVAILABLE; + if ((c & C_BITS_AVX) == C_BITS_AVX) + { + int e, f; + xgetbv(0, e, f); + if ((e & E_BITS_AVX) == E_BITS_AVX) + { + hints->x86_flags |= PRIM_X86_AVX_AVAILABLE; + if (c & C_BIT_FMA) hints->x86_flags |= PRIM_X86_FMA_AVAILABLE; + if (c & C_BIT_AVX_AES) hints->x86_flags |= PRIM_X86_AVX_AES_AVAILABLE; + } + } + /* TODO: AVX2: set eax=7, ecx=0, cpuid, check ebx-bit5 */ +} +# else +static void set_hints( + primitives_hints_t *hints) +{ + /* x86 non-GCC: TODO */ +} +# endif /* __GNUC__ */ +/* ------------------------------------------------------------------------- */ +#elif defined(__arm__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_EABI__) || defined(__ARMEL__) || defined(ANDROID) +#ifndef __arm__ +#define __arm__ +#endif + +static UINT32 androidNeon(void) +{ +#if ANDROID + if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) return 0; + + UINT64 features = android_getCpuFeatures(); + + if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7)) + { + if (features & ANDROID_CPU_ARM_FEATURE_NEON) + { + return PRIM_ARM_NEON_AVAILABLE; + } + } + /* else */ +#endif + return 0; +} + +static void set_hints( + primitives_hints_t *hints) +{ + /* ARM: TODO */ + hints->arm_flags |= androidNeon(); +} + +#else +static void set_hints( + primitives_hints_t *hints) +{ +} +#endif /* x86 else ARM else */ + +/* ------------------------------------------------------------------------- */ +void primitives_init(void) +{ + primitives_hints_t *hints; + + if (pPrimitives == NULL) + { + pPrimitives = calloc(1, sizeof(primitives_t)); + if (pPrimitives == NULL) return; + } + + hints = calloc(1, sizeof(primitives_hints_t)); + set_hints(hints); + pPrimitives->hints = (void *) hints; + + /* Now call each section's initialization routine. */ + primitives_init_add(hints, pPrimitives); + primitives_init_andor(hints, pPrimitives); + primitives_init_alphaComp(hints, pPrimitives); + primitives_init_copy(hints, pPrimitives); + primitives_init_set(hints, pPrimitives); + primitives_init_shift(hints, pPrimitives); + primitives_init_sign(hints, pPrimitives); + primitives_init_colors(hints, pPrimitives); +} + +/* ------------------------------------------------------------------------- */ +primitives_t *primitives_get(void) +{ + if (pPrimitives == NULL) primitives_init(); + return pPrimitives; +} + +/* ------------------------------------------------------------------------- */ +UINT32 primitives_get_flags( + const primitives_t *prims) +{ + primitives_hints_t *hints = (primitives_hints_t *) (prims->hints); +#ifdef i386 + return hints->x86_flags; +#elif defined(__arm__) + return hints->arm_flags; +#else + return 0; +#endif +} + +/* ------------------------------------------------------------------------- */ +void primitives_flags_str( + const primitives_t *prims, + char *str, + size_t len) +{ + typedef struct + { + UINT32 flag; + const char *str; + } flagpair_t; + static const flagpair_t x86_flags[] = + { + { PRIM_X86_MMX_AVAILABLE, "MMX" }, + { PRIM_X86_3DNOW_AVAILABLE, "3DNow" }, + { PRIM_X86_3DNOW_PREFETCH_AVAILABLE, "3DNow-PF" }, + { PRIM_X86_SSE_AVAILABLE, "SSE" }, + { PRIM_X86_SSE2_AVAILABLE, "SSE2" }, + { PRIM_X86_SSE3_AVAILABLE, "SSE3" }, + { PRIM_X86_SSSE3_AVAILABLE, "SSSE3" }, + { PRIM_X86_SSE41_AVAILABLE, "SSE4.1" }, + { PRIM_X86_SSE42_AVAILABLE, "SSE4.2" }, + { PRIM_X86_AVX_AVAILABLE, "AVX" }, + { PRIM_X86_FMA_AVAILABLE, "FMA" }, + { PRIM_X86_AVX_AES_AVAILABLE, "AVX-AES" }, + { PRIM_X86_AVX2_AVAILABLE, "AVX2" }, + }; + static const flagpair_t arm_flags[] = + { + { PRIM_ARM_VFP1_AVAILABLE, "VFP1" }, + { PRIM_ARM_VFP2_AVAILABLE, "VFP2" }, + { PRIM_ARM_VFP3_AVAILABLE, "VFP3" }, + { PRIM_ARM_VFP4_AVAILABLE, "VFP4" }, + { PRIM_ARM_FPA_AVAILABLE, "FPA" }, + { PRIM_ARM_FPE_AVAILABLE, "FPE" }, + { PRIM_ARM_IWMMXT_AVAILABLE, "IWMMXT" }, + { PRIM_ARM_NEON_AVAILABLE, "NEON" }, + }; + int i; + primitives_hints_t *hints; + + *str = '\0'; + --len; /* for the '/0' */ + + hints = (primitives_hints_t *) (prims->hints); + for (i=0; ix86_flags & x86_flags[i].flag) + { + int slen = strlen(x86_flags[i].str) + 1; + if (len < slen) break; + if (*str != '\0') strcat(str, " "); + strcat(str, x86_flags[i].str); + len -= slen; + } + } + + for (i=0; iarm_flags & arm_flags[i].flag) + { + int slen = strlen(arm_flags[i].str) + 1; + if (len < slen) break; + if (*str != '\0') strcat(str, " "); + strcat(str, arm_flags[i].str); + len -= slen; + } + } +} + +/* ------------------------------------------------------------------------- */ +void primitives_deinit(void) +{ + if (pPrimitives == NULL) return; + + /* Call each section's de-initialization routine. */ + primitives_deinit_add(pPrimitives); + primitives_deinit_andor(pPrimitives); + primitives_deinit_alphaComp(pPrimitives); + primitives_deinit_copy(pPrimitives); + primitives_deinit_set(pPrimitives); + primitives_deinit_shift(pPrimitives); + primitives_deinit_sign(pPrimitives); + primitives_deinit_colors(pPrimitives); + + if (pPrimitives->hints != NULL) free((void *) (pPrimitives->hints)); + free((void *) pPrimitives); + pPrimitives = NULL; +} diff --git a/libfreerdp/primitives/test/CMakeLists.txt b/libfreerdp/primitives/test/CMakeLists.txt new file mode 100644 index 000000000..71d23b018 --- /dev/null +++ b/libfreerdp/primitives/test/CMakeLists.txt @@ -0,0 +1,139 @@ +# FreeRDP: A Remote Desktop Protocol Client +# primitives test makefile builder +# vi:ts=4 sw=4: +# +# (c) Copyright 2012 Hewlett-Packard Development Company, L.P. +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing permissions +# and limitations under the License. +# + +# TODO: Integrate this into the testing framework, in some form. +# Right now this produces a standalone test that covers both functionality +# and performance of the primitives library entrypoints. + +cmake_minimum_required(VERSION 2.8) +set(MODULE_NAME "prim_test") +set(MODULE_PREFIX "PRIMITIVES_LIBRARY_TEST") + +set(PRIMITIVE_TEST_CFILES + prim_test.c + test_add.c + test_alphaComp.c + test_andor.c + test_colors.c + test_copy.c + test_set.c + test_shift.c + test_sign.c + ../prim_add.c + ../prim_andor.c + ../prim_alphaComp.c + ../prim_colors.c + ../prim_copy.c + ../prim_set.c + ../prim_shift.c + ../prim_sign.c + ../primitives.c + ) + +set(PRIMITIVE_TEST_HEADERS + measure.h + prim_test.h + ../prim_internal.h + ) + +set(PRIMITIVE_TEST_SRCS + ${PRIMITIVE_TEST_CFILES} + ${PRIMITIVE_TEST_HEADERS} + ) + +include_directories(. ../../.. ../../../include ../../../winpr/include) +add_definitions(-DPRIM_STATIC=auto -DALL_PRIMITIVES_VERSIONS -DHAVE_CONFIG_H) + +# If these haven't been set by the caller, set them now to defaults. +if(NOT DEFINED WITH_IPP) + set(WITH_IPP FALSE) +endif() +if(NOT DEFINED WITH_SSE2) + if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*") + set(WITH_SSE2 FALSE) + else() + set(WITH_SSE2 TRUE) + endif() +endif() +if(NOT DEFINED WITH_NEON) + if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*") + set(WITH_NEON TRUE) + else() + set(WITH_NEON FALSE) + endif() +endif() + +if(WITH_SSE2) + if(CMAKE_COMPILER_IS_GNUCC) + set(OPTFLAGS "${OPTFLAGS} -msse2 -mssse3 -O2 -Wdeclaration-after-statement") + endif() + + if(MSVC) + set(OPTFLAGS "${OPTFLAGS} /arch:SSE2") + endif() +elseif(WITH_NEON) + if(CMAKE_COMPILER_IS_GNUCC) + set(OPTIMZATION "${OPTFLAGS} -mfpu=neon -mfloat-abi=softfp -O2") + endif() + # TODO: Add MSVC equivalent +endif() + +add_executable(prim_test ${PRIMITIVE_TEST_SRCS}) + +if(WITH_IPP) + if(NOT DEFINED IPP_FOUND) + include(../../../cmake/FindIPP.cmake) + endif() + add_definitions(-DWITH_IPP) + + # IPP PATH debugging messages + message(IPP_FOUND=${IPP_FOUND}) + message(IPP_VERSION_STR=${IPP_VERSION_STR}) + message(IPP_VERSION_MAJOR=${IPP_VERSION_MAJOR}) + message(IPP_VERSION_MINOR=${IPP_VERSION_MINOR}) + message(IPP_VERSION_BUILD=${IPP_VERSION_BUILD}) + message(IPP_ROOT_DIR=${IPP_ROOT_DIR}) + message(IPP_INCLUDE_DIRS=${IPP_INCLUDE_DIRS}) + message(IPP_LIBRARY_DIRS=${IPP_LIBRARY_DIRS}) + message(IPP_LIBRARIES=${IPP_LIBRARIES}) + message(IPP_COMPILER_LIBRARY_DIRS=${IPP_COMPILER_LIBRARY_DIRS}) + message(IPP_COMPILER_LIBRARIES=${IPP_COMPILER_LIBRARIES}) + message(IPP_LIBRARY_LIST=${IPP_LIBRARY_LIST}) + message(IPP_LIB_PREFIX=${IPP_LIB_PREFIX}) + message(IPP_LIB_SUFFIX=${IPP_LIB_SUFFIX}) + message(IPP_PREFIX=${IPP_PREFIX}) + message(IPP_SUFFIX=${IPP_SUFFIX}) + message(IPPCORE=${IPPCORE}) + message(IPPS=${IPPS}) + message(IPPI=${IPPI}) + message(IPPCC=${IPPCC}) + message(IPPCV=${IPPCV}) + message(IPPVM=${IPPVM}) + + if(CMAKE_COMPILER_IS_GNUCC) + foreach(INCLDIR ${IPP_INCLUDE_DIRS}) + set(OPTFLAGS "${OPTFLAGS} -I${INCLDIR}") + endforeach(INCLDIR) + endif() + target_link_libraries(prim_test ${IPP_LIBRARY_LIST}) +endif() + +set_property(SOURCE ${PRIMITIVE_TEST_CFILES} PROPERTY COMPILE_FLAGS ${OPTFLAGS}) + +target_link_libraries(prim_test rt) +if(NOT TESTING_OUTPUT_DIRECTORY) + set(TESTING_OUTPUT_DIRECTORY .) +endif() +add_test(prim_test ${TESTING_OUTPUT_DIRECTORY}/prim_test functionality) diff --git a/libfreerdp/primitives/test/measure.h b/libfreerdp/primitives/test/measure.h new file mode 100644 index 000000000..7f8d383db --- /dev/null +++ b/libfreerdp/primitives/test/measure.h @@ -0,0 +1,121 @@ +/* measure.h + * Macros to help with performance measurement. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + * MEASURE_LOOP_START("measurement", 2000) + * code to be measured + * MEASURE_LOOP_STOP + * buffer flush and such + * MEASURE_SHOW_RESULTS + * + * Define GOOGLE_PROFILER if you want gperftools included. + */ + +#ifdef _GNUC_ +# pragma once +#endif + +#ifndef __MEASURE_H_INCLUDED__ +#define __MEASURE_H_INCLUDED__ + +#include +#include +#include +#include +#include + +#ifdef GOOGLE_PROFILER +#include +#define PROFILER_START(_prefix_) \ + do { \ + char _path[PATH_MAX]; \ + sprintf(_path, "./%s.prof", (_prefix_)); \ + ProfilerStart(_path); \ + } while (0); +# define PROFILER_STOP \ + do { \ + ProfilerStop(); \ + } while (0); +#else +#define PROFILER_START(_prefix_) +#define PROFILER_STOP +#endif // GOOGLE_PROFILER + +extern float _delta_time(const struct timespec *t0, const struct timespec *t1); +extern void _floatprint(float t, char *output); + +#ifndef CLOCK_MONOTONIC_RAW +#define CLOCK_MONOTONIC_RAW 4 +#endif // !CLOCK_MONOTONIC_RAW + +#define MEASURE_LOOP_START(_prefix_, _count_) \ +{ struct timespec _start, _stop; \ + char *_prefix; \ + int _count = (_count_); \ + int _loop; \ + float _delta; \ + char _str1[32], _str2[32]; \ + _prefix = strdup(_prefix_); \ + _str1[0] = '\0'; _str2[0] = '\0'; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &_start); \ + PROFILER_START(_prefix); \ + _loop = (_count); \ + do { + +#define MEASURE_LOOP_STOP \ + } while (--_loop); + +#define MEASURE_GET_RESULTS(_result_) \ + PROFILER_STOP; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \ + _delta = _delta_time(&_start, &_stop); \ + (_result_) = (float) _count / _delta; \ + free(_prefix); \ +} + +#define MEASURE_SHOW_RESULTS(_result_) \ + PROFILER_STOP; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \ + _delta = _delta_time(&_start, &_stop); \ + (_result_) = (float) _count / _delta; \ + _floatprint((float) _count / _delta, _str1); \ + printf("%s: %9d iterations in %5.1f seconds = %s/s \n", \ + _prefix, _count, _delta, _str1); \ + free(_prefix); \ +} + +#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) \ + PROFILER_STOP; \ + clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \ + _delta = _delta_time(&_start, &_stop); \ + _floatprint((float) _count / _delta, _str1); \ + _floatprint((float) _count / _delta * (_scale_), _str2); \ + printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", \ + _prefix, _count, _delta, _str1, _str2, _label_); \ + free(_prefix); \ +} + +#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \ +{ float _r; \ + MEASURE_LOOP_START(_label_, _init_iter_); \ + _call_; \ + MEASURE_LOOP_STOP; \ + MEASURE_GET_RESULTS(_r); \ + MEASURE_LOOP_START(_label_, _r * _test_time_); \ + _call_; \ + MEASURE_LOOP_STOP; \ + MEASURE_SHOW_RESULTS(_result_); \ +} + +#endif // __MEASURE_H_INCLUDED__ diff --git a/libfreerdp/primitives/test/prim_test.c b/libfreerdp/primitives/test/prim_test.c new file mode 100644 index 000000000..82740e191 --- /dev/null +++ b/libfreerdp/primitives/test/prim_test.c @@ -0,0 +1,414 @@ +/* prim_test.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" +#include +#include +#include +#include +#include +#include + +int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 }; +int Quiet = 0; + +/* ------------------------------------------------------------------------- */ +static void get_random_data_lrand( + void *buffer, + size_t size) +{ + static int seeded = 0; + long int *ptr = (long int *) buffer; + unsigned char *cptr; + + if (!seeded) + { + seeded = 1; + srand48(time(NULL)); + } + /* This isn't the perfect random number generator, but that's okay. */ + while (size >= sizeof(long int)) + { + *ptr++ = lrand48(); + size -= sizeof(long int); + } + cptr = (unsigned char *) ptr; + while (size > 0) + { + *cptr++ = lrand48() & 0xff; + --size; + } +} + +/* ------------------------------------------------------------------------- */ +void get_random_data( + void *buffer, + size_t size) +{ +#ifdef linux + size_t offset = 0; + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) + { + get_random_data_lrand(buffer, size); + return; + } + + while (size > 0) + { + ssize_t count = read(fd, buffer+offset, size); + size -= count; + offset += count; + } + close(fd); +#else + get_random_data_lrand(buffer, size); +#endif +} + +/* ------------------------------------------------------------------------- */ +float _delta_time( + const struct timespec *t0, + const struct timespec *t1) +{ + INT64 secs = (INT64) (t1->tv_sec) - (INT64) (t0->tv_sec); + long nsecs = t1->tv_nsec - t0->tv_nsec; + double retval; + + if (nsecs < 0) + { + --secs; + nsecs += 1000000000; + } + retval = (double) secs + (double) nsecs / (double) 1000000000.0; + return (retval < 0.0) ? 0.0 : (float) retval; +} + +/* ------------------------------------------------------------------------- */ +void _floatprint( + float t, + char *output) +{ + /* I don't want to link against -lm, so avoid log,exp,... */ + float f = 10.0; + int i; + while (t > f) f *= 10.0; + f /= 1000.0; + i = ((int) (t/f+0.5)) * (int) f; + if (t < 0.0) sprintf(output, "%f", t); + else if (i == 0) sprintf(output, "%d", (int) (t+0.5)); + else if (t < 1e+3) sprintf(output, "%3d", i); + else if (t < 1e+6) sprintf(output, "%3d,%03d", + i/1000, i % 1000); + else if (t < 1e+9) sprintf(output, "%3d,%03d,000", + i/1000000, (i % 1000000) / 1000); + else if (t < 1e+12) sprintf(output, "%3d,%03d,000,000", + i/1000000000, (i % 1000000000) / 1000000); + else sprintf(output, "%f", t); +} + +/* ------------------------------------------------------------------------- */ +/* Specific areas to test: */ +#define TEST_COPY8 (1<<0) +#define TEST_SET8 (1<<1) +#define TEST_SET32S (1<<2) +#define TEST_SET32U (1<<3) +#define TEST_SIGN16S (1<<4) +#define TEST_ADD16S (1<<5) +#define TEST_LSHIFT16S (1<<6) +#define TEST_LSHIFT16U (1<<7) +#define TEST_RSHIFT16S (1<<8) +#define TEST_RSHIFT16U (1<<9) +#define TEST_RGB (1<<10) +#define TEST_ALPHA (1<<11) +#define TEST_AND (1<<12) +#define TEST_OR (1<<13) + +/* Specific types of testing: */ +#define TEST_FUNCTIONALITY (1<<0) +#define TEST_PERFORMANCE (1<<1) + +/* ------------------------------------------------------------------------- */ +int main( + int argc, + char **argv) +{ + typedef struct + { + const char *testStr; + UINT32 bits; + } test_t; + static const test_t testList[] = + { + { "all", 0xFFFFFFFFU }, + { "copy", TEST_COPY8 }, + { "copy8", TEST_COPY8 }, + { "set", TEST_SET8|TEST_SET32S|TEST_SET32U }, + { "set8", TEST_SET8 }, + { "set32", TEST_SET32S|TEST_SET32U }, + { "set32s", TEST_SET32S }, + { "set32u", TEST_SET32U }, + { "sign", TEST_SIGN16S }, + { "sign16s", TEST_SIGN16S }, + { "add", TEST_ADD16S }, + { "add16s", TEST_ADD16S }, + { "lshift", TEST_LSHIFT16S|TEST_LSHIFT16U }, + { "rshift", TEST_RSHIFT16S|TEST_RSHIFT16U }, + { "shift", TEST_LSHIFT16S|TEST_LSHIFT16U|TEST_RSHIFT16S|TEST_RSHIFT16U }, + { "lshift16s", TEST_LSHIFT16S }, + { "lshift16u", TEST_LSHIFT16U }, + { "rshift16s", TEST_RSHIFT16S }, + { "rshift16u", TEST_RSHIFT16U }, + { "rgb", TEST_RGB }, + { "color", TEST_RGB }, + { "colors", TEST_RGB }, + { "alpha", TEST_ALPHA }, + { "and", TEST_AND }, + { "or", TEST_OR }, + }; +#define NUMTESTS (sizeof(testList)/sizeof(test_t)) + static const test_t testTypeList[] = + { + { "functionality", TEST_FUNCTIONALITY }, + { "performance", TEST_PERFORMANCE }, + }; +#define NUMTESTTYPES (sizeof(testTypeList)/sizeof(test_t)) + char hints[256]; + UINT32 testSet = 0; + UINT32 testTypes = 0; + int i; + int results = SUCCESS; + + /* Parse command line for the test set. */ + for (i=1; i +#include +#include + +#include +#include +#include +#include +#ifdef WITH_IPP +#include +#include +#endif + +#define BLOCK_ALIGNMENT 16 +#ifdef __GNUC__ +#define ALIGN(x) x __attribute((aligned(BLOCK_ALIGNMENT))) +#define POSSIBLY_UNUSED(x) x __attribute((unused)) +#else +/* TODO: Someone needs to finish this for non-GNU C */ +#define ALIGN(x) x +#define POSSIBLY_UNUSED(x) x +#endif +#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_)) +#define MAX_TEST_SIZE 4096 + +extern int test_sizes[]; +#define NUM_TEST_SIZES 10 + +extern void get_random_data(void *buffer, size_t size); + +#ifndef SUCCESS +#define SUCCESS 0 +#endif +#ifndef FAILURE +#define FAILURE 1 +#endif + +extern int test_copy8u_func(void); +extern int test_copy8u_speed(void); + +extern int test_set8u_func(void); +extern int test_set32s_func(void); +extern int test_set8u_speed(void); +extern int test_set32s_speed(void); +extern int test_set32u_speed(void); + +extern int test_sign16s_func(void); +extern int test_sign16s_speed(void); + +extern int test_add16s_func(void); +extern int test_add16s_speed(void); + +extern int test_lShift_16s_func(void); +extern int test_lShift_16u_func(void); +extern int test_rShift_16s_func(void); +extern int test_rShift_16u_func(void); +extern int test_lShift_16s_speed(void); +extern int test_lShift_16u_speed(void); +extern int test_rShift_16s_speed(void); +extern int test_rShift_16u_speed(void); + +extern int test_RGBToRGB_16s8u_P3AC4R_func(void); +extern int test_RGBToRGB_16s8u_P3AC4R_speed(void); +extern int test_yCbCrToRGB_16s16s_P3P3_func(void); +extern int test_yCbCrToRGB_16s16s_P3P3_speed(void); + +extern int test_alphaComp_func(void); +extern int test_alphaComp_speed(void); + +extern int test_and_32u_func(void); +extern int test_and_32u_speed(void); +extern int test_or_32u_func(void); +extern int test_or_32u_speed(void); + +/* Since so much of this code is repeated, define a macro to build + * functions to do speed tests. + */ +#ifdef armel +#define SIMD_TYPE "Neon" +#else +#define SIMD_TYPE "SSE" +#endif + +#define DO_NORMAL_MEASUREMENTS(_funcNormal_, _prework_) \ + do { \ + for (s=0; s 0.0) _floatprint(resultNormal[s], sN); \ + if (resultSSENeon[s] > 0.0) \ + { \ + _floatprint(resultSSENeon[s], sSN); \ + if (resultNormal[s] > 0.0) \ + { \ + sprintf(sSNp, "%d%%", \ + (int) (resultSSENeon[s] / resultNormal[s] * 100.0 + 0.5)); \ + } \ + } \ + if (resultIPP[s] > 0.0) \ + { \ + _floatprint(resultIPP[s], sIPP); \ + if (resultNormal[s] > 0.0) \ + { \ + sprintf(sIPPp, "%d%%", \ + (int) (resultIPP[s] / resultNormal[s] * 100.0 + 0.5)); \ + } \ + } \ + printf("%8d: %15s %15s %5s %15s %5s\n", \ + size_array[s], sN, sSN, sSNp, sIPP, sIPPp); \ + } \ + free(resultNormal); free(resultSSENeon); free(resultIPP); \ +} + +#endif // !__PRIMTEST_H_INCLUDED__ diff --git a/libfreerdp/primitives/test/test_add.c b/libfreerdp/primitives/test/test_add.c new file mode 100644 index 000000000..f5c2b82c0 --- /dev/null +++ b/libfreerdp/primitives/test/test_add.c @@ -0,0 +1,105 @@ +/* test_add.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +#define FUNC_TEST_SIZE 65536 +static const int ADD16S_PRETEST_ITERATIONS = 300000*64; +static const int TEST_TIME = 2.0; // seconds + +extern pstatus_t general_add_16s( + const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len); +extern pstatus_t sse3_add_16s( + const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len); + +/* ========================================================================= */ +int test_add16s_func(void) +{ + INT16 ALIGN(src1[FUNC_TEST_SIZE+3]), ALIGN(src2[FUNC_TEST_SIZE+3]), + ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); + int failed = 0; + int i; + char testStr[256]; + UINT32 pflags = primitives_get_flags(primitives_get()); + + testStr[0] = '\0'; + get_random_data(src1, sizeof(src1)); + get_random_data(src2, sizeof(src2)); + memset(d1, 0, sizeof(d1)); + memset(d2, 0, sizeof(d2)); + general_add_16s(src1+1, src2+1, d1+1, FUNC_TEST_SIZE); +#ifdef i386 + if (pflags & PRIM_X86_SSE3_AVAILABLE) + { + strcat(testStr, " SSE3"); + /* Aligned */ + sse3_add_16s(src1+1, src2+1, d2+1, FUNC_TEST_SIZE); + for (i=1; i 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(add16s_speed_test, INT16, INT16, dst=dst, + TRUE, general_add_16s(src1, src2, dst, size), + TRUE, sse3_add_16s(src1, src2, dst, size), PRIM_X86_SSE3_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsAdd_16s(src1, src2, dst, size)); + +int test_add16s_speed(void) +{ + INT16 ALIGN(src1[MAX_TEST_SIZE+3]), ALIGN(src2[MAX_TEST_SIZE+3]), + ALIGN(dst[MAX_TEST_SIZE+3]); + get_random_data(src1, sizeof(src1)); + get_random_data(src2, sizeof(src2)); + add16s_speed_test("add16s", "aligned", src1, src2, 0, dst, + test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME); + add16s_speed_test("add16s", "unaligned", src1+1, src2+2, 0, dst, + test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} diff --git a/libfreerdp/primitives/test/test_alphaComp.c b/libfreerdp/primitives/test/test_alphaComp.c new file mode 100644 index 000000000..bb45fc644 --- /dev/null +++ b/libfreerdp/primitives/test/test_alphaComp.c @@ -0,0 +1,225 @@ +/* test_alphaComp.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +static const int ALPHA_PRETEST_ITERATIONS = 5000000; +static const float TEST_TIME = 5.0; + +static const int block_size[] = { 4, 64, 256 }; +#define NUM_BLOCK_SIZES (sizeof(block_size)/sizeof(int)) +#define MAX_BLOCK_SIZE 256 +#define SIZE_SQUARED (MAX_BLOCK_SIZE*MAX_BLOCK_SIZE) + +extern pstatus_t general_alphaComp_argb( + const BYTE *pSrc1, int src1Step, + const BYTE *pSrc2, int src2Step, + BYTE *pDst, int dstStep, + int width, int height); +extern pstatus_t sse2_alphaComp_argb( + const BYTE *pSrc1, int src1Step, + const BYTE *pSrc2, int src2Step, + BYTE *pDst, int dstStep, + int width, int height); +extern pstatus_t ipp_alphaComp_argb( + const BYTE *pSrc1, int src1Step, + const BYTE *pSrc2, int src2Step, + BYTE *pDst, int dstStep, + int width, int height); + +/* ========================================================================= */ +#define ALF(_c_) (((_c_) & 0xFF000000U) >> 24) +#define RED(_c_) (((_c_) & 0x00FF0000U) >> 16) +#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8) +#define BLU(_c_) ((_c_) & 0x000000FFU) +#define TOLERANCE 1 +#define PIXEL(_addr_, _bytes_, _x_, _y_) \ + ((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_))) +#define SRC1_WIDTH 6 +#define SRC1_HEIGHT 6 +#define SRC2_WIDTH 7 +#define SRC2_HEIGHT 7 +#define DST_WIDTH 9 +#define DST_HEIGHT 9 +#define TEST_WIDTH 4 +#define TEST_HEIGHT 5 + +/* ------------------------------------------------------------------------- */ +static UINT32 alpha_add( + UINT32 c1, + UINT32 c2) +{ + UINT32 a1 = ALF(c1); + UINT32 r1 = RED(c1); + UINT32 g1 = GRN(c1); + UINT32 b1 = BLU(c1); + + UINT32 a2 = ALF(c2); + UINT32 r2 = RED(c2); + UINT32 g2 = GRN(c2); + UINT32 b2 = BLU(c2); + + UINT32 a3 = ((a1 * a1 + (255-a1) * a2) / 255) & 0xff; + UINT32 r3 = ((a1 * r1 + (255-a1) * r2) / 255) & 0xff; + UINT32 g3 = ((a1 * g1 + (255-a1) * g2) / 255) & 0xff; + UINT32 b3 = ((a1 * b1 + (255-a1) * b2) / 255) & 0xff; + + return (a3 << 24) | (r3 << 16) | (g3 << 8) | b3; +} + +/* ------------------------------------------------------------------------- */ +static UINT32 colordist( + UINT32 c1, + UINT32 c2) +{ + int d, maxd = 0; + + d = ABS(ALF(c1) - ALF(c2)); + if (d > maxd) maxd = d; + d = ABS(RED(c1) - RED(c2)); + if (d > maxd) maxd = d; + d = ABS(GRN(c1) - GRN(c2)); + if (d > maxd) maxd = d; + d = ABS(BLU(c1) - BLU(c2)); + if (d > maxd) maxd = d; + return maxd; +} + +/* ------------------------------------------------------------------------- */ +int test_alphaComp_func(void) +{ + UINT32 ALIGN(src1[SRC1_WIDTH*SRC1_HEIGHT]); + UINT32 ALIGN(src2[SRC2_WIDTH*SRC2_HEIGHT]); + UINT32 ALIGN(dst1[DST_WIDTH*DST_HEIGHT]); + UINT32 ALIGN(dst2a[DST_WIDTH*DST_HEIGHT]); + UINT32 ALIGN(dst2u[DST_WIDTH*DST_HEIGHT+1]); + UINT32 ALIGN(dst3[DST_WIDTH*DST_HEIGHT]); + int error = 0; + UINT32 pflags = primitives_get_flags(primitives_get()); + char testStr[256]; + UINT32 *ptr; + int i, x, y; + + testStr[0] = '\0'; + get_random_data(src1, sizeof(src1)); + /* Special-case the first two values */ + src1[0] &= 0x00FFFFFFU; + src1[1] |= 0xFF000000U; + get_random_data(src2, sizeof(src2)); + /* Set the second operand to fully-opaque. */ + ptr = src2; + for (i=0; i TOLERANCE) + { + printf("alphaComp-general: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", + x, y, s1, s2, c0, c1); + error = 1; + } +#ifdef i386 + if (pflags & PRIM_X86_SSE2_AVAILABLE) + { + UINT32 c2 = *PIXEL(dst2a, 4*DST_WIDTH, x, y); + if (colordist(c0, c2) > TOLERANCE) + { + printf("alphaComp-SSE-aligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", + x, y, s1, s2, c0, c2); + error = 1; + } + c2 = *PIXEL(dst2u+1, 4*DST_WIDTH, x, y); + if (colordist(c0, c2) > TOLERANCE) + { + printf("alphaComp-SSE-unaligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", + x, y, s1, s2, c0, c2); + error = 1; + } + } +#endif /* i386 */ +#ifdef WITH_IPP + UINT32 c3 = *PIXEL(dst3, 4*DST_WIDTH, x, y); + if (colordist(c0, c3) > TOLERANCE) + { + printf("alphaComp-IPP: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", + x, y, s1, s2, c0, c3); + error = 1; + } +#endif + } + } + if (!error) printf("All alphaComp tests passed (%s).\n", testStr); + return (error > 0) ? FAILURE : SUCCESS; +} + + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(alphaComp_speed, BYTE, BYTE, int bytes = size*4, + TRUE, general_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes, + size, size), + TRUE, sse2_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes, + size, size), PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ipp_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes, + size, size)); + +int test_alphaComp_speed(void) +{ + INT32 ALIGN(src1[MAX_BLOCK_SIZE*(MAX_BLOCK_SIZE+1)]), + ALIGN(src2[SIZE_SQUARED]), + ALIGN(dst[SIZE_SQUARED]); + + get_random_data(src1, sizeof(src1)); + get_random_data(src2, sizeof(src2)); + + alphaComp_speed("alphaComp", "aligned", + (BYTE *) src1, (BYTE *) src2, 0, (BYTE *) dst, + block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME); + alphaComp_speed("alphaComp", "unaligned", + (BYTE *) src1+1, (BYTE *) src2, 0, (BYTE *) dst, + block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} diff --git a/libfreerdp/primitives/test/test_andor.c b/libfreerdp/primitives/test/test_andor.c new file mode 100644 index 000000000..39ee83c11 --- /dev/null +++ b/libfreerdp/primitives/test/test_andor.c @@ -0,0 +1,178 @@ +/* test_andor.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +#define FUNC_TEST_SIZE 65536 +static const int ANDOR_PRETEST_ITERATIONS = 100000; +static const int TEST_TIME = 2.0; // seconds + +extern pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val, + UINT32 *pDst, int len); +extern pstatus_t sse3_andC_32u(const UINT32 *pSrc, UINT32 val, + UINT32 *pDst, int len); +extern pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val, + UINT32 *pDst, int len); +extern pstatus_t sse3_orC_32u(const UINT32 *pSrc, UINT32 val, + UINT32 *pDst, int len); + +#define VALUE (0xA5A5A5A5U) + +/* ========================================================================= */ +int test_and_32u_func(void) +{ + UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]); + int failed = 0; + int i; + UINT32 pflags = primitives_get_flags(primitives_get()); + char testStr[256]; + + testStr[0] = '\0'; + get_random_data(src, sizeof(src)); + general_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE); + strcat(testStr, " general"); + for (i=1; i<=FUNC_TEST_SIZE; ++i) + { + if (dst[i] != (src[i] & VALUE)) + { + printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", + i, src[i], VALUE, src[i] & VALUE, dst[i]); + ++failed; + } + } +#ifdef i386 + if (pflags & PRIM_X86_SSE3_AVAILABLE) + { + strcat(testStr, " SSE3"); + /* Aligned */ + memset(dst, 0, sizeof(dst)); + sse3_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE); + for (i=1; i<=FUNC_TEST_SIZE; ++i) + { + if (dst[i] != (src[i] & VALUE)) + { + printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", + i, src[i], VALUE, src[i] & VALUE, dst[i]); + ++failed; + } + } + /* Unaligned */ + memset(dst, 0, sizeof(dst)); + sse3_andC_32u(src+1, VALUE, dst+2, FUNC_TEST_SIZE); + for (i=1; i<=FUNC_TEST_SIZE; ++i) + { + if (dst[i+1] != (src[i] & VALUE)) + { + printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", + i, src[i], VALUE, src[i] & VALUE, dst[i+1]); + ++failed; + } + } + } +#endif /* i386 */ + if (!failed) printf("All and_32u tests passed (%s).\n", testStr); + return (failed > 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(andC_32u_speed_test, UINT32, UINT32, dst=dst, + TRUE, general_andC_32u(src1, constant, dst, size), + TRUE, sse3_andC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsAndC_32u(src1, constant, dst, size)) + +int test_and_32u_speed(void) +{ + UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]); + get_random_data(src, sizeof(src)); + andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst, + test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME); + andC_32u_speed_test("and32u", "unaligned", src+1, NULL, VALUE, dst, + test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} + +/* ========================================================================= */ +int test_or_32u_func(void) +{ + UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]); + int failed = 0; + int i; + UINT32 pflags = primitives_get_flags(primitives_get()); + char testStr[256]; + + testStr[0] = '\0'; + get_random_data(src, sizeof(src)); + general_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE); + strcat(testStr, " general"); + for (i=1; i<=FUNC_TEST_SIZE; ++i) + { + if (dst[i] != (src[i] | VALUE)) + { + printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", + i, src[i], VALUE, src[i] | VALUE, dst[i]); + ++failed; + } + } +#ifdef i386 + if (pflags & PRIM_X86_SSE3_AVAILABLE) + { + strcat(testStr, " SSE3"); + /* Aligned */ + memset(dst, 0, sizeof(dst)); + sse3_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE); + for (i=1; i 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(orC_32u_speed_test, UINT32, UINT32, dst=dst, + TRUE, general_orC_32u(src1, constant, dst, size), + TRUE, sse3_orC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsOrC_32u(src1, constant, dst, size)) + +int test_or_32u_speed(void) +{ + UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]); + get_random_data(src, sizeof(src)); + orC_32u_speed_test("or32u", "aligned", src, NULL, VALUE, dst, + test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME); + orC_32u_speed_test("or32u", "unaligned", src+1, NULL, VALUE, dst, + test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} diff --git a/libfreerdp/primitives/test/test_colors.c b/libfreerdp/primitives/test/test_colors.c new file mode 100644 index 000000000..ebde1c7e8 --- /dev/null +++ b/libfreerdp/primitives/test/test_colors.c @@ -0,0 +1,226 @@ +/* test_colors.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +static const int RGB_TRIAL_ITERATIONS = 1000; +static const int YCBCR_TRIAL_ITERATIONS = 1000; +static const float TEST_TIME = 4.0; + +extern pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], + int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi); +extern pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], + int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi); +extern pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3], + int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi); +extern pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3], + int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi); + +/* ------------------------------------------------------------------------- */ +int test_RGBToRGB_16s8u_P3AC4R_func(void) +{ + INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); + UINT32 ALIGN(out1[4096]), ALIGN(out2[4096]); + int i; + int failed = 0; + UINT32 pflags = primitives_get_flags(primitives_get()); + char testStr[256]; + INT16 *ptrs[3]; + prim_size_t roi = { 64, 64 }; + + testStr[0] = '\0'; + get_random_data(r, sizeof(r)); + get_random_data(g, sizeof(g)); + get_random_data(b, sizeof(b)); + /* clear upper bytes */ + for (i=0; i<4096; ++i) + { + r[i] &= 0x00FFU; + g[i] &= 0x00FFU; + b[i] &= 0x00FFU; + } + + ptrs[0] = r; + ptrs[1] = g; + ptrs[2] = b; + + general_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2, + (BYTE *) out1, 64*4, &roi); +#ifdef i386 + if (pflags & PRIM_X86_SSE2_AVAILABLE) + { + strcat(testStr, " SSE2"); + sse2_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2, + (BYTE *) out2, 64*4, &roi); + for (i=0; i<4096; ++i) + { + if (out1[i] != out2[i]) + { + printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n", + i, out1[i], i, out2[i]); + failed = 1; + } + } + } +#endif /* i386 */ + if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr); + return (failed > 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static const prim_size_t roi64x64 = { 64, 64 }; +STD_SPEED_TEST( + rgb_to_argb_speed, INT16*, UINT32, dst=dst, + TRUE, general_RGBToRGB_16s8u_P3AC4R( + (const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64), + TRUE, sse2_RGBToRGB_16s8u_P3AC4R( + (const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64), + PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + FALSE, dst=dst); + +int test_RGBToRGB_16s8u_P3AC4R_speed(void) +{ + INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); + UINT32 ALIGN(dst[4096]); + int i; + INT16 *ptrs[3]; + int size_array[] = { 64 }; + + get_random_data(r, sizeof(r)); + get_random_data(g, sizeof(g)); + get_random_data(b, sizeof(b)); + /* clear upper bytes */ + for (i=0; i<4096; ++i) + { + r[i] &= 0x00FFU; + g[i] &= 0x00FFU; + b[i] &= 0x00FFU; + } + + ptrs[0] = r; + ptrs[1] = g; + ptrs[2] = b; + + rgb_to_argb_speed("RGBToARGB", "aligned", + (const INT16 **) ptrs, NULL, 0, dst, + size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME); + return SUCCESS; +} + +/* ========================================================================= */ +int test_yCbCrToRGB_16s16s_P3P3_func(void) +{ + INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]); + INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]); + INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]); + int i; + int failed = 0; + UINT32 pflags = primitives_get_flags(primitives_get()); + char testStr[256]; + const INT16 *in[3]; + INT16 *out1[3]; + INT16 *out2[3]; + prim_size_t roi = { 64, 64 }; + + testStr[0] = '\0'; + get_random_data(y, sizeof(y)); + get_random_data(cb, sizeof(cb)); + get_random_data(cr, sizeof(cr)); + /* Normalize to 11.5 fixed radix */ + for (i=0; i<4096; ++i) + { + y[i] &= 0x1FE0U; + cb[i] &= 0x1FE0U; + cr[i] &= 0x1FE0U; + } + memset(r1, 0, sizeof(r1)); + memset(g1, 0, sizeof(g1)); + memset(b1, 0, sizeof(b1)); + memset(r2, 0, sizeof(r2)); + memset(g2, 0, sizeof(g2)); + memset(b2, 0, sizeof(b2)); + + in[0] = y; + in[1] = cb; + in[2] = cr; + out1[0] = r1; + out1[1] = g1; + out1[2] = b1; + out2[0] = r2; + out2[1] = g2; + out2[2] = b2; + + general_yCbCrToRGB_16s16s_P3P3(in, 64*2, out1, 64*2, &roi); +#ifdef i386 + if (pflags & PRIM_X86_SSE2_AVAILABLE) + { + strcat(testStr, " SSE2"); + sse2_yCbCrToRGB_16s16s_P3P3(in, 64*2, out2, 64*2, &roi); + for (i=0; i<4096; ++i) + { + if ((ABS(r1[i]-r2[i]) > 1) + || (ABS(g1[i]-g2[i]) > 1) + || (ABS(b1[i]-b2[i]) > 1)) { + printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i, + r1[i],g1[i],b1[i], r2[i],g2[i],b2[i]); + failed = 1; + } + } + } +#endif /* i386 */ + if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr); + return (failed > 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST( + ycbcr_to_rgb_speed, INT16*, INT16*, dst=dst, + TRUE, general_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64), + TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64), + PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + FALSE, dst=dst); + +int test_yCbCrToRGB_16s16s_P3P3_speed(void) +{ + INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]); + INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); + int i; + const INT16 *input[3]; + INT16 *output[3]; + int size_array[] = { 64 }; + + get_random_data(y, sizeof(y)); + get_random_data(cb, sizeof(cb)); + get_random_data(cr, sizeof(cr)); + /* Normalize to 11.5 fixed radix */ + for (i=0; i<4096; ++i) + { + y[i] &= 0x1FE0U; + cb[i] &= 0x1FE0U; + cr[i] &= 0x1FE0U; + } + + input[0] = y; + input[1] = cb; + input[2] = cr; + output[0] = r; + output[1] = g; + output[2] = b; + + ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output, + size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME); + return SUCCESS; +} diff --git a/libfreerdp/primitives/test/test_copy.c b/libfreerdp/primitives/test/test_copy.c new file mode 100644 index 000000000..fd427b505 --- /dev/null +++ b/libfreerdp/primitives/test/test_copy.c @@ -0,0 +1,83 @@ +/* test_copy.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +static const int MEMCPY_PRETEST_ITERATIONS = 1000000; +static const int TEST_TIME = 1.0; // seconds +#define COPY_TESTSIZE (256*2+16*2+15+15) +#if 0 +extern pstatus_t sse3_copy_8u(const BYTE *pSrc, BYTE *pDst, int len); +#endif + +/* ------------------------------------------------------------------------- */ +int test_copy8u_func(void) +{ + primitives_t *prims = primitives_get(); + BYTE ALIGN(data[COPY_TESTSIZE+15]); + int i, soff; + int failed = 0; + char testStr[256]; + BYTE ALIGN(dest[COPY_TESTSIZE+15]); + + testStr[0] = '\0'; + get_random_data(data, sizeof(data)); + + strcat(testStr, " ptr"); + for (soff=0; soff<16; ++soff) + { + int doff; + for (doff=0; doff<16; ++doff) + { + int length; + for (length=1; length<=COPY_TESTSIZE-doff; ++length) + { + memset(dest, 0, sizeof(dest)); + prims->copy_8u(data+soff, dest+doff, length); + for (i=0; i 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst=dst, + TRUE, memcpy(dst, src1, size), + FALSE, NULL, 0, + FALSE, NULL, 0, + TRUE, ippsCopy_8u(src1, dst, size)); + +int test_copy8u_speed(void) +{ + BYTE ALIGN(src[MAX_TEST_SIZE+4]); + BYTE ALIGN(intervening[MAX_TEST_SIZE*7]); + BYTE ALIGN(dst[MAX_TEST_SIZE+4]); + copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst, + test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME); + copy8u_speed_test("copy8u", "unaligned", src+1, NULL, 0, dst, + test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} diff --git a/libfreerdp/primitives/test/test_set.c b/libfreerdp/primitives/test/test_set.c new file mode 100644 index 000000000..cc04cf42b --- /dev/null +++ b/libfreerdp/primitives/test/test_set.c @@ -0,0 +1,294 @@ +/* test_set.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +static const int MEMSET8_PRETEST_ITERATIONS = 100000000; +static const int MEMSET32_PRETEST_ITERATIONS = 40000000; +static const float TEST_TIME = 1.0; + +extern pstatus_t general_set_8u(BYTE val, BYTE *pDst, int len); +extern pstatus_t sse2_set_8u(BYTE val, BYTE *pDst, int len); +extern pstatus_t general_set_32s(INT32 val, INT32 *pDst, int len); +extern pstatus_t sse2_set_32s(INT32 val, INT32 *pDst, int len); +extern pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, int len); +extern pstatus_t sse2_set_32u(UINT32 val, UINT32 *pDst, int len); +extern pstatus_t ipp_wrapper_set_32u(UINT32 val, UINT32 *pDst, int len); + +static const int set_sizes[] = { 1, 4, 16, 32, 64, 256, 1024, 4096 }; +#define NUM_SET_SIZES (sizeof(set_sizes)/sizeof(int)) + +/* ------------------------------------------------------------------------- */ +int test_set8u_func(void) +{ + BYTE ALIGN(dest[48]); + int failed = 0; + int off; + char testStr[256]; + UINT32 pflags = primitives_get_flags(primitives_get()); + testStr[0] = '\0'; + +#ifdef i386 + /* Test SSE under various alignments */ + if (pflags & PRIM_X86_SSE2_AVAILABLE) + { + strcat(testStr, " SSE2"); + for (off=0; off<16; ++off) + { + int len; + for (len=1; len<48-off; ++len) + { + int i; + memset(dest, 0, sizeof(dest)); + sse2_set_8u(0xa5, dest+off, len); + for (i=0; i 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(set8u_speed_test, BYTE, BYTE, dst=dst, + TRUE, memset(dst, constant, size), + FALSE, NULL, 0, + FALSE, NULL, 0, + TRUE, ippsSet_8u(constant, dst, size)); + +int test_set8u_speed(void) +{ + BYTE ALIGN(dst[MAX_TEST_SIZE]); + set8u_speed_test("set8u", "aligned", NULL, NULL, 0xA5, dst, + set_sizes, NUM_SET_SIZES, MEMSET8_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +int test_set32s_func(void) +{ + primitives_t *prims = primitives_get(); + INT32 ALIGN(dest[512]); + int failed = 0; + int off; + char testStr[256]; + UINT32 pflags = primitives_get_flags(primitives_get()); + testStr[0] = '\0'; + +#ifdef i386 + /* Test SSE under various alignments */ + if (pflags & PRIM_X86_SSE2_AVAILABLE) + { + strcat(testStr, " SSE2"); + for (off=0; off<16; ++off) { + int len; + for (len=1; len<512-off; ++len) + { + int i; + memset(dest, 0, sizeof(dest)); + sse2_set_32s(0xdeadbeef, dest+off, len); + for (i=0; i 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +int test_set32u_func(void) +{ + primitives_t *prims = primitives_get(); + UINT32 ALIGN(dest[512]); + int failed = 0; + int off; + char testStr[256]; + UINT32 pflags = primitives_get_flags(primitives_get()); + testStr[0] = '\0'; + +#ifdef i386 + /* Test SSE under various alignments */ + if (pflags & PRIM_X86_SSE2_AVAILABLE) + { + strcat(testStr, " SSE2"); + for (off=0; off<16; ++off) { + int len; + for (len=1; len<512-off; ++len) + { + int i; + memset(dest, 0, sizeof(dest)); + sse2_set_32u(0xdeadbeefU, dest+off, len); + for (i=0; i 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static inline void memset32u_naive( + UINT32 val, + UINT32 *dst, + size_t count) +{ + while (count--) *dst++ = val; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(set32u_speed_test, UINT32, UINT32, dst=dst, + TRUE, memset32u_naive(constant, dst, size), + TRUE, sse2_set_32u(constant, dst, size), PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ipp_wrapper_set_32u(constant, dst, size)); + +int test_set32u_speed(void) +{ + UINT32 ALIGN(dst[MAX_TEST_SIZE+1]); + set32u_speed_test("set32u", "aligned", NULL, NULL, 0xdeadbeef, dst, + set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME); +#if 0 + /* Not really necessary; should be almost as fast. */ + set32u_speed_test("set32u", "unaligned", NULL, NULL, dst+1, + set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME); +#endif + return SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +static inline void memset32s_naive( + INT32 val, + INT32 *dst, + size_t count) +{ + while (count--) *dst++ = val; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(set32s_speed_test, INT32, INT32, dst=dst, + TRUE, memset32s_naive(constant, dst, size), + TRUE, sse2_set_32s(constant, dst, size), PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsSet_32s(constant, dst, size)); + +int test_set32s_speed(void) +{ + INT32 ALIGN(dst[MAX_TEST_SIZE+1]); + set32s_speed_test("set32s", "aligned", NULL, NULL, 0xdeadbeef, dst, + set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME); +#if 0 + /* Not really necessary; should be almost as fast. */ + set32s_speed_test("set32s", "unaligned", NULL, NULL, dst+1, + set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME); +#endif + return SUCCESS; +} diff --git a/libfreerdp/primitives/test/test_shift.c b/libfreerdp/primitives/test/test_shift.c new file mode 100644 index 000000000..1e00f577d --- /dev/null +++ b/libfreerdp/primitives/test/test_shift.c @@ -0,0 +1,173 @@ +/* test_shift.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +#define FUNC_TEST_SIZE 65536 +static const int SHIFT_PRETEST_ITERATIONS = 50000; +static const float TEST_TIME = 1.0; + +extern pstatus_t general_lShiftC_16s( + const INT16 *pSrc, int val, INT16 *pDst, int len); +extern pstatus_t general_rShiftC_16s( + const INT16 *pSrc, int val, INT16 *pDst, int len); +extern pstatus_t general_shiftC_16s( + const INT16 *pSrc, int val, INT16 *pDst, int len); +extern pstatus_t general_lShiftC_16u( + const UINT16 *pSrc, int val, UINT16 *pDst, int len); +extern pstatus_t general_rShiftC_16u( + const UINT16 *pSrc, int val, UINT16 *pDst, int len); +extern pstatus_t general_shiftC_16u( + const UINT16 *pSrc, int val, UINT16 *pDst, int len); +extern pstatus_t sse2_lShiftC_16s( + const INT16 *pSrc, int val, INT16 *pDst, int len); +extern pstatus_t sse2_rShiftC_16s( + const INT16 *pSrc, int val, INT16 *pDst, int len); +extern pstatus_t sse2_shiftC_16s( + const INT16 *pSrc, int val, INT16 *pDst, int len); +extern pstatus_t sse2_lShiftC_16u( + const UINT16 *pSrc, int val, UINT16 *pDst, int len); +extern pstatus_t sse2_rShiftC_16u( + const UINT16 *pSrc, int val, UINT16 *pDst, int len); +extern pstatus_t sse2_shiftC_16u( + const UINT16 *pSrc, int val, UINT16 *pDst, int len); + +#ifdef i386 +#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \ +int _name_(void) \ +{ \ + _type_ ALIGN(src[FUNC_TEST_SIZE+3]), \ + ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \ + int failed = 0; \ + int i; \ + UINT32 pflags = primitives_get_flags(primitives_get()); \ + char testStr[256]; \ + testStr[0] = '\0'; \ + get_random_data(src, sizeof(src)); \ + _f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \ + if (pflags & PRIM_X86_SSE3_AVAILABLE) \ + { \ + strcat(testStr, " SSE3"); \ + /* Aligned */ \ + _f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \ + for (i=1; i<=FUNC_TEST_SIZE; ++i) \ + { \ + if (d1[i] != d2[i]) \ + { \ + printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \ + _str_, i, src[i], d1[i], d2[i]); \ + ++failed; \ + } \ + } \ + /* Unaligned */ \ + _f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \ + for (i=1; i<=FUNC_TEST_SIZE; ++i) \ + { \ + if (d1[i] != d2[i+1]) \ + { \ + printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \ + _str_, i, src[i], d1[i], d2[i+1]); \ + ++failed; \ + } \ + } \ + } \ + if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \ + return (failed > 0) ? FAILURE : SUCCESS; \ +} +#else +#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \ +int _name_(void) \ +{ \ + return SUCCESS; \ +} +#endif /* i386 */ + +SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s, + sse2_lShiftC_16s) +SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u, + sse2_lShiftC_16u) +SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s, + sse2_rShiftC_16s) +SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u, + sse2_rShiftC_16u) + +/* ========================================================================= */ +STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst=dst, + TRUE, general_lShiftC_16s(src1, constant, dst, size), + TRUE, sse2_lShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsLShiftC_16s(src1, constant, dst, size)); +STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst=dst, + TRUE, general_lShiftC_16u(src1, constant, dst, size), + TRUE, sse2_lShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsLShiftC_16u(src1, constant, dst, size)); +STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst=dst, + TRUE, general_rShiftC_16s(src1, constant, dst, size), + TRUE, sse2_rShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsRShiftC_16s(src1, constant, dst, size)); +STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst=dst, + TRUE, general_rShiftC_16u(src1, constant, dst, size), + TRUE, sse2_rShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE, + FALSE, dst=dst, 0, + TRUE, ippsRShiftC_16u(src1, constant, dst, size)); + +/* ------------------------------------------------------------------------- */ +int test_lShift_16s_speed(void) +{ + INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]); + get_random_data(src, sizeof(src)); + speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + speed_lShift_16s("lShift_16s", "unaligned", src+1, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +int test_lShift_16u_speed(void) +{ + UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]); + get_random_data(src, sizeof(src)); + speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + speed_lShift_16u("lShift_16u", "unaligned", src+1, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +int test_rShift_16s_speed(void) +{ + INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]); + get_random_data(src, sizeof(src)); + speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + speed_rShift_16s("rShift_16s", "unaligned", src+1, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +int test_rShift_16u_speed(void) +{ + UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]); + get_random_data(src, sizeof(src)); + speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + speed_rShift_16u("rShift_16u", "unaligned", src+1, NULL, 3, dst, + test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} diff --git a/libfreerdp/primitives/test/test_sign.c b/libfreerdp/primitives/test/test_sign.c new file mode 100644 index 000000000..82f71933f --- /dev/null +++ b/libfreerdp/primitives/test/test_sign.c @@ -0,0 +1,91 @@ +/* test_sign.c + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include "prim_test.h" + +static const int SIGN_PRETEST_ITERATIONS = 100000; +static const float TEST_TIME = 1.0; + +extern pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, int len); +extern pstatus_t ssse3_sign_16s(const INT16 *pSrc, INT16 *pDst, int len); + +/* ------------------------------------------------------------------------- */ +int test_sign16s_func(void) +{ + INT16 ALIGN(src[65535]), ALIGN(d1[65535]), ALIGN(d2[65535]); + int failed = 0; + int i; + UINT32 pflags = primitives_get_flags(primitives_get()); + char testStr[256]; + + /* Test when we can reach 16-byte alignment */ + testStr[0] = '\0'; + get_random_data(src, sizeof(src)); + general_sign_16s(src+1, d1+1, 65535); +#ifdef i386 + if (pflags & PRIM_X86_SSSE3_AVAILABLE) + { + strcat(testStr, " SSSE3"); + ssse3_sign_16s(src+1, d2+1, 65535); + for (i=1; i<65535; ++i) + { + if (d1[i] != d2[i]) + { + printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n", + i, src[i], d1[i], d2[i]); + ++failed; + } + } + } +#endif /* i386 */ + + /* Test when we cannot reach 16-byte alignment */ + get_random_data(src, sizeof(src)); + general_sign_16s(src+1, d1+2, 65535); +#ifdef i386 + if (pflags & PRIM_X86_SSSE3_AVAILABLE) + { + ssse3_sign_16s(src+1, d2+2, 65535); + for (i=2; i<65535; ++i) + { + if (d1[i] != d2[i]) + { + printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n", + i, src[i-1], d1[i], d2[i]); + ++failed; + } + } + } +#endif /* i386 */ + if (!failed) printf("All sign16s tests passed (%s).\n", testStr); + return (failed > 0) ? FAILURE : SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst=dst, + TRUE, general_sign_16s(src1, dst, size), + TRUE, ssse3_sign_16s(src1, dst, size), PRIM_X86_SSSE3_AVAILABLE, + FALSE, dst=dst, 0, + FALSE, dst=dst); + +int test_sign16s_speed(void) +{ + INT16 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]); + get_random_data(src, sizeof(src)); + sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst, + test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME); + sign16s_speed_test("sign16s", "unaligned", src+1, NULL, 0, dst, + test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME); + return SUCCESS; +} diff --git a/server/Mac/CMakeLists.txt b/server/Mac/CMakeLists.txt index 39584139a..dbc4c1df9 100644 --- a/server/Mac/CMakeLists.txt +++ b/server/Mac/CMakeLists.txt @@ -64,7 +64,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-utils freerdp-codec) + MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} diff --git a/server/Sample/CMakeLists.txt b/server/Sample/CMakeLists.txt index 976893198..972cc29f6 100644 --- a/server/Sample/CMakeLists.txt +++ b/server/Sample/CMakeLists.txt @@ -33,7 +33,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-utils freerdp-codec) + MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} diff --git a/server/Windows/CMakeLists.txt b/server/Windows/CMakeLists.txt index bdabe1329..08ebb6563 100644 --- a/server/Windows/CMakeLists.txt +++ b/server/Windows/CMakeLists.txt @@ -59,7 +59,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-utils freerdp-codec) + MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives) target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS}) diff --git a/server/X11/CMakeLists.txt b/server/X11/CMakeLists.txt index dd24a2976..9e8709ddc 100644 --- a/server/X11/CMakeLists.txt +++ b/server/X11/CMakeLists.txt @@ -90,7 +90,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} ${X11_LIBRARIES}) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD} MODULE freerdp - MODULES freerdp-core freerdp-common freerdp-codec freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale) + MODULES freerdp-core freerdp-common freerdp-codec freerdp-primitives freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale) set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}