mirror of https://github.com/FreeRDP/FreeRDP
Merge branch 'master' of https://github.com/dpoe/FreeRDP
This commit is contained in:
commit
c6e6956a43
|
@ -231,6 +231,10 @@ set(GSTREAMER_FEATURE_TYPE "RECOMMENDED")
|
|||
set(GSTREAMER_FEATURE_PURPOSE "multimedia")
|
||||
set(GSTREAMER_FEATURE_DESCRIPTION "multimedia redirection, audio and video playback")
|
||||
|
||||
set(IPP_FEATURE_TYPE "OPTIONAL")
|
||||
set(IPP_FEATURE_PURPOSE "performance")
|
||||
set(IPP_FEATURE_DESCRIPTION "Intel Integrated Performance Primitives library")
|
||||
|
||||
if(WIN32)
|
||||
set(X11_FEATURE_TYPE "DISABLED")
|
||||
set(ZLIB_FEATURE_TYPE "DISABLED")
|
||||
|
@ -285,6 +289,9 @@ find_feature(PCSC ${PCSC_FEATURE_TYPE} ${PCSC_FEATURE_PURPOSE} ${PCSC_FEATURE_DE
|
|||
find_feature(FFmpeg ${FFMPEG_FEATURE_TYPE} ${FFMPEG_FEATURE_PURPOSE} ${FFMPEG_FEATURE_DESCRIPTION})
|
||||
find_feature(Gstreamer ${GSTREAMER_FEATURE_TYPE} ${GSTREAMER_FEATURE_PURPOSE} ${GSTREAMER_FEATURE_DESCRIPTION})
|
||||
|
||||
# Intel Performance Primitives
|
||||
find_feature(IPP ${IPP_FEATURE_TYPE} ${IPP_FEATURE_PURPOSE} ${IPP_FEATURE_DESCRIPTION})
|
||||
|
||||
# Installation Paths
|
||||
if(WIN32)
|
||||
set(CMAKE_INSTALL_BINDIR ".")
|
||||
|
|
|
@ -36,7 +36,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)
|
|||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-utils)
|
||||
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-primitives freerdp-utils)
|
||||
|
||||
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
|
||||
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
|
|
@ -78,7 +78,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)
|
|||
|
||||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-rail freerdp-utils)
|
||||
MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-primitives freerdp-rail freerdp-utils)
|
||||
|
||||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE winpr
|
||||
|
|
|
@ -46,7 +46,7 @@ set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
|||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-utils)
|
||||
MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-primitives freerdp-utils)
|
||||
|
||||
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
|
||||
|
||||
|
|
|
@ -120,10 +120,14 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)
|
|||
|
||||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-rail freerdp-utils)
|
||||
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-primitives freerdp-rail freerdp-utils)
|
||||
|
||||
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
|
||||
|
||||
if(WITH_IPP)
|
||||
target_link_libraries(xfreerdp ${IPP_LIBRARY_LIST})
|
||||
endif()
|
||||
|
||||
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "Client/X11")
|
||||
|
|
|
@ -9,6 +9,7 @@ endif()
|
|||
|
||||
option(WITH_MANPAGES "Generate manpages." ON)
|
||||
option(WITH_PROFILER "Compile profiler." OFF)
|
||||
option(WITH_IPP "Use Intel Performance Primitives." OFF)
|
||||
|
||||
if((TARGET_ARCH MATCHES "x86|x64") AND (NOT DEFINED WITH_SSE2))
|
||||
option(WITH_SSE2 "Enable SSE2 optimization." ON)
|
||||
|
|
|
@ -265,7 +265,7 @@ if(NOT IPP_FOUND)
|
|||
# Note, if several IPP installations found the newest version will be
|
||||
# selected
|
||||
# ------------------------------------------------------------------------
|
||||
foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH})
|
||||
foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt)
|
||||
set(curdir ${curdir}/intel)
|
||||
file(TO_CMAKE_PATH ${curdir} CURDIR)
|
||||
|
||||
|
@ -336,3 +336,33 @@ if(WIN32 AND MINGW AND NOT IPP_LATEST_VERSION_MAJOR LESS 7)
|
|||
set(MSV_NTDLL "ntdll")
|
||||
set(IPP_LIBRARIES ${IPP_LIBRARIES} ${MSV_NTDLL}${IPP_LIB_SUFFIX})
|
||||
endif()
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# This section will look for the IPP "compiler" dependent library
|
||||
# libiomp5.
|
||||
# ------------------------------------------------------------------------
|
||||
foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt)
|
||||
set(curdir ${curdir}/intel)
|
||||
|
||||
if(EXISTS ${curdir})
|
||||
file(GLOB_RECURSE liblist FOLLOW_SYMLINKS ${curdir}/libiomp5.*)
|
||||
foreach(lib ${liblist})
|
||||
get_filename_component(libdir ${lib} REALPATH)
|
||||
get_filename_component(libdir ${libdir} PATH)
|
||||
set(IPP_COMPILER_LIBRARY_DIRS ${libdir})
|
||||
set(IPP_COMPILER_LIBRARIES iomp5)
|
||||
endforeach(lib)
|
||||
endif()
|
||||
endforeach(curdir)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# Build fullpath library list.
|
||||
# ------------------------------------------------------------------------
|
||||
find_library(LIB_IPPI ippi PATHS ${IPP_LIBRARY_DIRS})
|
||||
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPI})
|
||||
find_library(LIB_IPPS ipps PATHS ${IPP_LIBRARY_DIRS})
|
||||
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPS})
|
||||
find_library(LIB_IPPCORE ippcore PATHS ${IPP_LIBRARY_DIRS})
|
||||
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPCORE})
|
||||
find_library(LIB_IOMP5 iomp5 PATHS ${IPP_COMPILER_LIBRARY_DIRS})
|
||||
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IOMP5})
|
||||
|
|
|
@ -36,6 +36,7 @@
|
|||
#cmakedefine WITH_PROFILER
|
||||
#cmakedefine WITH_SSE2
|
||||
#cmakedefine WITH_NEON
|
||||
#cmakedefine WITH_IPP
|
||||
#cmakedefine WITH_NATIVE_SSPI
|
||||
#cmakedefine WITH_JPEG
|
||||
#cmakedefine WITH_WIN8
|
||||
|
|
|
@ -101,8 +101,6 @@ struct _RFX_CONTEXT
|
|||
BYTE quant_idx_cr;
|
||||
|
||||
/* routines */
|
||||
void (*decode_ycbcr_to_rgb)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
|
||||
void (*encode_rgb_to_ycbcr)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
|
||||
void (*quantization_decode)(INT16* buffer, const UINT32* quantization_values);
|
||||
void (*quantization_encode)(INT16* buffer, const UINT32* quantization_values);
|
||||
void (*dwt_2d_decode)(INT16* buffer, INT16* dwt_buffer);
|
||||
|
|
|
@ -0,0 +1,207 @@
|
|||
/* primitives.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
# pragma once
|
||||
#endif
|
||||
|
||||
#ifndef __PRIMITIVES_H_INCLUDED__
|
||||
#define __PRIMITIVES_H_INCLUDED__
|
||||
|
||||
#include <freerdp/api.h>
|
||||
#include <freerdp/types.h>
|
||||
|
||||
typedef INT32 pstatus_t; /* match IppStatus. */
|
||||
#define PRIMITIVES_SUCCESS (0) /* match ippStsNoErr */
|
||||
|
||||
/* Simple macro for address of an x,y location in 2d 4-byte memory block */
|
||||
#define PIXMAP4_ADDR(_dst_, _x_, _y_, _span_) \
|
||||
((void *) (((BYTE *) (_dst_)) + (((_x_) + (_y_)*(_span_)) << 2)))
|
||||
|
||||
#define PRIM_X86_MMX_AVAILABLE (1U<<0)
|
||||
#define PRIM_X86_3DNOW_AVAILABLE (1U<<1)
|
||||
#define PRIM_X86_3DNOW_PREFETCH_AVAILABLE (1U<<2)
|
||||
#define PRIM_X86_SSE_AVAILABLE (1U<<3)
|
||||
#define PRIM_X86_SSE2_AVAILABLE (1U<<4)
|
||||
#define PRIM_X86_SSE3_AVAILABLE (1U<<5)
|
||||
#define PRIM_X86_SSSE3_AVAILABLE (1U<<6)
|
||||
#define PRIM_X86_SSE41_AVAILABLE (1U<<7)
|
||||
#define PRIM_X86_SSE42_AVAILABLE (1U<<8)
|
||||
#define PRIM_X86_AVX_AVAILABLE (1U<<9)
|
||||
#define PRIM_X86_FMA_AVAILABLE (1U<<10)
|
||||
#define PRIM_X86_AVX_AES_AVAILABLE (1U<<11)
|
||||
#define PRIM_X86_AVX2_AVAILABLE (1U<<12)
|
||||
|
||||
#define PRIM_ARM_VFP1_AVAILABLE (1U<<0)
|
||||
#define PRIM_ARM_VFP2_AVAILABLE (1U<<1)
|
||||
#define PRIM_ARM_VFP3_AVAILABLE (1U<<2)
|
||||
#define PRIM_ARM_VFP4_AVAILABLE (1U<<3)
|
||||
#define PRIM_ARM_FPA_AVAILABLE (1U<<4)
|
||||
#define PRIM_ARM_FPE_AVAILABLE (1U<<5)
|
||||
#define PRIM_ARM_IWMMXT_AVAILABLE (1U<<6)
|
||||
#define PRIM_ARM_NEON_AVAILABLE (1U<<7)
|
||||
|
||||
/* Structures compatible with IPP */
|
||||
typedef struct
|
||||
{
|
||||
INT32 width;
|
||||
INT32 height;
|
||||
} prim_size_t; /* like IppiSize */
|
||||
|
||||
/* Function prototypes for all of the supported primitives. */
|
||||
typedef pstatus_t (*__copy_t)(
|
||||
const void *pSrc,
|
||||
void *pDst,
|
||||
INT32 bytes);
|
||||
typedef pstatus_t (*__copy_8u_t)(
|
||||
const BYTE *pSrc,
|
||||
BYTE *pDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__copy_8u_AC4r_t)(
|
||||
const BYTE *pSrc,
|
||||
INT32 srcStep, /* bytes */
|
||||
BYTE *pDst,
|
||||
INT32 dstStep, /* bytes */
|
||||
INT32 width, INT32 height); /* pixels */
|
||||
typedef pstatus_t (*__set_8u_t)(
|
||||
BYTE val,
|
||||
BYTE *pDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__set_32s_t)(
|
||||
INT32 val,
|
||||
INT32 *pDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__set_32u_t)(
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__zero_t)(
|
||||
void *pDst,
|
||||
size_t bytes);
|
||||
typedef pstatus_t (*__alphaComp_argb_t)(
|
||||
const BYTE *pSrc1, INT32 src1Step,
|
||||
const BYTE *pSrc2, INT32 src2Step,
|
||||
BYTE *pDst, INT32 dstStep,
|
||||
INT32 width, INT32 height);
|
||||
typedef pstatus_t (*__add_16s_t)(
|
||||
const INT16 *pSrc1,
|
||||
const INT16 *pSrc2,
|
||||
INT16 *pDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__lShiftC_16s_t)(
|
||||
const INT16 *pSrc,
|
||||
INT32 val,
|
||||
INT16 *pSrcDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__lShiftC_16u_t)(
|
||||
const UINT16 *pSrc,
|
||||
INT32 val,
|
||||
UINT16 *pSrcDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__rShiftC_16s_t)(
|
||||
const INT16 *pSrc,
|
||||
INT32 val,
|
||||
INT16 *pSrcDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__rShiftC_16u_t)(
|
||||
const UINT16 *pSrc,
|
||||
INT32 val,
|
||||
UINT16 *pSrcDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__shiftC_16s_t)(
|
||||
const INT16 *pSrc,
|
||||
INT32 val,
|
||||
INT16 *pSrcDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__shiftC_16u_t)(
|
||||
const UINT16 *pSrc,
|
||||
INT32 val,
|
||||
UINT16 *pSrcDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__sign_16s_t)(
|
||||
const INT16 *pSrc,
|
||||
INT16 *pDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__yCbCrToRGB_16s16s_P3P3_t)(
|
||||
const INT16 *pSrc[3], INT32 srcStep,
|
||||
INT16 *pDst[3], INT32 dstStep,
|
||||
const prim_size_t *roi);
|
||||
typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)(
|
||||
const INT16 *pSrc[3], INT32 srcStep,
|
||||
INT16 *pDst[3], INT32 dstStep,
|
||||
const prim_size_t *roi);
|
||||
typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
|
||||
const INT16 *pSrc[3], INT32 srcStep,
|
||||
BYTE *pDst, INT32 dstStep,
|
||||
const prim_size_t *roi);
|
||||
typedef pstatus_t (*__andC_32u_t)(
|
||||
const UINT32 *pSrc,
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len);
|
||||
typedef pstatus_t (*__orC_32u_t)(
|
||||
const UINT32 *pSrc,
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/* Memory-to-memory copy routines */
|
||||
__copy_t copy; /* memcpy/memmove, basically */
|
||||
__copy_8u_t copy_8u; /* more strongly typed */
|
||||
__copy_8u_AC4r_t copy_8u_AC4r; /* pixel copy function */
|
||||
/* Memory setting routines */
|
||||
__set_8u_t set_8u; /* memset, basically */
|
||||
__set_32s_t set_32s;
|
||||
__set_32u_t set_32u;
|
||||
__zero_t zero; /* bzero or faster */
|
||||
/* Arithmetic functions */
|
||||
__add_16s_t add_16s;
|
||||
/* And/or */
|
||||
__andC_32u_t andC_32u;
|
||||
__orC_32u_t orC_32u;
|
||||
/* Shifts */
|
||||
__lShiftC_16s_t lShiftC_16s;
|
||||
__lShiftC_16u_t lShiftC_16u;
|
||||
__rShiftC_16s_t rShiftC_16s;
|
||||
__rShiftC_16u_t rShiftC_16u;
|
||||
__shiftC_16s_t shiftC_16s;
|
||||
__shiftC_16u_t shiftC_16u;
|
||||
/* Alpha Composition */
|
||||
__alphaComp_argb_t alphaComp_argb;
|
||||
/* Sign */
|
||||
__sign_16s_t sign_16s;
|
||||
/* Color conversions */
|
||||
__yCbCrToRGB_16s16s_P3P3_t yCbCrToRGB_16s16s_P3P3;
|
||||
__RGBToYCbCr_16s16s_P3P3_t RGBToYCbCr_16s16s_P3P3;
|
||||
__RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R;
|
||||
|
||||
/* internal use for CPU flags and such. */
|
||||
void *hints;
|
||||
} primitives_t;
|
||||
|
||||
/* Prototypes for the externally-visible entrypoints. */
|
||||
FREERDP_API void primitives_init(void);
|
||||
FREERDP_API primitives_t *primitives_get(void);
|
||||
FREERDP_API UINT32 primitives_get_flags(
|
||||
const primitives_t *prims);
|
||||
FREERDP_API void primitives_flags_str(
|
||||
const primitives_t *prims,
|
||||
char *str,
|
||||
size_t len);
|
||||
FREERDP_API void primitives_deinit(void);
|
||||
|
||||
#endif /* !__PRIMITIVES_H_INCLUDED__ */
|
|
@ -31,6 +31,7 @@ set(${MODULE_PREFIX}_SUBMODULES
|
|||
codec
|
||||
crypto
|
||||
locale
|
||||
primitives
|
||||
core)
|
||||
|
||||
foreach(${MODULE_PREFIX}_SUBMODULE ${${MODULE_PREFIX}_SUBMODULES})
|
||||
|
|
|
@ -79,7 +79,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context)
|
|||
PROFILER_CREATE(context->priv->prof_rfx_differential_decode, "rfx_differential_decode");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_decode_ycbcr_to_rgb, "rfx_decode_ycbcr_to_rgb");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_ycbcr_to_rgb, "prims->yCbCrToRGB");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_decode_format_rgb, "rfx_decode_format_rgb");
|
||||
|
||||
PROFILER_CREATE(context->priv->prof_rfx_encode_rgb, "rfx_encode_rgb");
|
||||
|
@ -88,7 +88,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context)
|
|||
PROFILER_CREATE(context->priv->prof_rfx_differential_encode, "rfx_differential_encode");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_encode_rgb_to_ycbcr, "rfx_encode_rgb_to_ycbcr");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_rgb_to_ycbcr, "prims->RGBToYCbCr");
|
||||
PROFILER_CREATE(context->priv->prof_rfx_encode_format_rgb, "rfx_encode_format_rgb");
|
||||
}
|
||||
|
||||
|
@ -100,7 +100,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context)
|
|||
PROFILER_FREE(context->priv->prof_rfx_differential_decode);
|
||||
PROFILER_FREE(context->priv->prof_rfx_quantization_decode);
|
||||
PROFILER_FREE(context->priv->prof_rfx_dwt_2d_decode);
|
||||
PROFILER_FREE(context->priv->prof_rfx_decode_ycbcr_to_rgb);
|
||||
PROFILER_FREE(context->priv->prof_rfx_ycbcr_to_rgb);
|
||||
PROFILER_FREE(context->priv->prof_rfx_decode_format_rgb);
|
||||
|
||||
PROFILER_FREE(context->priv->prof_rfx_encode_rgb);
|
||||
|
@ -109,7 +109,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context)
|
|||
PROFILER_FREE(context->priv->prof_rfx_differential_encode);
|
||||
PROFILER_FREE(context->priv->prof_rfx_quantization_encode);
|
||||
PROFILER_FREE(context->priv->prof_rfx_dwt_2d_encode);
|
||||
PROFILER_FREE(context->priv->prof_rfx_encode_rgb_to_ycbcr);
|
||||
PROFILER_FREE(context->priv->prof_rfx_rgb_to_ycbcr);
|
||||
PROFILER_FREE(context->priv->prof_rfx_encode_format_rgb);
|
||||
}
|
||||
|
||||
|
@ -123,7 +123,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context)
|
|||
PROFILER_PRINT(context->priv->prof_rfx_differential_decode);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_quantization_decode);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_decode);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_decode_ycbcr_to_rgb);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_ycbcr_to_rgb);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_decode_format_rgb);
|
||||
|
||||
PROFILER_PRINT(context->priv->prof_rfx_encode_rgb);
|
||||
|
@ -132,7 +132,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context)
|
|||
PROFILER_PRINT(context->priv->prof_rfx_differential_encode);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_quantization_encode);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_encode);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_encode_rgb_to_ycbcr);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_rgb_to_ycbcr);
|
||||
PROFILER_PRINT(context->priv->prof_rfx_encode_format_rgb);
|
||||
|
||||
PROFILER_PRINT_FOOTER;
|
||||
|
@ -164,8 +164,6 @@ RFX_CONTEXT* rfx_context_new(void)
|
|||
rfx_profiler_create(context);
|
||||
|
||||
/* set up default routines */
|
||||
context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb;
|
||||
context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr;
|
||||
context->quantization_decode = rfx_quantization_decode;
|
||||
context->quantization_encode = rfx_quantization_encode;
|
||||
context->dwt_2d_decode = rfx_dwt_2d_decode;
|
||||
|
@ -414,7 +412,7 @@ static void rfx_process_message_tile(RFX_CONTEXT* context, RFX_TILE* tile, STREA
|
|||
YLen, context->quants + (quantIdxY * 10),
|
||||
CbLen, context->quants + (quantIdxCb * 10),
|
||||
CrLen, context->quants + (quantIdxCr * 10),
|
||||
tile->data);
|
||||
tile->data, 64*sizeof(UINT32));
|
||||
}
|
||||
|
||||
static void rfx_process_message_tileset(RFX_CONTEXT* context, RFX_MESSAGE* message, STREAM* s)
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#include <string.h>
|
||||
|
||||
#include <freerdp/utils/stream.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "rfx_types.h"
|
||||
#include "rfx_rlgr.h"
|
||||
|
@ -36,49 +37,55 @@
|
|||
|
||||
#include "rfx_decode.h"
|
||||
|
||||
/* stride is bytes between rows in the output buffer. */
|
||||
static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf,
|
||||
RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf)
|
||||
RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf, int stride)
|
||||
{
|
||||
primitives_t *prims = primitives_get();
|
||||
INT16* r = r_buf;
|
||||
INT16* g = g_buf;
|
||||
INT16* b = b_buf;
|
||||
INT16* pSrc[3];
|
||||
static const prim_size_t roi_64x64 = { 64, 64 };
|
||||
BYTE* dst = dst_buf;
|
||||
int i;
|
||||
int x, y;
|
||||
|
||||
switch (pixel_format)
|
||||
{
|
||||
case RDP_PIXEL_FORMAT_B8G8R8A8:
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
*dst++ = (BYTE) (*b++);
|
||||
*dst++ = (BYTE) (*g++);
|
||||
*dst++ = (BYTE) (*r++);
|
||||
*dst++ = 0xFF;
|
||||
}
|
||||
pSrc[0] = r; pSrc[1] = g; pSrc[2] = b;
|
||||
prims->RGBToRGB_16s8u_P3AC4R(
|
||||
(const INT16 **) pSrc, 64*sizeof(INT16),
|
||||
dst, stride, &roi_64x64);
|
||||
break;
|
||||
case RDP_PIXEL_FORMAT_R8G8B8A8:
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
*dst++ = (BYTE) (*r++);
|
||||
*dst++ = (BYTE) (*g++);
|
||||
*dst++ = (BYTE) (*b++);
|
||||
*dst++ = 0xFF;
|
||||
}
|
||||
pSrc[0] = b; pSrc[1] = g; pSrc[2] = r;
|
||||
prims->RGBToRGB_16s8u_P3AC4R(
|
||||
(const INT16 **) pSrc, 64*sizeof(INT16),
|
||||
dst, stride, &roi_64x64);
|
||||
break;
|
||||
case RDP_PIXEL_FORMAT_B8G8R8:
|
||||
for (i = 0; i < 4096; i++)
|
||||
for (y=0; y<64; y++)
|
||||
{
|
||||
*dst++ = (BYTE) (*b++);
|
||||
*dst++ = (BYTE) (*g++);
|
||||
*dst++ = (BYTE) (*r++);
|
||||
for (x=0; x<64; x++)
|
||||
{
|
||||
*dst++ = (BYTE) (*b++);
|
||||
*dst++ = (BYTE) (*g++);
|
||||
*dst++ = (BYTE) (*r++);
|
||||
}
|
||||
dst += stride - (64*3);
|
||||
}
|
||||
break;
|
||||
case RDP_PIXEL_FORMAT_R8G8B8:
|
||||
for (i = 0; i < 4096; i++)
|
||||
for (y=0; y<64; y++)
|
||||
{
|
||||
*dst++ = (BYTE) (*r++);
|
||||
*dst++ = (BYTE) (*g++);
|
||||
*dst++ = (BYTE) (*b++);
|
||||
for (x=0; x<64; x++)
|
||||
{
|
||||
*dst++ = (BYTE) (*r++);
|
||||
*dst++ = (BYTE) (*g++);
|
||||
*dst++ = (BYTE) (*b++);
|
||||
}
|
||||
dst += stride - (64*3);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
@ -86,69 +93,6 @@ static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf,
|
|||
}
|
||||
}
|
||||
|
||||
#define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v)))
|
||||
|
||||
void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf)
|
||||
{
|
||||
/* INT32 is used intentionally because we calculate with shifted factors! */
|
||||
INT32 y, cb, cr;
|
||||
INT32 r, g, b;
|
||||
int i;
|
||||
|
||||
/**
|
||||
* The decoded YCbCr coeffectients are represented as 11.5 fixed-point numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
|
||||
* In other words, the decoded coeffectients is scaled by << 5 when intepreted as INT16.
|
||||
* It was scaled in the quantization phase, so we must scale it back here.
|
||||
*/
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
y = y_r_buf[i];
|
||||
cb = cb_g_buf[i];
|
||||
cr = cr_b_buf[i];
|
||||
|
||||
#if 0
|
||||
/**
|
||||
* This is the slow floating point version kept here for reference
|
||||
*/
|
||||
|
||||
y = y + 4096; /* 128<<5=4096 so that we can scale the sum by >> 5 */
|
||||
|
||||
r = y + cr*1.403f;
|
||||
g = y - cb*0.344f - cr*0.714f;
|
||||
b = y + cb*1.770f;
|
||||
|
||||
y_r_buf[i] = MINMAX(r>>5, 0, 255);
|
||||
cb_g_buf[i] = MINMAX(g>>5, 0, 255);
|
||||
cr_b_buf[i] = MINMAX(b>>5, 0, 255);
|
||||
#else
|
||||
/**
|
||||
* We scale the factors by << 16 into 32-bit integers in order to avoid slower
|
||||
* floating point multiplications. Since the final result needs to be scaled
|
||||
* by >> 5 we will extract only the upper 11 bits (>> 21) from the final sum.
|
||||
* Hence we also have to scale the other terms of the sum by << 16.
|
||||
*
|
||||
* R: 1.403 << 16 = 91947
|
||||
* G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
|
||||
* B: 1.770 << 16 = 115998
|
||||
*/
|
||||
|
||||
y = (y+4096)<<16;
|
||||
|
||||
r = y + cr*91947;
|
||||
g = y - cb*22544 - cr*46792;
|
||||
b = y + cb*115998;
|
||||
|
||||
y_r_buf[i] = MINMAX(r>>21, 0, 255);
|
||||
cb_g_buf[i] = MINMAX(g>>21, 0, 255);
|
||||
cr_b_buf[i] = MINMAX(b>>21, 0, 255);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantization_values,
|
||||
const BYTE* data, int size, INT16* buffer)
|
||||
{
|
||||
|
@ -173,11 +117,18 @@ static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantizatio
|
|||
PROFILER_EXIT(context->priv->prof_rfx_decode_component);
|
||||
}
|
||||
|
||||
/* rfx_decode_ycbcr_to_rgb code now resides in the primitives library. */
|
||||
|
||||
/* stride is bytes between rows in the output buffer. */
|
||||
void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
|
||||
int y_size, const UINT32 * y_quants,
|
||||
int cb_size, const UINT32 * cb_quants,
|
||||
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer)
|
||||
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer, int stride)
|
||||
{
|
||||
static const prim_size_t roi_64x64 = { 64, 64 };
|
||||
const primitives_t *prims = primitives_get();
|
||||
INT16 *pSrcDst[3];
|
||||
|
||||
PROFILER_ENTER(context->priv->prof_rfx_decode_rgb);
|
||||
|
||||
rfx_decode_component(context, y_quants, stream_get_tail(data_in), y_size, context->priv->y_r_buffer); /* YData */
|
||||
|
@ -188,12 +139,16 @@ void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
|
|||
stream_seek(data_in, cr_size);
|
||||
|
||||
PROFILER_ENTER(context->priv->prof_rfx_decode_ycbcr_to_rgb);
|
||||
context->decode_ycbcr_to_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer);
|
||||
pSrcDst[0] = context->priv->y_r_buffer;
|
||||
pSrcDst[1] = context->priv->cb_g_buffer;
|
||||
pSrcDst[2] = context->priv->cr_b_buffer;
|
||||
prims->yCbCrToRGB_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16),
|
||||
pSrcDst, 64*sizeof(INT16), &roi_64x64);
|
||||
PROFILER_EXIT(context->priv->prof_rfx_decode_ycbcr_to_rgb);
|
||||
|
||||
PROFILER_ENTER(context->priv->prof_rfx_decode_format_rgb);
|
||||
rfx_decode_format_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer,
|
||||
context->pixel_format, rgb_buffer);
|
||||
context->pixel_format, rgb_buffer, stride);
|
||||
PROFILER_EXIT(context->priv->prof_rfx_decode_format_rgb);
|
||||
|
||||
PROFILER_EXIT(context->priv->prof_rfx_decode_rgb);
|
||||
|
|
|
@ -22,12 +22,12 @@
|
|||
|
||||
#include <freerdp/codec/rfx.h>
|
||||
|
||||
void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
|
||||
|
||||
/* stride is bytes between rows in the output buffer. */
|
||||
void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
|
||||
int y_size, const UINT32 * y_quants,
|
||||
int cb_size, const UINT32 * cb_quants,
|
||||
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer);
|
||||
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer,
|
||||
int stride);
|
||||
|
||||
#endif /* __RFX_DECODE_H */
|
||||
|
||||
|
|
|
@ -26,6 +26,8 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
#include "rfx_types.h"
|
||||
#include "rfx_rlgr.h"
|
||||
#include "rfx_differential.h"
|
||||
|
@ -180,47 +182,7 @@ static void rfx_encode_format_rgb(const BYTE* rgb_data, int width, int height, i
|
|||
}
|
||||
}
|
||||
|
||||
void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf)
|
||||
{
|
||||
/* INT32 is used intentionally because we calculate with shifted factors! */
|
||||
int i;
|
||||
INT32 r, g, b;
|
||||
INT32 y, cb, cr;
|
||||
|
||||
/**
|
||||
* The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
|
||||
* In other words, the encoded coefficients is scaled by << 5 when interpreted as INT16.
|
||||
* It will be scaled down to original during the quantization phase.
|
||||
*/
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
r = y_r_buf[i];
|
||||
g = cb_g_buf[i];
|
||||
b = cr_b_buf[i];
|
||||
|
||||
/*
|
||||
* We scale the factors by << 15 into 32-bit integers in order to avoid slower
|
||||
* floating point multiplications. Since the terms need to be scaled by << 5 we
|
||||
* simply scale the final sum by >> 10
|
||||
*
|
||||
* Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235, 0.114000 << 15 = 3735
|
||||
* Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868, 0.500590 << 15 = 16403
|
||||
* Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, 0.081282 << 15 = 2663
|
||||
*/
|
||||
|
||||
y = (r * 9798 + g * 19235 + b * 3735) >> 10;
|
||||
cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
|
||||
cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
|
||||
|
||||
y_r_buf[i] = MINMAX(y - 4096, -4096, 4095);
|
||||
cb_g_buf[i] = MINMAX(cb, -4096, 4095);
|
||||
cr_b_buf[i] = MINMAX(cr, -4096, 4095);
|
||||
}
|
||||
}
|
||||
/* rfx_encode_rgb_to_ycbcr code now resides in the primitives library. */
|
||||
|
||||
static void rfx_encode_component(RFX_CONTEXT* context, const UINT32* quantization_values,
|
||||
INT16* data, BYTE* buffer, int buffer_size, int* size)
|
||||
|
@ -250,6 +212,9 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h
|
|||
const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants,
|
||||
STREAM* data_out, int* y_size, int* cb_size, int* cr_size)
|
||||
{
|
||||
primitives_t *prims = primitives_get();
|
||||
INT16* pSrcDst[3];
|
||||
static const prim_size_t roi_64x64 = { 64, 64 };
|
||||
INT16* y_r_buffer = context->priv->y_r_buffer;
|
||||
INT16* cb_g_buffer = context->priv->cb_g_buffer;
|
||||
INT16* cr_b_buffer = context->priv->cr_b_buffer;
|
||||
|
@ -261,9 +226,13 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h
|
|||
context->pixel_format, context->palette, y_r_buffer, cb_g_buffer, cr_b_buffer);
|
||||
PROFILER_EXIT(context->priv->prof_rfx_encode_format_rgb);
|
||||
|
||||
PROFILER_ENTER(context->priv->prof_rfx_encode_rgb_to_ycbcr);
|
||||
context->encode_rgb_to_ycbcr(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer);
|
||||
PROFILER_EXIT(context->priv->prof_rfx_encode_rgb_to_ycbcr);
|
||||
PROFILER_ENTER(context->priv->prof_rfx_rgb_to_ycbcr);
|
||||
pSrcDst[0] = context->priv->y_r_buffer;
|
||||
pSrcDst[1] = context->priv->cb_g_buffer;
|
||||
pSrcDst[2] = context->priv->cr_b_buffer;
|
||||
prims->RGBToYCbCr_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16),
|
||||
pSrcDst, 64*sizeof(INT16), &roi_64x64);
|
||||
PROFILER_EXIT(context->priv->prof_rfx_rgb_to_ycbcr);
|
||||
|
||||
/* Ensure the buffer is reasonably large enough */
|
||||
stream_check_size(data_out, 4096);
|
||||
|
|
|
@ -22,8 +22,6 @@
|
|||
|
||||
#include <freerdp/codec/rfx.h>
|
||||
|
||||
void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
|
||||
|
||||
void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int height, int rowstride,
|
||||
const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants,
|
||||
STREAM* data_out, int* y_size, int* cb_size, int* cr_size);
|
||||
|
|
|
@ -35,56 +35,7 @@
|
|||
#include "cpu-features.h"
|
||||
#endif
|
||||
|
||||
void rfx_decode_YCbCr_to_RGB_NEON(INT16 * y_r_buffer, INT16 * cb_g_buffer, INT16 * cr_b_buffer)
|
||||
{
|
||||
int16x8_t zero = vdupq_n_s16(0);
|
||||
int16x8_t max = vdupq_n_s16(255);
|
||||
int16x8_t y_add = vdupq_n_s16(128);
|
||||
|
||||
int16x8_t* y_r_buf = (int16x8_t*) y_r_buffer;
|
||||
int16x8_t* cb_g_buf = (int16x8_t*) cb_g_buffer;
|
||||
int16x8_t* cr_b_buf = (int16x8_t*) cr_b_buffer;
|
||||
|
||||
int i;
|
||||
for (i = 0; i < 4096 / 8; i++)
|
||||
{
|
||||
int16x8_t y = vld1q_s16((INT16*) &y_r_buf[i]);
|
||||
y = vaddq_s16(y, y_add);
|
||||
|
||||
int16x8_t cr = vld1q_s16((INT16*) &cr_b_buf[i]);
|
||||
|
||||
// r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
|
||||
int16x8_t r = vaddq_s16(y, cr);
|
||||
r = vaddq_s16(r, vshrq_n_s16(cr, 2));
|
||||
r = vaddq_s16(r, vshrq_n_s16(cr, 3));
|
||||
r = vaddq_s16(r, vshrq_n_s16(cr, 5));
|
||||
r = vminq_s16(vmaxq_s16(r, zero), max);
|
||||
vst1q_s16((INT16*)&y_r_buf[i], r);
|
||||
|
||||
// cb = cb_g_buf[i];
|
||||
int16x8_t cb = vld1q_s16((INT16*)&cb_g_buf[i]);
|
||||
|
||||
// g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
|
||||
int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cb, 4));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cb, 5));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 1));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 3));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 4));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 5));
|
||||
g = vminq_s16(vmaxq_s16(g, zero), max);
|
||||
vst1q_s16((INT16*)&cb_g_buf[i], g);
|
||||
|
||||
// b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
|
||||
int16x8_t b = vaddq_s16(y, cb);
|
||||
b = vaddq_s16(b, vshrq_n_s16(cb, 1));
|
||||
b = vaddq_s16(b, vshrq_n_s16(cb, 2));
|
||||
b = vaddq_s16(b, vshrq_n_s16(cb, 6));
|
||||
b = vminq_s16(vmaxq_s16(b, zero), max);
|
||||
vst1q_s16((INT16*)&cr_b_buf[i], b);
|
||||
}
|
||||
|
||||
}
|
||||
/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
|
||||
|
||||
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const UINT32 factor)
|
||||
|
@ -338,11 +289,10 @@ void rfx_init_neon(RFX_CONTEXT * context)
|
|||
{
|
||||
DEBUG_RFX("Using NEON optimizations");
|
||||
|
||||
IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
|
||||
IF_PROFILER(context->priv->prof_rfx_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
|
||||
IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON");
|
||||
IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON");
|
||||
|
||||
context->decode_ycbcr_to_rgb = rfx_decode_YCbCr_to_RGB_NEON;
|
||||
context->quantization_decode = rfx_quantization_decode_NEON;
|
||||
context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
|
||||
}
|
||||
|
|
|
@ -21,36 +21,34 @@
|
|||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include <freerdp/primitives.h>
|
||||
#include "rfx_quantization.h"
|
||||
|
||||
static void rfx_quantization_decode_block(INT16* buffer, int buffer_size, UINT32 factor)
|
||||
static void rfx_quantization_decode_block(const primitives_t *prims, INT16* buffer, int buffer_size, UINT32 factor)
|
||||
{
|
||||
INT16* dst;
|
||||
|
||||
if (factor == 0)
|
||||
return;
|
||||
|
||||
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
|
||||
{
|
||||
*dst <<= factor;
|
||||
}
|
||||
prims->lShiftC_16s(buffer, factor, buffer, buffer_size);
|
||||
}
|
||||
|
||||
void rfx_quantization_decode(INT16* buffer, const UINT32* quantization_values)
|
||||
{
|
||||
/* Scale the values so that they are represented as 11.5 fixed-point number */
|
||||
rfx_quantization_decode_block(buffer, 4096, 5);
|
||||
const primitives_t *prims = primitives_get();
|
||||
|
||||
rfx_quantization_decode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */
|
||||
rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
|
||||
rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
|
||||
rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
|
||||
rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
|
||||
rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
|
||||
rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
|
||||
rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
|
||||
rfx_quantization_decode_block(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
|
||||
rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
|
||||
/* Scale the values so that they are represented as 11.5 fixed-point number */
|
||||
rfx_quantization_decode_block(prims, buffer, 4096, 5);
|
||||
|
||||
rfx_quantization_decode_block(prims, buffer, 1024, quantization_values[8] - 6); /* HL1 */
|
||||
rfx_quantization_decode_block(prims, buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
|
||||
rfx_quantization_decode_block(prims, buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
|
||||
rfx_quantization_decode_block(prims, buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
|
||||
rfx_quantization_decode_block(prims, buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
|
||||
rfx_quantization_decode_block(prims, buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
|
||||
rfx_quantization_decode_block(prims, buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
|
||||
rfx_quantization_decode_block(prims, buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
|
||||
rfx_quantization_decode_block(prims, buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
|
||||
rfx_quantization_decode_block(prims, buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
|
||||
}
|
||||
|
||||
static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32 factor)
|
||||
|
@ -62,6 +60,7 @@ static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32
|
|||
return;
|
||||
|
||||
half = (1 << (factor - 1));
|
||||
/* Could probably use prims->rShiftC_16s(dst+half, factor, dst, buffer_size); */
|
||||
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
|
||||
{
|
||||
*dst = (*dst + half) >> factor;
|
||||
|
|
|
@ -52,177 +52,8 @@ _mm_prefetch_buffer(char * buffer, int num_bytes)
|
|||
}
|
||||
}
|
||||
|
||||
static void rfx_decode_ycbcr_to_rgb_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer)
|
||||
{
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128i max = _mm_set1_epi16(255);
|
||||
|
||||
__m128i* y_r_buf = (__m128i*) y_r_buffer;
|
||||
__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
|
||||
__m128i* cr_b_buf = (__m128i*) cr_b_buffer;
|
||||
|
||||
__m128i y;
|
||||
__m128i cr;
|
||||
__m128i cb;
|
||||
__m128i r;
|
||||
__m128i g;
|
||||
__m128i b;
|
||||
|
||||
int i;
|
||||
|
||||
__m128i r_cr = _mm_set1_epi16(22986); // 1.403 << 14
|
||||
__m128i g_cb = _mm_set1_epi16(-5636); // -0.344 << 14
|
||||
__m128i g_cr = _mm_set1_epi16(-11698); // -0.714 << 14
|
||||
__m128i b_cb = _mm_set1_epi16(28999); // 1.770 << 14
|
||||
__m128i c4096 = _mm_set1_epi16(4096);
|
||||
|
||||
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
||||
{
|
||||
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
|
||||
}
|
||||
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++)
|
||||
{
|
||||
/*
|
||||
In order to use SSE2 signed 16-bit integer multiplication we need to convert
|
||||
the floating point factors to signed int without loosing information.
|
||||
The result of this multiplication is 32 bit and we have two SSE instructions
|
||||
that return either the hi or lo word.
|
||||
Thus we will multiply the factors by the highest possible 2^n, take the
|
||||
upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this
|
||||
result by multiplying it by 2^(16-n).
|
||||
For the given factors in the conversion matrix the best possible n is 14.
|
||||
|
||||
Example for calculating r:
|
||||
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
||||
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
||||
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
||||
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
||||
*/
|
||||
|
||||
/* y = (y_r_buf[i] + 4096) >> 2 */
|
||||
y = _mm_load_si128(&y_r_buf[i]);
|
||||
y = _mm_add_epi16(y, c4096);
|
||||
y = _mm_srai_epi16(y, 2);
|
||||
/* cb = cb_g_buf[i]; */
|
||||
cb = _mm_load_si128(&cb_g_buf[i]);
|
||||
/* cr = cr_b_buf[i]; */
|
||||
cr = _mm_load_si128(&cr_b_buf[i]);
|
||||
|
||||
/* (y + HIWORD(cr*22986)) >> 3 */
|
||||
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
|
||||
r = _mm_srai_epi16(r, 3);
|
||||
/* y_r_buf[i] = MINMAX(r, 0, 255); */
|
||||
_mm_between_epi16(r, zero, max);
|
||||
_mm_store_si128(&y_r_buf[i], r);
|
||||
|
||||
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
||||
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
|
||||
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
|
||||
g = _mm_srai_epi16(g, 3);
|
||||
/* cb_g_buf[i] = MINMAX(g, 0, 255); */
|
||||
_mm_between_epi16(g, zero, max);
|
||||
_mm_store_si128(&cb_g_buf[i], g);
|
||||
|
||||
/* (y + HIWORD(cb*28999)) >> 3 */
|
||||
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
|
||||
b = _mm_srai_epi16(b, 3);
|
||||
/* cr_b_buf[i] = MINMAX(b, 0, 255); */
|
||||
_mm_between_epi16(b, zero, max);
|
||||
_mm_store_si128(&cr_b_buf[i], b);
|
||||
}
|
||||
}
|
||||
|
||||
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
|
||||
static void rfx_encode_rgb_to_ycbcr_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer)
|
||||
{
|
||||
__m128i min = _mm_set1_epi16(-128 << 5);
|
||||
__m128i max = _mm_set1_epi16(127 << 5);
|
||||
|
||||
__m128i* y_r_buf = (__m128i*) y_r_buffer;
|
||||
__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
|
||||
__m128i* cr_b_buf = (__m128i*) cr_b_buffer;
|
||||
|
||||
__m128i y;
|
||||
__m128i cr;
|
||||
__m128i cb;
|
||||
__m128i r;
|
||||
__m128i g;
|
||||
__m128i b;
|
||||
|
||||
__m128i y_r = _mm_set1_epi16(9798); // 0.299000 << 15
|
||||
__m128i y_g = _mm_set1_epi16(19235); // 0.587000 << 15
|
||||
__m128i y_b = _mm_set1_epi16(3735); // 0.114000 << 15
|
||||
__m128i cb_r = _mm_set1_epi16(-5535); // -0.168935 << 15
|
||||
__m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
|
||||
__m128i cb_b = _mm_set1_epi16(16403); // 0.500590 << 15
|
||||
__m128i cr_r = _mm_set1_epi16(16377); // 0.499813 << 15
|
||||
__m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
|
||||
__m128i cr_b = _mm_set1_epi16(-2663); // -0.081282 << 15
|
||||
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
||||
{
|
||||
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
|
||||
}
|
||||
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++)
|
||||
{
|
||||
/*
|
||||
In order to use SSE2 signed 16-bit integer multiplication we need to convert
|
||||
the floating point factors to signed int without loosing information.
|
||||
The result of this multiplication is 32 bit and using SSE2 we get either the
|
||||
product's hi or lo word.
|
||||
Thus we will multiply the factors by the highest possible 2^n and take the
|
||||
upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
|
||||
Since the final result needs to be scaled by << 5 and also in in order to keep
|
||||
the precision within the upper 16 bits we will also have to scale the RGB
|
||||
values used in the multiplication by << 5+(16-n).
|
||||
*/
|
||||
|
||||
/* r = y_r_buf[i]; */
|
||||
r = _mm_load_si128(&y_r_buf[i]);
|
||||
|
||||
/* g = cb_g_buf[i]; */
|
||||
g = _mm_load_si128(&cb_g_buf[i]);
|
||||
|
||||
/* b = cr_b_buf[i]; */
|
||||
b = _mm_load_si128(&cr_b_buf[i]);
|
||||
|
||||
/* r<<6; g<<6; b<<6 */
|
||||
r = _mm_slli_epi16(r, 6);
|
||||
g = _mm_slli_epi16(g, 6);
|
||||
b = _mm_slli_epi16(b, 6);
|
||||
|
||||
/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
|
||||
y = _mm_mulhi_epi16(r, y_r);
|
||||
y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
|
||||
y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
|
||||
y = _mm_add_epi16(y, min);
|
||||
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
|
||||
_mm_between_epi16(y, min, max);
|
||||
_mm_store_si128(&y_r_buf[i], y);
|
||||
|
||||
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
|
||||
cb = _mm_mulhi_epi16(r, cb_r);
|
||||
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
|
||||
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
|
||||
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
|
||||
_mm_between_epi16(cb, min, max);
|
||||
_mm_store_si128(&cb_g_buf[i], cb);
|
||||
|
||||
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
|
||||
cr = _mm_mulhi_epi16(r, cr_r);
|
||||
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
|
||||
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
|
||||
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
|
||||
_mm_between_epi16(cr, min, max);
|
||||
_mm_store_si128(&cr_b_buf[i], cr);
|
||||
}
|
||||
}
|
||||
/* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */
|
||||
/* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */
|
||||
|
||||
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
rfx_quantization_decode_block_sse2(INT16* buffer, const int buffer_size, const UINT32 factor)
|
||||
|
@ -658,15 +489,11 @@ void rfx_init_sse2(RFX_CONTEXT* context)
|
|||
{
|
||||
DEBUG_RFX("Using SSE2 optimizations");
|
||||
|
||||
IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
|
||||
IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
|
||||
IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
|
||||
IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
|
||||
IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
|
||||
IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");
|
||||
|
||||
context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;
|
||||
context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;
|
||||
context->quantization_decode = rfx_quantization_decode_sse2;
|
||||
context->quantization_encode = rfx_quantization_encode_sse2;
|
||||
context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
|
||||
|
|
|
@ -60,7 +60,7 @@ struct _RFX_CONTEXT_PRIV
|
|||
PROFILER_DEFINE(prof_rfx_differential_decode);
|
||||
PROFILER_DEFINE(prof_rfx_quantization_decode);
|
||||
PROFILER_DEFINE(prof_rfx_dwt_2d_decode);
|
||||
PROFILER_DEFINE(prof_rfx_decode_ycbcr_to_rgb);
|
||||
PROFILER_DEFINE(prof_rfx_ycbcr_to_rgb);
|
||||
PROFILER_DEFINE(prof_rfx_decode_format_rgb);
|
||||
|
||||
PROFILER_DEFINE(prof_rfx_encode_rgb);
|
||||
|
@ -69,7 +69,7 @@ struct _RFX_CONTEXT_PRIV
|
|||
PROFILER_DEFINE(prof_rfx_differential_encode);
|
||||
PROFILER_DEFINE(prof_rfx_quantization_encode);
|
||||
PROFILER_DEFINE(prof_rfx_dwt_2d_encode);
|
||||
PROFILER_DEFINE(prof_rfx_encode_rgb_to_ycbcr);
|
||||
PROFILER_DEFINE(prof_rfx_rgb_to_ycbcr);
|
||||
PROFILER_DEFINE(prof_rfx_encode_format_rgb);
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
# FreeRDP: A Remote Desktop Protocol Client
|
||||
# libfreerdp-primitives cmake build script
|
||||
# vi:ts=4 sw=4:
|
||||
#
|
||||
# (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied. See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
|
||||
set(FREERDP_PRIMITIVE_CFILES
|
||||
prim_add.c
|
||||
prim_andor.c
|
||||
prim_alphaComp.c
|
||||
prim_colors.c
|
||||
prim_copy.c
|
||||
prim_set.c
|
||||
prim_shift.c
|
||||
prim_sign.c
|
||||
primitives.c
|
||||
)
|
||||
set(FREERDP_PRIMITIVE_HEADERS
|
||||
prim_internal.h
|
||||
)
|
||||
set(FREERDP_PRIMITIVE_SRCS
|
||||
${FREERDP_PRIMITIVE_CFILES}
|
||||
${FREERDP_PRIMITIVE_HEADERS}
|
||||
)
|
||||
|
||||
add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
|
||||
|
||||
### IPP Variable debugging
|
||||
if(WITH_IPP)
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
foreach(INCLDIR ${IPP_INCLUDE_DIRS})
|
||||
set(OPTIMIZATION "${OPTIMIZATION} -I${INCLDIR}")
|
||||
endforeach(INCLDIR)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_SSE2)
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3 -Wdeclaration-after-statement")
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2")
|
||||
endif()
|
||||
elseif(WITH_NEON)
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
set(OPTIMZATION "${OPTIMIZATION} -mfpu=neon -mfloat-abi=softfp")
|
||||
endif()
|
||||
# TODO: Add MSVC equivalent
|
||||
endif()
|
||||
set_property(SOURCE ${FREERDP_PRIMITIVE_CFILES} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
|
||||
|
||||
if(WITH_MONOLITHIC_BUILD)
|
||||
add_library(freerdp-primitives OBJECT ${FREERDP_PRIMITIVE_SRCS})
|
||||
else()
|
||||
add_library(freerdp-primitives ${FREERDP_PRIMITIVE_SRCS})
|
||||
endif()
|
||||
|
||||
set_target_properties(freerdp-primitives PROPERTIES VERSION ${FREERDP_VERSION_FULL} SOVERSION ${FREERDP_VERSION} PREFIX "lib")
|
||||
|
||||
#set(FREERDP_PRIMITIVE_LIBS, rt)
|
||||
|
||||
if(WITH_MONOLITHIC_BUILD)
|
||||
set(FREERDP_LIBS ${FREERDP_LIBS} ${FREERDP_PRIMITIVE_LIBS} PARENT_SCOPE)
|
||||
else()
|
||||
target_link_libraries(freerdp-primitives ${FREERDP_PRIMITIVE_LIBS})
|
||||
install(TARGETS freerdp-primitives DESTINATION ${CMAKE_INSTALL_LIBDIR})
|
||||
endif()
|
||||
|
||||
if(BUILD_TESTING)
|
||||
add_subdirectory(test)
|
||||
endif(BUILD_TESTING)
|
|
@ -0,0 +1,113 @@
|
|||
The Primitives Library
|
||||
|
||||
Introduction
|
||||
------------
|
||||
The purpose of the primitives library is to give the freerdp code easy
|
||||
access to *run-time* optimization via SIMD operations. When the library
|
||||
is initialized, dynamic checks of processor features are run (such as
|
||||
the support of SSE3 or Neon), and entrypoints are linked to through
|
||||
function pointers to provide the fastest possible operations. All
|
||||
routines offer generic C alternatives as fallbacks.
|
||||
|
||||
Run-time optimization has the advantage of allowing a single executable
|
||||
to run fast on multiple platforms with different SIMD capabilities.
|
||||
|
||||
|
||||
Use In Code
|
||||
-----------
|
||||
A singleton pointing to a structure containing the function pointers
|
||||
is accessed through primitives_get(). The function pointers can then
|
||||
be used from that structure, e.g.
|
||||
|
||||
primitives_t *prims = primitives_get();
|
||||
prims->shiftC_16s(buffer, shifts, buffer, 256);
|
||||
|
||||
Of course, there is some overhead in calling through the function pointer
|
||||
and setting up the SIMD operations, so it would be counterproductive to
|
||||
call the primitives library for very small operation, e.g. initializing an
|
||||
array of eight values to a constant. The primitives library is intended
|
||||
for larger-scale operations, e.g. arrays of size 64 and larger.
|
||||
|
||||
|
||||
Initialization and Cleanup
|
||||
--------------------------
|
||||
Library initialization is done the first time primitives_init() is called
|
||||
or the first time primitives_get() is used. Cleanup (if any) is done by
|
||||
primitives_deinit().
|
||||
|
||||
|
||||
Intel Integrated Performance Primitives (IPP)
|
||||
---------------------------------------------
|
||||
If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
|
||||
calls will be used (where available) to fill the function pointers.
|
||||
Where possible, function names and parameter lists match IPP format so
|
||||
that the IPP functions can be plugged into the function pointers without
|
||||
a wrapper layer. Use of IPP is completely optional, and in many cases
|
||||
the SSE operations in the primitives library itself are faster or similar
|
||||
in performance.
|
||||
|
||||
|
||||
Coverage
|
||||
--------
|
||||
The primitives library is not meant to be comprehensive, offering
|
||||
entrypoints for every operation and operand type. Instead, the coverage
|
||||
is focused on operations known to be performance bottlenecks in the code.
|
||||
For instance, 16-bit signed operations are used widely in the RemoteFX
|
||||
software, so you'll find 16s versions of several operations, but there
|
||||
is no attempt to provide (unused) copies of the same code for 8u, 16u,
|
||||
32s, etc.
|
||||
|
||||
|
||||
New Optimizations
|
||||
-----------------
|
||||
As the need arises, new optimizations can be added to the library,
|
||||
including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
|
||||
The initialization routine is free to do any quick run-time test to
|
||||
determine which features are available before hooking the operation's
|
||||
function pointer, or it can simply look at the processor features list
|
||||
from the hints passed to the initialization routine.
|
||||
|
||||
|
||||
Adding Entrypoints
|
||||
------------------
|
||||
As the need for new operations or operands arises, new entrypoints can
|
||||
be added.
|
||||
1) Function prototypes and pointers are added to
|
||||
include/freerdp/primitives.h
|
||||
2) New module initialization and cleanup function prototypes are added
|
||||
to prim_internal.h and called in primitives.c (primitives_init()
|
||||
and primitives_deinit()).
|
||||
3) Operation names and parameter lists should be compatible with the IPP.
|
||||
IPP manuals are available online at software.intel.com.
|
||||
4) A generic C entrypoint must be available as a fallback.
|
||||
5) prim_templates.h contains macro-based templates for simple operations,
|
||||
such as applying a single SSE operation to arrays of data.
|
||||
The template functions can frequently be used to extend the
|
||||
operations without writing a lot of new code.
|
||||
|
||||
|
||||
Flags
|
||||
-----
|
||||
The entrypoint primitives_get_flags() returns a bitfield of processor flags
|
||||
(as defined in primitives.h) and primitives_flag_str() returns a string
|
||||
related to those processor flags, for debugging and information. The
|
||||
bitfield can be used elsewhere in the code as needed.
|
||||
|
||||
|
||||
Cache Management
|
||||
----------------
|
||||
I haven't found a lot of speed improvement by attempting prefetch, and
|
||||
in fact it seems to have a negative impact in some cases. Done correctly
|
||||
perhaps the routines could be further accelerated by proper use of prefetch,
|
||||
fences, etc.
|
||||
|
||||
|
||||
Testing
|
||||
-------
|
||||
In the test subdirectory is an executable (prim_test) that tests both
|
||||
functionality and speed of primitives library operations. Any new
|
||||
modules should be added to that test, following the conventions already
|
||||
established in that directory. The program can be executed on various
|
||||
target hardware to compare generic C, optimized, and IPP performance
|
||||
with various array sizes.
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Add operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_SSE2
|
||||
# include <emmintrin.h>
|
||||
# include <pmmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
#ifdef WITH_IPP
|
||||
# include <ipps.h>
|
||||
#endif /* WITH_IPP */
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 16-bit signed add with saturation (under and over).
|
||||
*/
|
||||
PRIM_STATIC pstatus_t general_add_16s(
|
||||
const INT16 *pSrc1,
|
||||
const INT16 *pSrc2,
|
||||
INT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
while (len--)
|
||||
{
|
||||
INT32 k = (INT32) (*pSrc1++) + (INT32) (*pSrc2++);
|
||||
if (k > 32767) *pDst++ = ((INT16) 32767);
|
||||
else if (k < -32768) *pDst++ = ((INT16) -32768);
|
||||
else *pDst++ = (INT16) k;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
|
||||
_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
|
||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_add(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
prims->add_16s = general_add_16s;
|
||||
#ifdef WITH_IPP
|
||||
prims->add_16s = (__add_16s_t) ippsAdd_16s;
|
||||
#elif defined(WITH_SSE2)
|
||||
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
|
||||
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
|
||||
{
|
||||
prims->add_16s = sse3_add_16s;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_add(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,298 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Alpha blending routines.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
* Note: this code assumes the second operand is fully opaque,
|
||||
* e.g.
|
||||
* newval = alpha1*val1 + (1-alpha1)*val2
|
||||
* rather than
|
||||
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
|
||||
* The IPP gives other options.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include "prim_internal.h"
|
||||
#ifdef WITH_SSE2
|
||||
# include <emmintrin.h>
|
||||
# include <pmmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
#ifdef WITH_IPP
|
||||
# include <ippi.h>
|
||||
#endif /* WITH_IPP */
|
||||
|
||||
#define ALPHA(_k_) (((_k_) & 0xFF000000U) >> 24)
|
||||
#define RED(_k_) (((_k_) & 0x00FF0000U) >> 16)
|
||||
#define GRN(_k_) (((_k_) & 0x0000FF00U) >> 8)
|
||||
#define BLU(_k_) (((_k_) & 0x000000FFU))
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_alphaComp_argb(
|
||||
const BYTE *pSrc1, INT32 src1Step,
|
||||
const BYTE *pSrc2, INT32 src2Step,
|
||||
BYTE *pDst, INT32 dstStep,
|
||||
INT32 width, INT32 height)
|
||||
{
|
||||
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
|
||||
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
|
||||
UINT32 *dptr = (UINT32 *) pDst;
|
||||
int linebytes = width * sizeof(UINT32);
|
||||
int src1Jump = (src1Step - linebytes) / sizeof(UINT32);
|
||||
int src2Jump = (src2Step - linebytes) / sizeof(UINT32);
|
||||
int dstJump = (dstStep - linebytes) / sizeof(UINT32);
|
||||
|
||||
int y;
|
||||
for (y=0; y<height; y++)
|
||||
{
|
||||
int x;
|
||||
for (x=0; x<width; x++)
|
||||
{
|
||||
const UINT32 src1 = *sptr1++;
|
||||
const UINT32 src2 = *sptr2++;
|
||||
UINT32 alpha = ALPHA(src1) + 1;
|
||||
if (alpha == 256)
|
||||
{
|
||||
/* If alpha is 255+1, just copy src1. */
|
||||
*dptr++ = src1;
|
||||
}
|
||||
else if (alpha <= 1)
|
||||
{
|
||||
/* If alpha is 0+1, just copy src2. */
|
||||
*dptr++ = src2;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* A perfectly accurate blend would do (a*src + (255-a)*dst)/255
|
||||
* rather than adding one to alpha and dividing by 256, but this
|
||||
* is much faster and only differs by one 16% of the time.
|
||||
* I'm not sure who first designed the double-ops trick
|
||||
* (Red Blue and Alpha Green).
|
||||
*/
|
||||
UINT32 rb, ag;
|
||||
UINT32 s2rb = src2 & 0x00FF00FFU;
|
||||
UINT32 s2ag = (src2 >> 8) & 0x00FF00FFU;
|
||||
UINT32 s1rb = src1 & 0x00FF00FFU;
|
||||
UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU;
|
||||
|
||||
UINT32 drb = s1rb - s2rb;
|
||||
UINT32 dag = s1ag - s2ag;
|
||||
drb *= alpha;
|
||||
dag *= alpha;
|
||||
|
||||
rb = ((drb >> 8) + s2rb) & 0x00FF00FFU;
|
||||
ag = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U;
|
||||
*dptr++ = rb | ag;
|
||||
}
|
||||
}
|
||||
sptr1 += src1Jump;
|
||||
sptr2 += src2Jump;
|
||||
dptr += dstJump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#ifdef WITH_SSE2
|
||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||
|
||||
PRIM_STATIC pstatus_t sse2_alphaComp_argb(
|
||||
const BYTE *pSrc1, INT32 src1Step,
|
||||
const BYTE *pSrc2, INT32 src2Step,
|
||||
BYTE *pDst, INT32 dstStep,
|
||||
INT32 width, INT32 height)
|
||||
{
|
||||
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
|
||||
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
|
||||
UINT32 *dptr;
|
||||
int linebytes, src1Jump, src2Jump, dstJump, y;
|
||||
__m128i xmm0, xmm1;
|
||||
|
||||
if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
|
||||
|
||||
if (width < 4) /* pointless if too small */
|
||||
{
|
||||
return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
|
||||
pDst, dstStep, width, height);
|
||||
}
|
||||
dptr = (UINT32 *) pDst;
|
||||
linebytes = width * sizeof(UINT32);
|
||||
src1Jump = (src1Step - linebytes) / sizeof(UINT32);
|
||||
src2Jump = (src2Step - linebytes) / sizeof(UINT32);
|
||||
dstJump = (dstStep - linebytes) / sizeof(UINT32);
|
||||
|
||||
xmm0 = _mm_set1_epi32(0);
|
||||
xmm1 = _mm_set1_epi16(1);
|
||||
|
||||
for (y=0; y<height; ++y)
|
||||
{
|
||||
int pixels = width;
|
||||
int count;
|
||||
|
||||
/* Get to the 16-byte boundary now. */
|
||||
int leadIn = 0;
|
||||
switch ((ULONG_PTR) dptr & 0x0f)
|
||||
{
|
||||
case 0:
|
||||
leadIn = 0;
|
||||
break;
|
||||
case 4:
|
||||
leadIn = 3;
|
||||
break;
|
||||
case 8:
|
||||
leadIn = 2;
|
||||
break;
|
||||
case 12:
|
||||
leadIn = 1;
|
||||
break;
|
||||
default:
|
||||
/* We'll never hit a 16-byte boundary, so do the whole
|
||||
* thing the slow way.
|
||||
*/
|
||||
leadIn = width;
|
||||
break;
|
||||
}
|
||||
if (leadIn)
|
||||
{
|
||||
general_alphaComp_argb((const BYTE *) sptr1,
|
||||
src1Step, (const BYTE *) sptr2, src2Step,
|
||||
(BYTE *) dptr, dstStep, leadIn, 1);
|
||||
sptr1 += leadIn;
|
||||
sptr2 += leadIn;
|
||||
dptr += leadIn;
|
||||
pixels -= leadIn;
|
||||
}
|
||||
|
||||
/* Use SSE registers to do 4 pixels at a time. */
|
||||
count = pixels >> 2;
|
||||
pixels -= count << 2;
|
||||
while (count--)
|
||||
{
|
||||
__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
|
||||
xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
|
||||
/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
|
||||
xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
|
||||
/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
|
||||
xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
|
||||
/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
|
||||
xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
|
||||
/* subtract */
|
||||
xmm6 = _mm_subs_epi16(xmm4, xmm5);
|
||||
/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
|
||||
xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
|
||||
/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
|
||||
xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
|
||||
/* Add one to alphas */
|
||||
xmm4 = _mm_adds_epi16(xmm4, xmm1);
|
||||
/* Multiply and take low word */
|
||||
xmm4 = _mm_mullo_epi16(xmm4, xmm6);
|
||||
/* Shift 8 right */
|
||||
xmm4 = _mm_srai_epi16(xmm4, 8);
|
||||
/* Add xmm5 */
|
||||
xmm4 = _mm_adds_epi16(xmm4, xmm5);
|
||||
/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
|
||||
|
||||
/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
|
||||
xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
|
||||
/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
|
||||
xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
|
||||
/* subtract */
|
||||
xmm7 = _mm_subs_epi16(xmm5, xmm6);
|
||||
/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
|
||||
xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
|
||||
/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
|
||||
xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
|
||||
/* Add one to alphas */
|
||||
xmm5 = _mm_adds_epi16(xmm5, xmm1);
|
||||
/* Multiply and take low word */
|
||||
xmm5 = _mm_mullo_epi16(xmm5, xmm7);
|
||||
/* Shift 8 right */
|
||||
xmm5 = _mm_srai_epi16(xmm5, 8);
|
||||
/* Add xmm6 */
|
||||
xmm5 = _mm_adds_epi16(xmm5, xmm6);
|
||||
/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
|
||||
|
||||
/* Must mask off remainders or pack gets confused */
|
||||
xmm3 = _mm_set1_epi16(0x00ffU);
|
||||
xmm4 = _mm_and_si128(xmm4, xmm3);
|
||||
xmm5 = _mm_and_si128(xmm5, xmm3);
|
||||
|
||||
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
|
||||
xmm5 = _mm_packus_epi16(xmm5, xmm4);
|
||||
_mm_store_si128((__m128i *) dptr, xmm5); dptr += 4;
|
||||
}
|
||||
|
||||
/* Finish off the remainder. */
|
||||
if (pixels)
|
||||
{
|
||||
general_alphaComp_argb((const BYTE *) sptr1, src1Step,
|
||||
(const BYTE *) sptr2, src2Step,
|
||||
(BYTE *) dptr, dstStep, pixels, 1);
|
||||
sptr1 += pixels;
|
||||
sptr2 += pixels;
|
||||
dptr += pixels;
|
||||
}
|
||||
|
||||
/* Jump to next row. */
|
||||
sptr1 += src1Jump;
|
||||
sptr2 += src2Jump;
|
||||
dptr += dstJump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||
#endif
|
||||
|
||||
#ifdef WITH_IPP
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t ipp_alphaComp_argb(
|
||||
const BYTE *pSrc1, INT32 src1Step,
|
||||
const BYTE *pSrc2, INT32 src2Step,
|
||||
BYTE *pDst, INT32 dstStep,
|
||||
INT32 width, INT32 height)
|
||||
{
|
||||
IppiSize sz;
|
||||
sz.width = width;
|
||||
sz.height = height;
|
||||
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
|
||||
pDst, dstStep, sz, ippAlphaOver);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_alphaComp(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
prims->alphaComp_argb = general_alphaComp_argb;
|
||||
#ifdef WITH_IPP
|
||||
prims->alphaComp_argb = ipp_alphaComp_argb;
|
||||
#elif defined(WITH_SSE2)
|
||||
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
|
||||
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
|
||||
{
|
||||
prims->alphaComp_argb = sse2_alphaComp_argb;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_alphaComp(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Logical operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_SSE2
|
||||
# include <emmintrin.h>
|
||||
# include <pmmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
#ifdef WITH_IPP
|
||||
# include <ipps.h>
|
||||
#endif /* WITH_IPP */
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 32-bit AND with a constant.
|
||||
*/
|
||||
PRIM_STATIC pstatus_t general_andC_32u(
|
||||
const UINT32 *pSrc,
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
while (len--) *pDst++ = *pSrc++ & val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 32-bit OR with a constant.
|
||||
*/
|
||||
PRIM_STATIC pstatus_t general_orC_32u(
|
||||
const UINT32 *pSrc,
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
while (len--) *pDst++ = *pSrc++ | val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
|
||||
_mm_and_si128, *dptr++ = *sptr++ & val)
|
||||
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
|
||||
_mm_or_si128, *dptr++ = *sptr++ | val)
|
||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_andor(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Start with the default. */
|
||||
prims->andC_32u = general_andC_32u;
|
||||
prims->orC_32u = general_orC_32u;
|
||||
#if defined(WITH_IPP)
|
||||
prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
|
||||
prims->orC_32u = (__orC_32u_t) ippsOrC_32u;
|
||||
#elif defined(WITH_SSE2)
|
||||
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
|
||||
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
|
||||
{
|
||||
prims->andC_32u = sse3_andC_32u;
|
||||
prims->orC_32u = sse3_orC_32u;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_andor(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,742 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Color conversion operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* Copyright 2011 Stephen Erisman
|
||||
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
|
||||
* Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include <config.h>
|
||||
#endif
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_SSE2
|
||||
#include <emmintrin.h>
|
||||
#elif WITH_NEON
|
||||
#include <arm_neon.h>
|
||||
#endif /* WITH_SSE2 else WITH_NEON */
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
#ifndef MINMAX
|
||||
#define MINMAX(_v_, _l_, _h_) \
|
||||
((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
|
||||
#endif /* !MINMAX */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
|
||||
const INT16 *pSrc[3], INT32 srcStep,
|
||||
INT16 *pDst[3], INT32 dstStep,
|
||||
const prim_size_t *roi) /* region of interest */
|
||||
{
|
||||
/**
|
||||
* The decoded YCbCr coeffectients are represented as 11.5 fixed-point
|
||||
* numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value range
|
||||
* is [-128.0, 127.0]. In other words, the decoded coefficients are scaled
|
||||
* by << 5 when interpreted as INT16.
|
||||
* It was scaled in the quantization phase, so we must scale it back here.
|
||||
*/
|
||||
const INT16 *yptr = pSrc[0];
|
||||
const INT16 *cbptr = pSrc[1];
|
||||
const INT16 *crptr = pSrc[2];
|
||||
INT16 *rptr = pDst[0];
|
||||
INT16 *gptr = pDst[1];
|
||||
INT16 *bptr = pDst[2];
|
||||
int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
||||
int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
||||
int y;
|
||||
|
||||
for (y=0; y<roi->height; y++)
|
||||
{
|
||||
int x;
|
||||
for (x=0; x<roi->width; ++x)
|
||||
{
|
||||
/* INT32 is used intentionally because we calculate
|
||||
* with shifted factors!
|
||||
*/
|
||||
INT32 y = (INT32) (*yptr++);
|
||||
INT32 cb = (INT32) (*cbptr++);
|
||||
INT32 cr = (INT32) (*crptr++);
|
||||
INT32 r,g,b;
|
||||
|
||||
/*
|
||||
* This is the slow floating point version kept here for reference.
|
||||
* y = y + 4096; // 128<<5=4096 so that we can scale the sum by>>5
|
||||
* r = y + cr*1.403f;
|
||||
* g = y - cb*0.344f - cr*0.714f;
|
||||
* b = y + cb*1.770f;
|
||||
* y_r_buf[i] = MINMAX(r>>5, 0, 255);
|
||||
* cb_g_buf[i] = MINMAX(g>>5, 0, 255);
|
||||
* cr_b_buf[i] = MINMAX(b>>5, 0, 255);
|
||||
*/
|
||||
|
||||
/*
|
||||
* We scale the factors by << 16 into 32-bit integers in order to
|
||||
* avoid slower floating point multiplications. Since the final
|
||||
* result needs to be scaled by >> 5 we will extract only the
|
||||
* upper 11 bits (>> 21) from the final sum.
|
||||
* Hence we also have to scale the other terms of the sum by << 16.
|
||||
* R: 1.403 << 16 = 91947
|
||||
* G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
|
||||
* B: 1.770 << 16 = 115998
|
||||
*/
|
||||
y = (y+4096)<<16;
|
||||
|
||||
r = y + cr*91947;
|
||||
g = y - cb*22544 - cr*46792;
|
||||
b = y + cb*115998;
|
||||
|
||||
*rptr++ = MINMAX(r>>21, 0, 255);
|
||||
*gptr++ = MINMAX(g>>21, 0, 255);
|
||||
*bptr++ = MINMAX(b>>21, 0, 255);
|
||||
}
|
||||
yptr += srcbump;
|
||||
cbptr += srcbump;
|
||||
crptr += srcbump;
|
||||
rptr += dstbump;
|
||||
gptr += dstbump;
|
||||
bptr += dstbump;
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
|
||||
const INT16 *pSrc[3], INT32 srcStep,
|
||||
INT16 *pDst[3], INT32 dstStep,
|
||||
const prim_size_t *roi) /* region of interest */
|
||||
{
|
||||
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
|
||||
* numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value
|
||||
* range is [-128.0, 127.0]. In other words, the encoded coefficients
|
||||
* is scaled by << 5 when interpreted as INT16.
|
||||
* It will be scaled down to original during the quantization phase.
|
||||
*/
|
||||
const INT16 *rptr = pSrc[0];
|
||||
const INT16 *gptr = pSrc[1];
|
||||
const INT16 *bptr = pSrc[2];
|
||||
INT16 *yptr = pDst[0];
|
||||
INT16 *cbptr = pDst[1];
|
||||
INT16 *crptr = pDst[2];
|
||||
int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
||||
int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
||||
int y;
|
||||
|
||||
for (y=0; y<roi->height; y++)
|
||||
{
|
||||
int x;
|
||||
for (x=0; x<roi->width; ++x)
|
||||
{
|
||||
/* INT32 is used intentionally because we calculate with
|
||||
* shifted factors!
|
||||
*/
|
||||
INT32 r = (INT32) (*rptr++);
|
||||
INT32 g = (INT32) (*gptr++);
|
||||
INT32 b = (INT32) (*bptr++);
|
||||
|
||||
/* We scale the factors by << 15 into 32-bit integers in order
|
||||
* to avoid slower floating point multiplications. Since the
|
||||
* terms need to be scaled by << 5 we simply scale the final
|
||||
* sum by >> 10
|
||||
*
|
||||
* Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235,
|
||||
* 0.114000 << 15 = 3735
|
||||
* Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868,
|
||||
* 0.500590 << 15 = 16403
|
||||
* Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714,
|
||||
* 0.081282 << 15 = 2663
|
||||
*/
|
||||
INT32 y = (r * 9798 + g * 19235 + b * 3735) >> 10;
|
||||
INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
|
||||
INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
|
||||
|
||||
*yptr++ = (INT16) MINMAX(y - 4096, -4096, 4095);
|
||||
*cbptr++ = (INT16) MINMAX(cb, -4096, 4095);
|
||||
*crptr++ = (INT16) MINMAX(cr, -4096, 4095);
|
||||
}
|
||||
yptr += srcbump;
|
||||
cbptr += srcbump;
|
||||
crptr += srcbump;
|
||||
rptr += dstbump;
|
||||
gptr += dstbump;
|
||||
bptr += dstbump;
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
|
||||
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
int srcStep, /* bytes between rows in source data */
|
||||
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
int dstStep, /* bytes between rows in dest data */
|
||||
const prim_size_t *roi) /* region of interest */
|
||||
{
|
||||
const INT16 *r = pSrc[0];
|
||||
const INT16 *g = pSrc[1];
|
||||
const INT16 *b = pSrc[2];
|
||||
BYTE *dst = pDst;
|
||||
int x,y;
|
||||
int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
||||
int dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
||||
|
||||
for (y=0; y<roi->height; ++y)
|
||||
{
|
||||
for (x=0; x<roi->width; ++x)
|
||||
{
|
||||
*dst++ = (BYTE) (*b++);
|
||||
*dst++ = (BYTE) (*g++);
|
||||
*dst++ = (BYTE) (*r++);
|
||||
*dst++ = ((BYTE) (0xFFU));
|
||||
}
|
||||
dst += dstbump;
|
||||
r += srcbump;
|
||||
g += srcbump;
|
||||
b += srcbump;
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
|
||||
#ifdef __GNUC__
|
||||
# define GNU_INLINE \
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
#else
|
||||
# define GNU_INLINE
|
||||
#endif
|
||||
|
||||
#define CACHE_LINE_BYTES 64
|
||||
|
||||
#define _mm_between_epi16(_val, _min, _max) \
|
||||
do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
|
||||
|
||||
#ifdef DO_PREFETCH
|
||||
/*---------------------------------------------------------------------------*/
|
||||
static inline void GNU_INLINE _mm_prefetch_buffer(
|
||||
char * buffer,
|
||||
int num_bytes)
|
||||
{
|
||||
__m128i * buf = (__m128i*) buffer;
|
||||
unsigned int i;
|
||||
for (i = 0; i < (num_bytes / sizeof(__m128i));
|
||||
i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
|
||||
{
|
||||
_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
|
||||
}
|
||||
}
|
||||
#endif /* DO_PREFETCH */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
|
||||
const INT16 *pSrc[3],
|
||||
int srcStep,
|
||||
INT16 *pDst[3],
|
||||
int dstStep,
|
||||
const prim_size_t *roi) /* region of interest */
|
||||
{
|
||||
__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
|
||||
__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
|
||||
int srcbump, dstbump, yp, imax;
|
||||
|
||||
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|
||||
|| (roi->width & 0x07)
|
||||
|| (srcStep & 127)
|
||||
|| (dstStep & 127))
|
||||
{
|
||||
/* We can't maintain 16-byte alignment. */
|
||||
return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
|
||||
pDst, dstStep, roi);
|
||||
}
|
||||
|
||||
zero = _mm_setzero_si128();
|
||||
max = _mm_set1_epi16(255);
|
||||
|
||||
y_buf = (__m128i*) (pSrc[0]);
|
||||
cb_buf = (__m128i*) (pSrc[1]);
|
||||
cr_buf = (__m128i*) (pSrc[2]);
|
||||
r_buf = (__m128i*) (pDst[0]);
|
||||
g_buf = (__m128i*) (pDst[1]);
|
||||
b_buf = (__m128i*) (pDst[2]);
|
||||
|
||||
r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
|
||||
g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
|
||||
g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
|
||||
b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
|
||||
c4096 = _mm_set1_epi16(4096);
|
||||
srcbump = srcStep / sizeof(__m128i);
|
||||
dstbump = dstStep / sizeof(__m128i);
|
||||
|
||||
#ifdef DO_PREFETCH
|
||||
/* Prefetch Y's, Cb's, and Cr's. */
|
||||
for (yp=0; yp<roi->height; yp++)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
|
||||
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
||||
{
|
||||
_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
|
||||
}
|
||||
y_buf += srcbump;
|
||||
cb_buf += srcbump;
|
||||
cr_buf += srcbump;
|
||||
}
|
||||
y_buf = (__m128i*) (pSrc[0]);
|
||||
cb_buf = (__m128i*) (pSrc[1]);
|
||||
cr_buf = (__m128i*) (pSrc[2]);
|
||||
#endif /* DO_PREFETCH */
|
||||
|
||||
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
|
||||
for (yp=0; yp<roi->height; ++yp)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<imax; i++)
|
||||
{
|
||||
/* In order to use SSE2 signed 16-bit integer multiplication
|
||||
* we need to convert the floating point factors to signed int
|
||||
* without losing information.
|
||||
* The result of this multiplication is 32 bit and we have two
|
||||
* SSE instructions that return either the hi or lo word.
|
||||
* Thus we will multiply the factors by the highest possible 2^n,
|
||||
* take the upper 16 bits of the signed 32-bit result
|
||||
* (_mm_mulhi_epi16) and correct this result by multiplying
|
||||
* it by 2^(16-n).
|
||||
*
|
||||
* For the given factors in the conversion matrix the best
|
||||
* possible n is 14.
|
||||
*
|
||||
* Example for calculating r:
|
||||
* r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
||||
* r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
||||
* r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
||||
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
||||
*/
|
||||
|
||||
/* y = (y_r_buf[i] + 4096) >> 2 */
|
||||
__m128i y, cb, cr, r, g, b;
|
||||
y = _mm_load_si128(y_buf + i);
|
||||
y = _mm_add_epi16(y, c4096);
|
||||
y = _mm_srai_epi16(y, 2);
|
||||
/* cb = cb_g_buf[i]; */
|
||||
cb = _mm_load_si128(cb_buf + i);
|
||||
/* cr = cr_b_buf[i]; */
|
||||
cr = _mm_load_si128(cr_buf + i);
|
||||
|
||||
/* (y + HIWORD(cr*22986)) >> 3 */
|
||||
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
|
||||
r = _mm_srai_epi16(r, 3);
|
||||
|
||||
/* r_buf[i] = MINMAX(r, 0, 255); */
|
||||
_mm_between_epi16(r, zero, max);
|
||||
_mm_store_si128(r_buf + i, r);
|
||||
|
||||
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
||||
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
|
||||
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
|
||||
g = _mm_srai_epi16(g, 3);
|
||||
|
||||
/* g_buf[i] = MINMAX(g, 0, 255); */
|
||||
_mm_between_epi16(g, zero, max);
|
||||
_mm_store_si128(g_buf + i, g);
|
||||
|
||||
/* (y + HIWORD(cb*28999)) >> 3 */
|
||||
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
|
||||
b = _mm_srai_epi16(b, 3);
|
||||
/* b_buf[i] = MINMAX(b, 0, 255); */
|
||||
_mm_between_epi16(b, zero, max);
|
||||
_mm_store_si128(b_buf + i, b);
|
||||
}
|
||||
y_buf += srcbump;
|
||||
cb_buf += srcbump;
|
||||
cr_buf += srcbump;
|
||||
r_buf += dstbump;
|
||||
g_buf += dstbump;
|
||||
b_buf += dstbump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
|
||||
* numbers. See the general code above.
|
||||
*/
|
||||
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
|
||||
const INT16 *pSrc[3],
|
||||
int srcStep,
|
||||
INT16 *pDst[3],
|
||||
int dstStep,
|
||||
const prim_size_t *roi) /* region of interest */
|
||||
{
|
||||
__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
|
||||
__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
|
||||
int srcbump, dstbump, yp, imax;
|
||||
|
||||
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|
||||
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|
||||
|| (roi->width & 0x07)
|
||||
|| (srcStep & 127)
|
||||
|| (dstStep & 127))
|
||||
{
|
||||
/* We can't maintain 16-byte alignment. */
|
||||
return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
|
||||
pDst, dstStep, roi);
|
||||
}
|
||||
|
||||
min = _mm_set1_epi16(-128 << 5);
|
||||
max = _mm_set1_epi16(127 << 5);
|
||||
|
||||
r_buf = (__m128i*) (pSrc[0]);
|
||||
g_buf = (__m128i*) (pSrc[1]);
|
||||
b_buf = (__m128i*) (pSrc[2]);
|
||||
y_buf = (__m128i*) (pDst[0]);
|
||||
cb_buf = (__m128i*) (pDst[1]);
|
||||
cr_buf = (__m128i*) (pDst[2]);
|
||||
|
||||
y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
|
||||
y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
|
||||
y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
|
||||
cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
|
||||
cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
|
||||
cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
|
||||
cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
|
||||
cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
|
||||
cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
|
||||
|
||||
srcbump = srcStep / sizeof(__m128i);
|
||||
dstbump = dstStep / sizeof(__m128i);
|
||||
|
||||
#ifdef DO_PREFETCH
|
||||
/* Prefetch RGB's. */
|
||||
for (yp=0; yp<roi->height; yp++)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
|
||||
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
||||
{
|
||||
_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
|
||||
_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
|
||||
}
|
||||
r_buf += srcbump;
|
||||
g_buf += srcbump;
|
||||
b_buf += srcbump;
|
||||
}
|
||||
r_buf = (__m128i*) (pSrc[0]);
|
||||
g_buf = (__m128i*) (pSrc[1]);
|
||||
b_buf = (__m128i*) (pSrc[2]);
|
||||
#endif /* DO_PREFETCH */
|
||||
|
||||
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
|
||||
for (yp=0; yp<roi->height; ++yp)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<imax; i++)
|
||||
{
|
||||
/* In order to use SSE2 signed 16-bit integer multiplication we
|
||||
* need to convert the floating point factors to signed int
|
||||
* without loosing information. The result of this multiplication
|
||||
* is 32 bit and using SSE2 we get either the product's hi or lo
|
||||
* word. Thus we will multiply the factors by the highest
|
||||
* possible 2^n and take the upper 16 bits of the signed 32-bit
|
||||
* result (_mm_mulhi_epi16). Since the final result needs to
|
||||
* be scaled by << 5 and also in in order to keep the precision
|
||||
* within the upper 16 bits we will also have to scale the RGB
|
||||
* values used in the multiplication by << 5+(16-n).
|
||||
*/
|
||||
__m128i r, g, b, y, cb, cr;
|
||||
r = _mm_load_si128(y_buf+i);
|
||||
g = _mm_load_si128(g_buf+i);
|
||||
b = _mm_load_si128(b_buf+i);
|
||||
|
||||
/* r<<6; g<<6; b<<6 */
|
||||
r = _mm_slli_epi16(r, 6);
|
||||
g = _mm_slli_epi16(g, 6);
|
||||
b = _mm_slli_epi16(b, 6);
|
||||
|
||||
/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
|
||||
y = _mm_mulhi_epi16(r, y_r);
|
||||
y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
|
||||
y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
|
||||
y = _mm_add_epi16(y, min);
|
||||
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
|
||||
_mm_between_epi16(y, min, max);
|
||||
_mm_store_si128(y_buf+i, y);
|
||||
|
||||
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
|
||||
cb = _mm_mulhi_epi16(r, cb_r);
|
||||
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
|
||||
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
|
||||
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
|
||||
_mm_between_epi16(cb, min, max);
|
||||
_mm_store_si128(cb_buf+i, cb);
|
||||
|
||||
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
|
||||
cr = _mm_mulhi_epi16(r, cr_r);
|
||||
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
|
||||
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
|
||||
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
|
||||
_mm_between_epi16(cr, min, max);
|
||||
_mm_store_si128(cr_buf+i, cr);
|
||||
}
|
||||
y_buf += srcbump;
|
||||
cb_buf += srcbump;
|
||||
cr_buf += srcbump;
|
||||
r_buf += dstbump;
|
||||
g_buf += dstbump;
|
||||
b_buf += dstbump;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
#define LOAD128(_src_) \
|
||||
_mm_load_si128((__m128i *) _src_)
|
||||
#define STORE128(_dst_, _src_) \
|
||||
_mm_store_si128((__m128i *) _dst_, _src_)
|
||||
#define PUNPCKLBW(_dst_, _src_) \
|
||||
_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
|
||||
#define PUNPCKHBW(_dst_, _src_) \
|
||||
_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
|
||||
#define PUNPCKLWD(_dst_, _src_) \
|
||||
_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
|
||||
#define PUNPCKHWD(_dst_, _src_) \
|
||||
_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
|
||||
#define PACKUSWB(_dst_, _src_) \
|
||||
_dst_ = _mm_packus_epi16(_dst_, _src_)
|
||||
#define PREFETCH(_ptr_) \
|
||||
_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
|
||||
#define XMM_ALL_ONES \
|
||||
_mm_set1_epi32(0xFFFFFFFFU)
|
||||
|
||||
PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
|
||||
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
|
||||
INT32 srcStep, /* bytes between rows in source data */
|
||||
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||
INT32 dstStep, /* bytes between rows in dest data */
|
||||
const prim_size_t *roi) /* region of interest */
|
||||
{
|
||||
const UINT16 *r = (const UINT16 *) (pSrc[0]);
|
||||
const UINT16 *g = (const UINT16 *) (pSrc[1]);
|
||||
const UINT16 *b = (const UINT16 *) (pSrc[2]);
|
||||
BYTE *out;
|
||||
int srcbump, dstbump, y;
|
||||
|
||||
/* Ensure 16-byte alignment on all pointers,
|
||||
* that width is a multiple of 8,
|
||||
* and that the next row will also remain aligned.
|
||||
* Since this is usually used for 64x64 aligned arrays,
|
||||
* these checks should presumably pass.
|
||||
*/
|
||||
if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
|
||||
|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
|
||||
|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
|
||||
|| (((ULONG_PTR) pDst & 0x0f) != 0)
|
||||
|| (roi->width & 0x0f)
|
||||
|| (srcStep & 0x0f)
|
||||
|| (dstStep & 0x0f))
|
||||
{
|
||||
return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
|
||||
}
|
||||
|
||||
out = (BYTE *) pDst;
|
||||
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
||||
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
||||
|
||||
for (y=0; y<roi->height; ++y)
|
||||
{
|
||||
int width = roi->width;
|
||||
do {
|
||||
__m128i R0, R1, R2, R3, R4;
|
||||
/* The comments below pretend these are 8-byte registers
|
||||
* rather than 16-byte, for readability.
|
||||
*/
|
||||
R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */
|
||||
R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */
|
||||
PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */
|
||||
R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */
|
||||
R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */
|
||||
PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */
|
||||
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
|
||||
PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */
|
||||
PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */
|
||||
R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */
|
||||
R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */
|
||||
PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */
|
||||
R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */
|
||||
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
|
||||
PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */
|
||||
PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */
|
||||
R0 = R4; /* R0 = R4 */
|
||||
PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */
|
||||
PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */
|
||||
R2 = R3; /* R2 = R3 */
|
||||
PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */
|
||||
PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */
|
||||
STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */
|
||||
STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */
|
||||
STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */
|
||||
STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */
|
||||
} while (width -= 16);
|
||||
/* Jump to next row. */
|
||||
r += srcbump;
|
||||
g += srcbump;
|
||||
b += srcbump;
|
||||
out += dstbump;
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
#ifdef WITH_NEON
|
||||
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
|
||||
const INT16 *pSrc[3],
|
||||
int srcStep,
|
||||
INT16 *pDst[3],
|
||||
int dstStep,
|
||||
const prim_size_t *roi) /* region of interest */
|
||||
{
|
||||
/* TODO: If necessary, check alignments and call the general version. */
|
||||
|
||||
int16x8_t zero = vdupq_n_s16(0);
|
||||
int16x8_t max = vdupq_n_s16(255);
|
||||
int16x8_t y_add = vdupq_n_s16(128);
|
||||
|
||||
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
|
||||
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
|
||||
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
|
||||
int16x8_t* r_buf = (int16x8_t*) pDst[0];
|
||||
int16x8_t* g_buf = (int16x8_t*) pDst[1];
|
||||
int16x8_t* b_buf = (int16x8_t*) pDst[2];
|
||||
|
||||
int srcbump = srcStep / sizeof(int16x8_t);
|
||||
int dstbump = dstStep / sizeof(int16x8_t);
|
||||
int yp;
|
||||
|
||||
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
|
||||
for (yp=0; yp<roi->height; ++yp)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<imax; i++)
|
||||
{
|
||||
int16x8_t y = vld1q_s16((INT16*) (y_buf+i));
|
||||
y = vaddq_s16(y, y_add);
|
||||
|
||||
int16x8_t cr = vld1q_s16((INT16*) (cr_buf+i));
|
||||
|
||||
/* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)),
|
||||
* 0, 255);
|
||||
*/
|
||||
int16x8_t r = vaddq_s16(y, cr);
|
||||
r = vaddq_s16(r, vshrq_n_s16(cr, 2));
|
||||
r = vaddq_s16(r, vshrq_n_s16(cr, 3));
|
||||
r = vaddq_s16(r, vshrq_n_s16(cr, 5));
|
||||
r = vminq_s16(vmaxq_s16(r, zero), max);
|
||||
vst1q_s16((INT16*) (r_buf+i), r);
|
||||
|
||||
/* cb = cb_g_buf[i]; */
|
||||
int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i));
|
||||
|
||||
/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1)
|
||||
* - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
|
||||
*/
|
||||
int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cb, 4));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cb, 5));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 1));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 3));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 4));
|
||||
g = vsubq_s16(g, vshrq_n_s16(cr, 5));
|
||||
g = vminq_s16(vmaxq_s16(g, zero), max);
|
||||
vst1q_s16((INT16*) (g_buf+i), g);
|
||||
|
||||
/* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)),
|
||||
* 0, 255);
|
||||
*/
|
||||
int16x8_t b = vaddq_s16(y, cb);
|
||||
b = vaddq_s16(b, vshrq_n_s16(cb, 1));
|
||||
b = vaddq_s16(b, vshrq_n_s16(cb, 2));
|
||||
b = vaddq_s16(b, vshrq_n_s16(cb, 6));
|
||||
b = vminq_s16(vmaxq_s16(b, zero), max);
|
||||
vst1q_s16((INT16*) (b_buf+i), b);
|
||||
}
|
||||
y_buf += srcbump;
|
||||
cb_buf += srcbump;
|
||||
cr_buf += srcbump;
|
||||
r_buf += dstbump;
|
||||
g_buf += dstbump;
|
||||
b_buf += dstbump;
|
||||
}
|
||||
}
|
||||
#endif /* WITH_NEON */
|
||||
|
||||
|
||||
/* I don't see a direct IPP version of this, since the input is INT16
|
||||
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
|
||||
* But that would likely be slower.
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_colors(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
|
||||
prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
|
||||
prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
|
||||
#if defined(WITH_SSE2)
|
||||
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
|
||||
prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
|
||||
prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
|
||||
}
|
||||
#elif defined(WITH_NEON)
|
||||
if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
|
||||
{
|
||||
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
|
||||
}
|
||||
#endif /* WITH_SSE2 */
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_colors(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,177 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Copy operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_IPP
|
||||
# include <ipps.h>
|
||||
# include <ippi.h>
|
||||
#endif /* WITH_IPP */
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*static inline BOOL memory_regions_overlap_1d(*/
|
||||
static BOOL memory_regions_overlap_1d(
|
||||
const BYTE *p1,
|
||||
const BYTE *p2,
|
||||
size_t bytes)
|
||||
{
|
||||
const ULONG_PTR p1m = (const ULONG_PTR) p1;
|
||||
const ULONG_PTR p2m = (const ULONG_PTR) p2;
|
||||
if (p1m <= p2m)
|
||||
{
|
||||
if (p1m + bytes > p2m) return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (p2m + bytes > p1m) return TRUE;
|
||||
}
|
||||
/* else */
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/*static inline BOOL memory_regions_overlap_2d( */
|
||||
static BOOL memory_regions_overlap_2d(
|
||||
const BYTE *p1, int p1Step, int p1Size,
|
||||
const BYTE *p2, int p2Step, int p2Size,
|
||||
int width, int height)
|
||||
{
|
||||
ULONG_PTR p1m = (ULONG_PTR) p1;
|
||||
ULONG_PTR p2m = (ULONG_PTR) p2;
|
||||
|
||||
if (p1m <= p2m)
|
||||
{
|
||||
ULONG_PTR p1mEnd = p1m + (height-1)*p1Step + width*p1Size;
|
||||
if (p1mEnd > p2m) return TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
ULONG_PTR p2mEnd = p2m + (height-1)*p2Step + width*p2Size;
|
||||
if (p2mEnd > p1m) return TRUE;
|
||||
}
|
||||
/* else */
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_copy_8u(
|
||||
const BYTE *pSrc,
|
||||
BYTE *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
if (memory_regions_overlap_1d(pSrc, pDst, (size_t) len))
|
||||
{
|
||||
memmove((void *) pDst, (const void *) pSrc, (size_t) len);
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy((void *) pDst, (const void *) pSrc, (size_t) len);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Copy a block of pixels from one buffer to another.
|
||||
* The addresses are assumed to have been already offset to the upper-left
|
||||
* corners of the source and destination region of interest.
|
||||
*/
|
||||
PRIM_STATIC pstatus_t general_copy_8u_AC4r(
|
||||
const BYTE *pSrc, INT32 srcStep,
|
||||
BYTE *pDst, INT32 dstStep,
|
||||
INT32 width, INT32 height)
|
||||
{
|
||||
primitives_t *prims = primitives_get();
|
||||
const BYTE *src = (const BYTE *) pSrc;
|
||||
BYTE *dst = (BYTE *) pDst;
|
||||
int rowbytes = width * sizeof(UINT32);
|
||||
|
||||
if ((width == 0) || (height == 0)) return PRIMITIVES_SUCCESS;
|
||||
|
||||
if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32),
|
||||
pDst, dstStep, sizeof(UINT32), width, height))
|
||||
{
|
||||
do {
|
||||
prims->copy(src, dst, rowbytes);
|
||||
src += srcStep;
|
||||
dst += dstStep;
|
||||
} while (--height);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* TODO: do it in one operation when the rowdata is adjacent. */
|
||||
do {
|
||||
/* If we find a replacement for memcpy that is consistently
|
||||
* faster, this could be replaced with that.
|
||||
*/
|
||||
memcpy(dst, src, rowbytes);
|
||||
src += srcStep;
|
||||
dst += dstStep;
|
||||
} while (--height);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef WITH_IPP
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* This is just ippiCopy_8u_AC4R without the IppiSize structure parameter. */
|
||||
static pstatus_t ippiCopy_8u_AC4r(
|
||||
const BYTE *pSrc, INT32 srcStep,
|
||||
BYTE *pDst, INT32 dstStep,
|
||||
INT32 width, INT32 height)
|
||||
{
|
||||
IppiSize roi;
|
||||
roi.width = width;
|
||||
roi.height = height;
|
||||
return (pstatus_t) ippiCopy_8u_AC4R(pSrc, srcStep, pDst, dstStep, roi);
|
||||
}
|
||||
#endif /* WITH_IPP */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_copy(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Start with the default. */
|
||||
prims->copy_8u = general_copy_8u;
|
||||
prims->copy_8u_AC4r = general_copy_8u_AC4r;
|
||||
|
||||
/* Pick tuned versions if possible. */
|
||||
#ifdef WITH_IPP
|
||||
prims->copy_8u = (__copy_8u_t) ippsCopy_8u;
|
||||
prims->copy_8u_AC4r = (__copy_8u_AC4r_t) ippiCopy_8u_AC4r;
|
||||
#endif
|
||||
/* Performance with an SSE2 version with no prefetch seemed to be
|
||||
* all over the map vs. memcpy.
|
||||
* Sometimes it was significantly faster, sometimes dreadfully slower,
|
||||
* and it seemed to vary a lot depending on block size and processor.
|
||||
* Hence, no SSE version is used here unless once can be written that
|
||||
* is consistently faster than memcpy.
|
||||
*/
|
||||
|
||||
/* This is just an alias with void* parameters */
|
||||
prims->copy = (__copy_t) (prims->copy_8u);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_copy(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,105 @@
|
|||
/* prim_internal.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
# pragma once
|
||||
#endif
|
||||
|
||||
#ifndef __PRIM_INTERNAL_H_INCLUDED__
|
||||
#define __PRIM_INTERNAL_H_INCLUDED__
|
||||
|
||||
#ifndef CMAKE_BUILD_TYPE
|
||||
#define CMAKE_BUILD_TYPE Release
|
||||
#endif
|
||||
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
/* Normally the internal entrypoints should be static, but a benchmark
|
||||
* program may want to access them directly and turn this off.
|
||||
*/
|
||||
#ifndef PRIM_STATIC
|
||||
# define PRIM_STATIC static
|
||||
#else
|
||||
# undef PRIM_STATIC
|
||||
# define PRIM_STATIC
|
||||
#endif /* !PRIM_STATIC */
|
||||
|
||||
/* Use lddqu for unaligned; load for 16-byte aligned. */
|
||||
#define LOAD_SI128(_ptr_) \
|
||||
(((ULONG_PTR) (_ptr_) & 0x0f) \
|
||||
? _mm_lddqu_si128((__m128i *) (_ptr_)) \
|
||||
: _mm_load_si128((__m128i *) (_ptr_)))
|
||||
|
||||
/* This structure can (eventually) be used to provide hints to the
|
||||
* initialization routines, e.g. whether SSE2 or NEON or IPP instructions
|
||||
* or calls are available.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
UINT32 x86_flags;
|
||||
UINT32 arm_flags;
|
||||
} primitives_hints_t;
|
||||
|
||||
/* Function prototypes for all the init/deinit routines. */
|
||||
extern void primitives_init_copy(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_copy(
|
||||
primitives_t *prims);
|
||||
|
||||
extern void primitives_init_set(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_set(
|
||||
primitives_t *prims);
|
||||
|
||||
extern void primitives_init_add(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_add(
|
||||
primitives_t *prims);
|
||||
|
||||
extern void primitives_init_andor(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_andor(
|
||||
primitives_t *prims);
|
||||
|
||||
extern void primitives_init_shift(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_shift(
|
||||
primitives_t *prims);
|
||||
|
||||
extern void primitives_init_sign(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_sign(
|
||||
primitives_t *prims);
|
||||
|
||||
extern void primitives_init_alphaComp(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_alphaComp(
|
||||
primitives_t *prims);
|
||||
|
||||
extern void primitives_init_colors(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims);
|
||||
extern void primitives_deinit_colors(
|
||||
primitives_t *prims);
|
||||
|
||||
#endif /* !__PRIM_INTERNAL_H_INCLUDED__ */
|
|
@ -0,0 +1,309 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Routines to set a chunk of memory to a constant.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_SSE2
|
||||
# include <emmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
#ifdef WITH_IPP
|
||||
# include <ipps.h>
|
||||
#endif /* WITH_IPP */
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* ========================================================================= */
|
||||
PRIM_STATIC pstatus_t general_set_8u(
|
||||
BYTE val,
|
||||
BYTE *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
memset((void *) pDst, (int) val, (size_t) len);
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_zero(
|
||||
void *pDst,
|
||||
size_t len)
|
||||
{
|
||||
memset(pDst, 0, len);
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#ifdef WITH_SSE2
|
||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||
PRIM_STATIC pstatus_t sse2_set_8u(
|
||||
BYTE val,
|
||||
BYTE *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
BYTE byte, *dptr;
|
||||
__m128i xmm0;
|
||||
size_t count;
|
||||
|
||||
if (len < 16) return general_set_8u(val, pDst, len);
|
||||
|
||||
byte = val;
|
||||
dptr = (BYTE *) pDst;
|
||||
|
||||
/* Seek 16-byte alignment. */
|
||||
while ((ULONG_PTR) dptr & 0x0f)
|
||||
{
|
||||
*dptr++ = byte;
|
||||
if (--len == 0) return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
xmm0 = _mm_set1_epi8(byte);
|
||||
|
||||
/* Cover 256-byte chunks via SSE register stores. */
|
||||
count = len >> 8;
|
||||
len -= count << 8;
|
||||
/* Do 256-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
}
|
||||
|
||||
/* Cover 16-byte chunks via SSE register stores. */
|
||||
count = len >> 4;
|
||||
len -= count << 4;
|
||||
/* Do 16-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
|
||||
}
|
||||
|
||||
/* Do leftover bytes. */
|
||||
while (len--) *dptr++ = byte;
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
/* ========================================================================= */
|
||||
PRIM_STATIC pstatus_t general_set_32s(
|
||||
INT32 val,
|
||||
INT32 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
INT32 *dptr = (INT32 *) pDst;
|
||||
size_t span, remaining;
|
||||
primitives_t *prims;
|
||||
|
||||
if (len < 256)
|
||||
{
|
||||
while (len--) *dptr++ = val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* else quadratic growth memcpy algorithm */
|
||||
span = 1;
|
||||
*dptr = val;
|
||||
remaining = len - 1;
|
||||
prims = primitives_get();
|
||||
while (remaining)
|
||||
{
|
||||
size_t thiswidth = span;
|
||||
if (thiswidth > remaining) thiswidth = remaining;
|
||||
prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2);
|
||||
remaining -= thiswidth;
|
||||
span <<= 1;
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_set_32u(
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
UINT32 *dptr = (UINT32 *) pDst;
|
||||
size_t span, remaining;
|
||||
primitives_t *prims;
|
||||
|
||||
if (len < 256)
|
||||
{
|
||||
while (len--) *dptr++ = val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* else quadratic growth memcpy algorithm */
|
||||
span = 1;
|
||||
*dptr = val;
|
||||
remaining = len - 1;
|
||||
prims = primitives_get();
|
||||
while (remaining)
|
||||
{
|
||||
size_t thiswidth = span;
|
||||
if (thiswidth > remaining) thiswidth = remaining;
|
||||
prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2);
|
||||
remaining -= thiswidth;
|
||||
span <<= 1;
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#ifdef WITH_SSE2
|
||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||
PRIM_STATIC pstatus_t sse2_set_32u(
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
UINT32 *dptr = (UINT32 *) pDst;
|
||||
__m128i xmm0;
|
||||
size_t count;
|
||||
|
||||
/* If really short, just do it here. */
|
||||
if (len < 32)
|
||||
{
|
||||
while (len--) *dptr++ = val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* Assure we can reach 16-byte alignment. */
|
||||
if (((ULONG_PTR) dptr & 0x03) != 0)
|
||||
{
|
||||
return general_set_32u(val, pDst, len);
|
||||
}
|
||||
|
||||
/* Seek 16-byte alignment. */
|
||||
while ((ULONG_PTR) dptr & 0x0f)
|
||||
{
|
||||
*dptr++ = val;
|
||||
if (--len == 0) return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
xmm0 = _mm_set1_epi32(val);
|
||||
|
||||
/* Cover 256-byte chunks via SSE register stores. */
|
||||
count = len >> 6;
|
||||
len -= count << 6;
|
||||
/* Do 256-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
}
|
||||
|
||||
/* Cover 16-byte chunks via SSE register stores. */
|
||||
count = len >> 2;
|
||||
len -= count << 2;
|
||||
/* Do 16-byte chunks using one XMM register. */
|
||||
while (count--)
|
||||
{
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
|
||||
}
|
||||
|
||||
/* Do leftover bytes. */
|
||||
while (len--) *dptr++ = val;
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t sse2_set_32s(
|
||||
INT32 val,
|
||||
INT32 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
UINT32 uval = *((UINT32 *) &val);
|
||||
return sse2_set_32u(uval, (UINT32 *) pDst, len);
|
||||
}
|
||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
#ifdef WITH_IPP
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t ipp_wrapper_set_32u(
|
||||
UINT32 val,
|
||||
UINT32 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
/* A little type conversion, then use the signed version. */
|
||||
INT32 sval = *((INT32 *) &val);
|
||||
return ippsSet_32s(sval, (INT32 *) pDst, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_set(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Start with the default. */
|
||||
prims->set_8u = general_set_8u;
|
||||
prims->set_32s = general_set_32s;
|
||||
prims->set_32u = general_set_32u;
|
||||
prims->zero = general_zero;
|
||||
|
||||
/* Pick tuned versions if possible. */
|
||||
#ifdef WITH_IPP
|
||||
prims->set_8u = (__set_8u_t) ippsSet_8u;
|
||||
prims->set_32s = (__set_32s_t) ippsSet_32s;
|
||||
prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
|
||||
prims->zero = (__zero_t) ippsZero_8u;
|
||||
#elif defined(WITH_SSE2)
|
||||
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
prims->set_8u = sse2_set_8u;
|
||||
prims->set_32s = sse2_set_32s;
|
||||
prims->set_32u = sse2_set_32u;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_set(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Shift operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_SSE2
|
||||
# include <emmintrin.h>
|
||||
# include <pmmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
#ifdef WITH_IPP
|
||||
# include <ipps.h>
|
||||
#endif /* WITH_IPP */
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_lShiftC_16s(
|
||||
const INT16 *pSrc,
|
||||
INT32 val,
|
||||
INT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
while (len--) *pDst++ = *pSrc++ << val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_rShiftC_16s(
|
||||
const INT16 *pSrc,
|
||||
INT32 val,
|
||||
INT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
while (len--) *pDst++ = *pSrc++ >> val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_lShiftC_16u(
|
||||
const UINT16 *pSrc,
|
||||
INT32 val,
|
||||
UINT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
while (len--) *pDst++ = *pSrc++ << val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_rShiftC_16u(
|
||||
const UINT16 *pSrc,
|
||||
INT32 val,
|
||||
UINT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
while (len--) *pDst++ = *pSrc++ >> val;
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
|
||||
_mm_slli_epi16, *dptr++ = *sptr++ << val)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
|
||||
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
|
||||
_mm_slli_epi16, *dptr++ = *sptr++ << val)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
|
||||
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
|
||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_shiftC_16s(
|
||||
const INT16 *pSrc,
|
||||
INT32 val,
|
||||
INT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
primitives_t *prims;
|
||||
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
prims = primitives_get();
|
||||
if (val < 0) return prims->rShiftC_16s(pSrc, -val, pDst, len);
|
||||
else return prims->lShiftC_16s(pSrc, val, pDst, len);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t general_shiftC_16u(
|
||||
const UINT16 *pSrc,
|
||||
INT32 val,
|
||||
UINT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
primitives_t *prims;
|
||||
|
||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||
prims = primitives_get();
|
||||
if (val < 0) return prims->rShiftC_16u(pSrc, -val, pDst, len);
|
||||
else return prims->lShiftC_16u(pSrc, val, pDst, len);
|
||||
}
|
||||
|
||||
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
|
||||
* depending on the sign of val. To avoid using the deprecated inplace
|
||||
* routines, a wrapper can use the src for the dest.
|
||||
*/
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_shift(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Start with the default. */
|
||||
prims->lShiftC_16s = general_lShiftC_16s;
|
||||
prims->rShiftC_16s = general_rShiftC_16s;
|
||||
prims->lShiftC_16u = general_lShiftC_16u;
|
||||
prims->rShiftC_16u = general_rShiftC_16u;
|
||||
#if defined(WITH_IPP)
|
||||
prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
|
||||
prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
|
||||
prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
|
||||
prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
|
||||
#elif defined(WITH_SSE2)
|
||||
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
|
||||
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
|
||||
{
|
||||
prims->lShiftC_16s = sse2_lShiftC_16s;
|
||||
prims->rShiftC_16s = sse2_rShiftC_16s;
|
||||
prims->lShiftC_16u = sse2_lShiftC_16u;
|
||||
prims->rShiftC_16u = sse2_rShiftC_16u;
|
||||
}
|
||||
#endif
|
||||
/* Wrappers */
|
||||
prims->shiftC_16s = general_shiftC_16s;
|
||||
prims->shiftC_16u = general_shiftC_16u;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_shift(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,170 @@
|
|||
/* FreeRDP: A Remote Desktop Protocol Client
|
||||
* Sign operations.
|
||||
* vi:ts=4 sw=4:
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <string.h>
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_SSE2
|
||||
# include <emmintrin.h>
|
||||
# include <tmmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
#include "prim_internal.h"
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
|
||||
*/
|
||||
PRIM_STATIC pstatus_t general_sign_16s(
|
||||
const INT16 *pSrc,
|
||||
INT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
while (len--)
|
||||
{
|
||||
INT16 src = *pSrc++;
|
||||
*pDst++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
/* ------------------------------------------------------------------------- */
|
||||
PRIM_STATIC pstatus_t ssse3_sign_16s(
|
||||
const INT16 *pSrc,
|
||||
INT16 *pDst,
|
||||
INT32 len)
|
||||
{
|
||||
const INT16 *sptr = (const INT16 *) pSrc;
|
||||
INT16 *dptr = (INT16 *) pDst;
|
||||
size_t count;
|
||||
|
||||
if (len < 16)
|
||||
{
|
||||
return general_sign_16s(pSrc, pDst, len);
|
||||
}
|
||||
|
||||
/* Check for 16-byte alignment (eventually). */
|
||||
if ((ULONG_PTR) pDst & 0x01)
|
||||
{
|
||||
return general_sign_16s(pSrc, pDst, len);
|
||||
}
|
||||
|
||||
/* Seek 16-byte alignment. */
|
||||
while ((ULONG_PTR) dptr & 0x0f)
|
||||
{
|
||||
INT16 src = *sptr++;
|
||||
*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
|
||||
if (--len == 0) return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* Do 32-short chunks using 8 XMM registers. */
|
||||
count = len >> 5; /* / 32 */
|
||||
len -= count << 5; /* * 32 */
|
||||
if ((ULONG_PTR) sptr & 0x0f)
|
||||
{
|
||||
/* Unaligned */
|
||||
while (count--)
|
||||
{
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
xmm0 = _mm_set1_epi16(0x0001U);
|
||||
xmm1 = _mm_set1_epi16(0x0001U);
|
||||
xmm2 = _mm_set1_epi16(0x0001U);
|
||||
xmm3 = _mm_set1_epi16(0x0001U);
|
||||
xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm0 = _mm_sign_epi16(xmm0, xmm4);
|
||||
xmm1 = _mm_sign_epi16(xmm1, xmm5);
|
||||
xmm2 = _mm_sign_epi16(xmm2, xmm6);
|
||||
xmm3 = _mm_sign_epi16(xmm3, xmm7);
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Aligned */
|
||||
while (count--)
|
||||
{
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
||||
xmm0 = _mm_set1_epi16(0x0001U);
|
||||
xmm1 = _mm_set1_epi16(0x0001U);
|
||||
xmm2 = _mm_set1_epi16(0x0001U);
|
||||
xmm3 = _mm_set1_epi16(0x0001U);
|
||||
xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
|
||||
xmm0 = _mm_sign_epi16(xmm0, xmm4);
|
||||
xmm1 = _mm_sign_epi16(xmm1, xmm5);
|
||||
xmm2 = _mm_sign_epi16(xmm2, xmm6);
|
||||
xmm3 = _mm_sign_epi16(xmm3, xmm7);
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
|
||||
}
|
||||
}
|
||||
|
||||
/* Do 8-short chunks using two XMM registers. */
|
||||
count = len >> 3;
|
||||
len -= count << 3;
|
||||
while (count--)
|
||||
{
|
||||
__m128i xmm0 = _mm_set1_epi16(0x0001U);
|
||||
__m128i xmm1 = LOAD_SI128(sptr); sptr += 8;
|
||||
xmm0 = _mm_sign_epi16(xmm0, xmm1);
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
|
||||
}
|
||||
|
||||
/* Do leftovers. */
|
||||
while (len--)
|
||||
{
|
||||
INT16 src = *sptr++;
|
||||
*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init_sign(
|
||||
const primitives_hints_t *hints,
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Start with the default. */
|
||||
prims->sign_16s = general_sign_16s;
|
||||
/* Pick tuned versions if possible. */
|
||||
/* I didn't spot an IPP version of this. */
|
||||
#if defined(WITH_SSE2)
|
||||
if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE)
|
||||
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
|
||||
{
|
||||
prims->sign_16s = ssse3_sign_16s;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit_sign(
|
||||
primitives_t *prims)
|
||||
{
|
||||
/* Nothing to do. */
|
||||
}
|
|
@ -0,0 +1,416 @@
|
|||
/* prim_templates.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
# pragma once
|
||||
#endif
|
||||
|
||||
#ifndef __PRIM_TEMPLATES_H_INCLUDED__
|
||||
#define __PRIM_TEMPLATES_H_INCLUDED__
|
||||
|
||||
/* These are prototypes for SSE (potentially NEON) routines that do a
|
||||
* simple SSE operation over an array of data. Since so much of this
|
||||
* code is shared except for the operation itself, these prototypes are
|
||||
* used rather than duplicating code. The naming convention depends on
|
||||
* the parameters: S=Source param; C=Constant; D=Destination.
|
||||
* All the macros have parameters for a fallback procedure if the data
|
||||
* is too small and an operation "the slow way" for use at 16-byte edges.
|
||||
*/
|
||||
|
||||
/* SSE3 note: If someone needs to support an SSE2 version of these without
|
||||
* SSE3 support, an alternative version could be added that merely checks
|
||||
* that 16-byte alignment on both destination and source(s) can be
|
||||
* achieved, rather than use LDDQU for unaligned reads.
|
||||
*/
|
||||
|
||||
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
|
||||
* It easily can't do that if the value is stored in a variable.
|
||||
* So don't save it as an intermediate value.
|
||||
*/
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* SCD = Source, Constant, Destination
|
||||
*/
|
||||
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
||||
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
|
||||
{ \
|
||||
int shifts; \
|
||||
UINT32 offBeatMask; \
|
||||
const _type_ *sptr = pSrc; \
|
||||
_type_ *dptr = pDst; \
|
||||
size_t count; \
|
||||
if (len < 16) /* pointless if too small */ \
|
||||
{ \
|
||||
return _fallback_(pSrc, val, pDst, len); \
|
||||
} \
|
||||
if (sizeof(_type_) == 1) shifts = 1; \
|
||||
else if (sizeof(_type_) == 2) shifts = 2; \
|
||||
else if (sizeof(_type_) == 4) shifts = 3; \
|
||||
else if (sizeof(_type_) == 8) shifts = 4; \
|
||||
offBeatMask = (1 << (shifts - 1)) - 1; \
|
||||
if ((ULONG_PTR) pDst & offBeatMask) \
|
||||
{ \
|
||||
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
||||
return _fallback_(pSrc, val, pDst, len); \
|
||||
} \
|
||||
/* Get to the 16-byte boundary now. */ \
|
||||
while ((ULONG_PTR) dptr & 0x0f) \
|
||||
{ \
|
||||
_slowWay_; \
|
||||
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
||||
} \
|
||||
/* Use 8 128-bit SSE registers. */ \
|
||||
count = len >> (8-shifts); \
|
||||
len -= count << (8-shifts); \
|
||||
if ((ULONG_PTR) sptr & 0x0f) \
|
||||
{ \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||
xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm0 = _op_(xmm0, val); \
|
||||
xmm1 = _op_(xmm1, val); \
|
||||
xmm2 = _op_(xmm2, val); \
|
||||
xmm3 = _op_(xmm3, val); \
|
||||
xmm4 = _op_(xmm4, val); \
|
||||
xmm5 = _op_(xmm5, val); \
|
||||
xmm6 = _op_(xmm6, val); \
|
||||
xmm7 = _op_(xmm7, val); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm5); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm6); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm7); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||
xmm0 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm1 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm2 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm3 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm4 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm5 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm6 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm7 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm0 = _op_(xmm0, val); \
|
||||
xmm1 = _op_(xmm1, val); \
|
||||
xmm2 = _op_(xmm2, val); \
|
||||
xmm3 = _op_(xmm3, val); \
|
||||
xmm4 = _op_(xmm4, val); \
|
||||
xmm5 = _op_(xmm5, val); \
|
||||
xmm6 = _op_(xmm6, val); \
|
||||
xmm7 = _op_(xmm7, val); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm5); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm6); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm7); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
} \
|
||||
/* Use a single 128-bit SSE register. */ \
|
||||
count = len >> (5-shifts); \
|
||||
len -= count << (5-shifts); \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
|
||||
xmm0 = _op_(xmm0, val); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
/* Finish off the remainder. */ \
|
||||
while (len--) { _slowWay_; } \
|
||||
return PRIMITIVES_SUCCESS; \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* SCD = Source, Constant, Destination
|
||||
* PRE = preload xmm0 with the constant.
|
||||
*/
|
||||
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
||||
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
|
||||
{ \
|
||||
int shifts; \
|
||||
UINT32 offBeatMask; \
|
||||
const _type_ *sptr = pSrc; \
|
||||
_type_ *dptr = pDst; \
|
||||
size_t count; \
|
||||
__m128i xmm0; \
|
||||
if (len < 16) /* pointless if too small */ \
|
||||
{ \
|
||||
return _fallback_(pSrc, val, pDst, len); \
|
||||
} \
|
||||
if (sizeof(_type_) == 1) shifts = 1; \
|
||||
else if (sizeof(_type_) == 2) shifts = 2; \
|
||||
else if (sizeof(_type_) == 4) shifts = 3; \
|
||||
else if (sizeof(_type_) == 8) shifts = 4; \
|
||||
offBeatMask = (1 << (shifts - 1)) - 1; \
|
||||
if ((ULONG_PTR) pDst & offBeatMask) \
|
||||
{ \
|
||||
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
||||
return _fallback_(pSrc, val, pDst, len); \
|
||||
} \
|
||||
/* Get to the 16-byte boundary now. */ \
|
||||
while ((ULONG_PTR) dptr & 0x0f) \
|
||||
{ \
|
||||
_slowWay_; \
|
||||
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
||||
} \
|
||||
/* Use 4 128-bit SSE registers. */ \
|
||||
count = len >> (7-shifts); \
|
||||
len -= count << (7-shifts); \
|
||||
xmm0 = _mm_set1_epi32(val); \
|
||||
if ((ULONG_PTR) sptr & 0x0f) \
|
||||
{ \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm1, xmm2, xmm3, xmm4; \
|
||||
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm1 = _op_(xmm1, xmm0); \
|
||||
xmm2 = _op_(xmm2, xmm0); \
|
||||
xmm3 = _op_(xmm3, xmm0); \
|
||||
xmm4 = _op_(xmm4, xmm0); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm1, xmm2, xmm3, xmm4; \
|
||||
xmm1 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm2 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm3 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm4 = _mm_load_si128((__m128i *) sptr); \
|
||||
sptr += (16/sizeof(_type_)); \
|
||||
xmm1 = _op_(xmm1, xmm0); \
|
||||
xmm2 = _op_(xmm2, xmm0); \
|
||||
xmm3 = _op_(xmm3, xmm0); \
|
||||
xmm4 = _op_(xmm4, xmm0); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
} \
|
||||
/* Use a single 128-bit SSE register. */ \
|
||||
count = len >> (5-shifts); \
|
||||
len -= count << (5-shifts); \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
|
||||
xmm1 = _op_(xmm1, xmm0); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
/* Finish off the remainder. */ \
|
||||
while (len--) { _slowWay_; } \
|
||||
return PRIMITIVES_SUCCESS; \
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* SSD = Source1, Source2, Destination
|
||||
*/
|
||||
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
||||
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
|
||||
{ \
|
||||
int shifts; \
|
||||
UINT32 offBeatMask; \
|
||||
const _type_ *sptr1 = pSrc1; \
|
||||
const _type_ *sptr2 = pSrc2; \
|
||||
_type_ *dptr = pDst; \
|
||||
size_t count; \
|
||||
if (len < 16) /* pointless if too small */ \
|
||||
{ \
|
||||
return _fallback_(pSrc1, pSrc2, pDst, len); \
|
||||
} \
|
||||
if (sizeof(_type_) == 1) shifts = 1; \
|
||||
else if (sizeof(_type_) == 2) shifts = 2; \
|
||||
else if (sizeof(_type_) == 4) shifts = 3; \
|
||||
else if (sizeof(_type_) == 8) shifts = 4; \
|
||||
offBeatMask = (1 << (shifts - 1)) - 1; \
|
||||
if ((ULONG_PTR) pDst & offBeatMask) \
|
||||
{ \
|
||||
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
||||
return _fallback_(pSrc1, pSrc2, pDst, len); \
|
||||
} \
|
||||
/* Get to the 16-byte boundary now. */ \
|
||||
while ((ULONG_PTR) dptr & 0x0f) \
|
||||
{ \
|
||||
_slowWay_; \
|
||||
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
||||
} \
|
||||
/* Use 4 128-bit SSE registers. */ \
|
||||
count = len >> (7-shifts); \
|
||||
len -= count << (7-shifts); \
|
||||
if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
|
||||
{ \
|
||||
/* Unaligned loads */ \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||
xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm0 = _op_(xmm0, xmm4); \
|
||||
xmm1 = _op_(xmm1, xmm5); \
|
||||
xmm2 = _op_(xmm2, xmm6); \
|
||||
xmm3 = _op_(xmm3, xmm7); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
/* Aligned loads */ \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||
xmm0 = _mm_load_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm1 = _mm_load_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm2 = _mm_load_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm3 = _mm_load_si128((__m128i *) sptr1); \
|
||||
sptr1 += (16/sizeof(_type_)); \
|
||||
xmm4 = _mm_load_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm5 = _mm_load_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm6 = _mm_load_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm7 = _mm_load_si128((__m128i *) sptr2); \
|
||||
sptr2 += (16/sizeof(_type_)); \
|
||||
xmm0 = _op_(xmm0, xmm4); \
|
||||
xmm1 = _op_(xmm1, xmm5); \
|
||||
xmm2 = _op_(xmm2, xmm6); \
|
||||
xmm3 = _op_(xmm3, xmm7); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
} \
|
||||
/* Use a single 128-bit SSE register. */ \
|
||||
count = len >> (5-shifts); \
|
||||
len -= count << (5-shifts); \
|
||||
while (count--) \
|
||||
{ \
|
||||
__m128i xmm0, xmm1; \
|
||||
xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
|
||||
xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
|
||||
xmm0 = _op_(xmm0, xmm1); \
|
||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||
dptr += (16/sizeof(_type_)); \
|
||||
} \
|
||||
/* Finish off the remainder. */ \
|
||||
while (len--) { _slowWay_; } \
|
||||
return PRIMITIVES_SUCCESS; \
|
||||
}
|
||||
|
||||
#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */
|
|
@ -0,0 +1,297 @@
|
|||
/* primitives.c
|
||||
* This code queries processor features and calls the init/deinit routines.
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#include "prim_internal.h"
|
||||
#ifdef ANDROID
|
||||
# include "cpu-features.h"
|
||||
#endif
|
||||
|
||||
/* Singleton pointer used throughout the program when requested. */
|
||||
static primitives_t *pPrimitives = NULL;
|
||||
|
||||
#define D_BIT_MMX (1<<23)
|
||||
#define D_BIT_SSE (1<<25)
|
||||
#define D_BIT_SSE2 (1<<26)
|
||||
#define D_BIT_3DN (1<<30)
|
||||
#define C_BIT_SSE3 (1<<0)
|
||||
#define C_BIT_3DNP (1<<8)
|
||||
#define C_BIT_SSSE3 (1<<9)
|
||||
#define C_BIT_SSE41 (1<<19)
|
||||
#define C_BIT_SSE42 (1<<20)
|
||||
#define C_BIT_XGETBV (1<<27)
|
||||
#define C_BIT_AVX (1<<28)
|
||||
#define C_BITS_AVX (C_BIT_XGETBV|C_BIT_AVX)
|
||||
#define E_BIT_XMM (1<<1)
|
||||
#define E_BIT_YMM (1<<2)
|
||||
#define E_BITS_AVX (E_BIT_XMM|E_BIT_YMM)
|
||||
#define C_BIT_FMA (1<<11)
|
||||
#define C_BIT_AVX_AES (1<<24)
|
||||
|
||||
/* If x86 */
|
||||
#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) \
|
||||
|| defined(__amd64__) || defined(_M_AMD64) || defined(_M_X64) \
|
||||
|| defined(i386) || defined(__i386) || defined(__i386__) \
|
||||
|| defined(_M_IX86) || defined(_X86_)
|
||||
#ifndef i386
|
||||
#define i386
|
||||
#endif
|
||||
|
||||
/* If GCC */
|
||||
# ifdef __GNUC__
|
||||
# define xgetbv(_func_, _lo_, _hi_) \
|
||||
__asm__ __volatile__ ("xgetbv" : "=a" (_lo_), "=d" (_hi_) : "c" (_func_))
|
||||
|
||||
static void cpuid(
|
||||
unsigned info,
|
||||
unsigned *eax,
|
||||
unsigned *ebx,
|
||||
unsigned *ecx,
|
||||
unsigned *edx)
|
||||
{
|
||||
*eax = *ebx = *ecx = *edx = 0;
|
||||
__asm volatile
|
||||
(
|
||||
/* The EBX (or RBX register on x86_64) is used for the PIC base address
|
||||
* and must not be corrupted by our inline assembly.
|
||||
*/
|
||||
# if defined(__i386__)
|
||||
"mov %%ebx, %%esi;"
|
||||
"cpuid;"
|
||||
"xchg %%ebx, %%esi;"
|
||||
#else
|
||||
"mov %%rbx, %%rsi;"
|
||||
"cpuid;"
|
||||
"xchg %%rbx, %%rsi;"
|
||||
#endif
|
||||
: "=a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx)
|
||||
: "0" (info)
|
||||
);
|
||||
}
|
||||
|
||||
static void set_hints(
|
||||
primitives_hints_t *hints)
|
||||
{
|
||||
unsigned a, b, c, d;
|
||||
cpuid(1, &a, &b, &c, &d);
|
||||
if (d & D_BIT_MMX) hints->x86_flags |= PRIM_X86_MMX_AVAILABLE;
|
||||
if (d & D_BIT_SSE) hints->x86_flags |= PRIM_X86_SSE_AVAILABLE;
|
||||
if (d & D_BIT_SSE2) hints->x86_flags |= PRIM_X86_SSE2_AVAILABLE;
|
||||
if (d & D_BIT_3DN) hints->x86_flags |= PRIM_X86_3DNOW_AVAILABLE;
|
||||
if (c & C_BIT_3DNP) hints->x86_flags |= PRIM_X86_3DNOW_PREFETCH_AVAILABLE;
|
||||
if (c & C_BIT_SSE3) hints->x86_flags |= PRIM_X86_SSE3_AVAILABLE;
|
||||
if (c & C_BIT_SSSE3) hints->x86_flags |= PRIM_X86_SSSE3_AVAILABLE;
|
||||
if (c & C_BIT_SSE41) hints->x86_flags |= PRIM_X86_SSE41_AVAILABLE;
|
||||
if (c & C_BIT_SSE42) hints->x86_flags |= PRIM_X86_SSE42_AVAILABLE;
|
||||
if ((c & C_BITS_AVX) == C_BITS_AVX)
|
||||
{
|
||||
int e, f;
|
||||
xgetbv(0, e, f);
|
||||
if ((e & E_BITS_AVX) == E_BITS_AVX)
|
||||
{
|
||||
hints->x86_flags |= PRIM_X86_AVX_AVAILABLE;
|
||||
if (c & C_BIT_FMA) hints->x86_flags |= PRIM_X86_FMA_AVAILABLE;
|
||||
if (c & C_BIT_AVX_AES) hints->x86_flags |= PRIM_X86_AVX_AES_AVAILABLE;
|
||||
}
|
||||
}
|
||||
/* TODO: AVX2: set eax=7, ecx=0, cpuid, check ebx-bit5 */
|
||||
}
|
||||
# else
|
||||
static void set_hints(
|
||||
primitives_hints_t *hints)
|
||||
{
|
||||
/* x86 non-GCC: TODO */
|
||||
}
|
||||
# endif /* __GNUC__ */
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#elif defined(__arm__) || defined(__ARM_ARCH_7A__) \
|
||||
|| defined(__ARM_EABI__) || defined(__ARMEL__) || defined(ANDROID)
|
||||
#ifndef __arm__
|
||||
#define __arm__
|
||||
#endif
|
||||
|
||||
static UINT32 androidNeon(void)
|
||||
{
|
||||
#if ANDROID
|
||||
if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) return 0;
|
||||
|
||||
UINT64 features = android_getCpuFeatures();
|
||||
|
||||
if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
|
||||
{
|
||||
if (features & ANDROID_CPU_ARM_FEATURE_NEON)
|
||||
{
|
||||
return PRIM_ARM_NEON_AVAILABLE;
|
||||
}
|
||||
}
|
||||
/* else */
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void set_hints(
|
||||
primitives_hints_t *hints)
|
||||
{
|
||||
/* ARM: TODO */
|
||||
hints->arm_flags |= androidNeon();
|
||||
}
|
||||
|
||||
#else
|
||||
static void set_hints(
|
||||
primitives_hints_t *hints)
|
||||
{
|
||||
}
|
||||
#endif /* x86 else ARM else */
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_init(void)
|
||||
{
|
||||
primitives_hints_t *hints;
|
||||
|
||||
if (pPrimitives == NULL)
|
||||
{
|
||||
pPrimitives = calloc(1, sizeof(primitives_t));
|
||||
if (pPrimitives == NULL) return;
|
||||
}
|
||||
|
||||
hints = calloc(1, sizeof(primitives_hints_t));
|
||||
set_hints(hints);
|
||||
pPrimitives->hints = (void *) hints;
|
||||
|
||||
/* Now call each section's initialization routine. */
|
||||
primitives_init_add(hints, pPrimitives);
|
||||
primitives_init_andor(hints, pPrimitives);
|
||||
primitives_init_alphaComp(hints, pPrimitives);
|
||||
primitives_init_copy(hints, pPrimitives);
|
||||
primitives_init_set(hints, pPrimitives);
|
||||
primitives_init_shift(hints, pPrimitives);
|
||||
primitives_init_sign(hints, pPrimitives);
|
||||
primitives_init_colors(hints, pPrimitives);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
primitives_t *primitives_get(void)
|
||||
{
|
||||
if (pPrimitives == NULL) primitives_init();
|
||||
return pPrimitives;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
UINT32 primitives_get_flags(
|
||||
const primitives_t *prims)
|
||||
{
|
||||
primitives_hints_t *hints = (primitives_hints_t *) (prims->hints);
|
||||
#ifdef i386
|
||||
return hints->x86_flags;
|
||||
#elif defined(__arm__)
|
||||
return hints->arm_flags;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_flags_str(
|
||||
const primitives_t *prims,
|
||||
char *str,
|
||||
size_t len)
|
||||
{
|
||||
typedef struct
|
||||
{
|
||||
UINT32 flag;
|
||||
const char *str;
|
||||
} flagpair_t;
|
||||
static const flagpair_t x86_flags[] =
|
||||
{
|
||||
{ PRIM_X86_MMX_AVAILABLE, "MMX" },
|
||||
{ PRIM_X86_3DNOW_AVAILABLE, "3DNow" },
|
||||
{ PRIM_X86_3DNOW_PREFETCH_AVAILABLE, "3DNow-PF" },
|
||||
{ PRIM_X86_SSE_AVAILABLE, "SSE" },
|
||||
{ PRIM_X86_SSE2_AVAILABLE, "SSE2" },
|
||||
{ PRIM_X86_SSE3_AVAILABLE, "SSE3" },
|
||||
{ PRIM_X86_SSSE3_AVAILABLE, "SSSE3" },
|
||||
{ PRIM_X86_SSE41_AVAILABLE, "SSE4.1" },
|
||||
{ PRIM_X86_SSE42_AVAILABLE, "SSE4.2" },
|
||||
{ PRIM_X86_AVX_AVAILABLE, "AVX" },
|
||||
{ PRIM_X86_FMA_AVAILABLE, "FMA" },
|
||||
{ PRIM_X86_AVX_AES_AVAILABLE, "AVX-AES" },
|
||||
{ PRIM_X86_AVX2_AVAILABLE, "AVX2" },
|
||||
};
|
||||
static const flagpair_t arm_flags[] =
|
||||
{
|
||||
{ PRIM_ARM_VFP1_AVAILABLE, "VFP1" },
|
||||
{ PRIM_ARM_VFP2_AVAILABLE, "VFP2" },
|
||||
{ PRIM_ARM_VFP3_AVAILABLE, "VFP3" },
|
||||
{ PRIM_ARM_VFP4_AVAILABLE, "VFP4" },
|
||||
{ PRIM_ARM_FPA_AVAILABLE, "FPA" },
|
||||
{ PRIM_ARM_FPE_AVAILABLE, "FPE" },
|
||||
{ PRIM_ARM_IWMMXT_AVAILABLE, "IWMMXT" },
|
||||
{ PRIM_ARM_NEON_AVAILABLE, "NEON" },
|
||||
};
|
||||
int i;
|
||||
primitives_hints_t *hints;
|
||||
|
||||
*str = '\0';
|
||||
--len; /* for the '/0' */
|
||||
|
||||
hints = (primitives_hints_t *) (prims->hints);
|
||||
for (i=0; i<sizeof(x86_flags)/sizeof(flagpair_t); ++i)
|
||||
{
|
||||
if (hints->x86_flags & x86_flags[i].flag)
|
||||
{
|
||||
int slen = strlen(x86_flags[i].str) + 1;
|
||||
if (len < slen) break;
|
||||
if (*str != '\0') strcat(str, " ");
|
||||
strcat(str, x86_flags[i].str);
|
||||
len -= slen;
|
||||
}
|
||||
}
|
||||
|
||||
for (i=0; i<sizeof(arm_flags)/sizeof(flagpair_t); ++i)
|
||||
{
|
||||
if (hints->arm_flags & arm_flags[i].flag)
|
||||
{
|
||||
int slen = strlen(arm_flags[i].str) + 1;
|
||||
if (len < slen) break;
|
||||
if (*str != '\0') strcat(str, " ");
|
||||
strcat(str, arm_flags[i].str);
|
||||
len -= slen;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void primitives_deinit(void)
|
||||
{
|
||||
if (pPrimitives == NULL) return;
|
||||
|
||||
/* Call each section's de-initialization routine. */
|
||||
primitives_deinit_add(pPrimitives);
|
||||
primitives_deinit_andor(pPrimitives);
|
||||
primitives_deinit_alphaComp(pPrimitives);
|
||||
primitives_deinit_copy(pPrimitives);
|
||||
primitives_deinit_set(pPrimitives);
|
||||
primitives_deinit_shift(pPrimitives);
|
||||
primitives_deinit_sign(pPrimitives);
|
||||
primitives_deinit_colors(pPrimitives);
|
||||
|
||||
if (pPrimitives->hints != NULL) free((void *) (pPrimitives->hints));
|
||||
free((void *) pPrimitives);
|
||||
pPrimitives = NULL;
|
||||
}
|
|
@ -0,0 +1,139 @@
|
|||
# FreeRDP: A Remote Desktop Protocol Client
|
||||
# primitives test makefile builder
|
||||
# vi:ts=4 sw=4:
|
||||
#
|
||||
# (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
# implied. See the License for the specific language governing permissions
|
||||
# and limitations under the License.
|
||||
#
|
||||
|
||||
# TODO: Integrate this into the testing framework, in some form.
|
||||
# Right now this produces a standalone test that covers both functionality
|
||||
# and performance of the primitives library entrypoints.
|
||||
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
set(MODULE_NAME "prim_test")
|
||||
set(MODULE_PREFIX "PRIMITIVES_LIBRARY_TEST")
|
||||
|
||||
set(PRIMITIVE_TEST_CFILES
|
||||
prim_test.c
|
||||
test_add.c
|
||||
test_alphaComp.c
|
||||
test_andor.c
|
||||
test_colors.c
|
||||
test_copy.c
|
||||
test_set.c
|
||||
test_shift.c
|
||||
test_sign.c
|
||||
../prim_add.c
|
||||
../prim_andor.c
|
||||
../prim_alphaComp.c
|
||||
../prim_colors.c
|
||||
../prim_copy.c
|
||||
../prim_set.c
|
||||
../prim_shift.c
|
||||
../prim_sign.c
|
||||
../primitives.c
|
||||
)
|
||||
|
||||
set(PRIMITIVE_TEST_HEADERS
|
||||
measure.h
|
||||
prim_test.h
|
||||
../prim_internal.h
|
||||
)
|
||||
|
||||
set(PRIMITIVE_TEST_SRCS
|
||||
${PRIMITIVE_TEST_CFILES}
|
||||
${PRIMITIVE_TEST_HEADERS}
|
||||
)
|
||||
|
||||
include_directories(. ../../.. ../../../include ../../../winpr/include)
|
||||
add_definitions(-DPRIM_STATIC=auto -DALL_PRIMITIVES_VERSIONS -DHAVE_CONFIG_H)
|
||||
|
||||
# If these haven't been set by the caller, set them now to defaults.
|
||||
if(NOT DEFINED WITH_IPP)
|
||||
set(WITH_IPP FALSE)
|
||||
endif()
|
||||
if(NOT DEFINED WITH_SSE2)
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*")
|
||||
set(WITH_SSE2 FALSE)
|
||||
else()
|
||||
set(WITH_SSE2 TRUE)
|
||||
endif()
|
||||
endif()
|
||||
if(NOT DEFINED WITH_NEON)
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*")
|
||||
set(WITH_NEON TRUE)
|
||||
else()
|
||||
set(WITH_NEON FALSE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_SSE2)
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
set(OPTFLAGS "${OPTFLAGS} -msse2 -mssse3 -O2 -Wdeclaration-after-statement")
|
||||
endif()
|
||||
|
||||
if(MSVC)
|
||||
set(OPTFLAGS "${OPTFLAGS} /arch:SSE2")
|
||||
endif()
|
||||
elseif(WITH_NEON)
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
set(OPTIMZATION "${OPTFLAGS} -mfpu=neon -mfloat-abi=softfp -O2")
|
||||
endif()
|
||||
# TODO: Add MSVC equivalent
|
||||
endif()
|
||||
|
||||
add_executable(prim_test ${PRIMITIVE_TEST_SRCS})
|
||||
|
||||
if(WITH_IPP)
|
||||
if(NOT DEFINED IPP_FOUND)
|
||||
include(../../../cmake/FindIPP.cmake)
|
||||
endif()
|
||||
add_definitions(-DWITH_IPP)
|
||||
|
||||
# IPP PATH debugging messages
|
||||
message(IPP_FOUND=${IPP_FOUND})
|
||||
message(IPP_VERSION_STR=${IPP_VERSION_STR})
|
||||
message(IPP_VERSION_MAJOR=${IPP_VERSION_MAJOR})
|
||||
message(IPP_VERSION_MINOR=${IPP_VERSION_MINOR})
|
||||
message(IPP_VERSION_BUILD=${IPP_VERSION_BUILD})
|
||||
message(IPP_ROOT_DIR=${IPP_ROOT_DIR})
|
||||
message(IPP_INCLUDE_DIRS=${IPP_INCLUDE_DIRS})
|
||||
message(IPP_LIBRARY_DIRS=${IPP_LIBRARY_DIRS})
|
||||
message(IPP_LIBRARIES=${IPP_LIBRARIES})
|
||||
message(IPP_COMPILER_LIBRARY_DIRS=${IPP_COMPILER_LIBRARY_DIRS})
|
||||
message(IPP_COMPILER_LIBRARIES=${IPP_COMPILER_LIBRARIES})
|
||||
message(IPP_LIBRARY_LIST=${IPP_LIBRARY_LIST})
|
||||
message(IPP_LIB_PREFIX=${IPP_LIB_PREFIX})
|
||||
message(IPP_LIB_SUFFIX=${IPP_LIB_SUFFIX})
|
||||
message(IPP_PREFIX=${IPP_PREFIX})
|
||||
message(IPP_SUFFIX=${IPP_SUFFIX})
|
||||
message(IPPCORE=${IPPCORE})
|
||||
message(IPPS=${IPPS})
|
||||
message(IPPI=${IPPI})
|
||||
message(IPPCC=${IPPCC})
|
||||
message(IPPCV=${IPPCV})
|
||||
message(IPPVM=${IPPVM})
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
foreach(INCLDIR ${IPP_INCLUDE_DIRS})
|
||||
set(OPTFLAGS "${OPTFLAGS} -I${INCLDIR}")
|
||||
endforeach(INCLDIR)
|
||||
endif()
|
||||
target_link_libraries(prim_test ${IPP_LIBRARY_LIST})
|
||||
endif()
|
||||
|
||||
set_property(SOURCE ${PRIMITIVE_TEST_CFILES} PROPERTY COMPILE_FLAGS ${OPTFLAGS})
|
||||
|
||||
target_link_libraries(prim_test rt)
|
||||
if(NOT TESTING_OUTPUT_DIRECTORY)
|
||||
set(TESTING_OUTPUT_DIRECTORY .)
|
||||
endif()
|
||||
add_test(prim_test ${TESTING_OUTPUT_DIRECTORY}/prim_test functionality)
|
|
@ -0,0 +1,121 @@
|
|||
/* measure.h
|
||||
* Macros to help with performance measurement.
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*
|
||||
* MEASURE_LOOP_START("measurement", 2000)
|
||||
* code to be measured
|
||||
* MEASURE_LOOP_STOP
|
||||
* buffer flush and such
|
||||
* MEASURE_SHOW_RESULTS
|
||||
*
|
||||
* Define GOOGLE_PROFILER if you want gperftools included.
|
||||
*/
|
||||
|
||||
#ifdef _GNUC_
|
||||
# pragma once
|
||||
#endif
|
||||
|
||||
#ifndef __MEASURE_H_INCLUDED__
|
||||
#define __MEASURE_H_INCLUDED__
|
||||
|
||||
#include <time.h>
|
||||
#include <sys/param.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef GOOGLE_PROFILER
|
||||
#include <gperftools/profiler.h>
|
||||
#define PROFILER_START(_prefix_) \
|
||||
do { \
|
||||
char _path[PATH_MAX]; \
|
||||
sprintf(_path, "./%s.prof", (_prefix_)); \
|
||||
ProfilerStart(_path); \
|
||||
} while (0);
|
||||
# define PROFILER_STOP \
|
||||
do { \
|
||||
ProfilerStop(); \
|
||||
} while (0);
|
||||
#else
|
||||
#define PROFILER_START(_prefix_)
|
||||
#define PROFILER_STOP
|
||||
#endif // GOOGLE_PROFILER
|
||||
|
||||
extern float _delta_time(const struct timespec *t0, const struct timespec *t1);
|
||||
extern void _floatprint(float t, char *output);
|
||||
|
||||
#ifndef CLOCK_MONOTONIC_RAW
|
||||
#define CLOCK_MONOTONIC_RAW 4
|
||||
#endif // !CLOCK_MONOTONIC_RAW
|
||||
|
||||
#define MEASURE_LOOP_START(_prefix_, _count_) \
|
||||
{ struct timespec _start, _stop; \
|
||||
char *_prefix; \
|
||||
int _count = (_count_); \
|
||||
int _loop; \
|
||||
float _delta; \
|
||||
char _str1[32], _str2[32]; \
|
||||
_prefix = strdup(_prefix_); \
|
||||
_str1[0] = '\0'; _str2[0] = '\0'; \
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &_start); \
|
||||
PROFILER_START(_prefix); \
|
||||
_loop = (_count); \
|
||||
do {
|
||||
|
||||
#define MEASURE_LOOP_STOP \
|
||||
} while (--_loop);
|
||||
|
||||
#define MEASURE_GET_RESULTS(_result_) \
|
||||
PROFILER_STOP; \
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
|
||||
_delta = _delta_time(&_start, &_stop); \
|
||||
(_result_) = (float) _count / _delta; \
|
||||
free(_prefix); \
|
||||
}
|
||||
|
||||
#define MEASURE_SHOW_RESULTS(_result_) \
|
||||
PROFILER_STOP; \
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
|
||||
_delta = _delta_time(&_start, &_stop); \
|
||||
(_result_) = (float) _count / _delta; \
|
||||
_floatprint((float) _count / _delta, _str1); \
|
||||
printf("%s: %9d iterations in %5.1f seconds = %s/s \n", \
|
||||
_prefix, _count, _delta, _str1); \
|
||||
free(_prefix); \
|
||||
}
|
||||
|
||||
#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) \
|
||||
PROFILER_STOP; \
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
|
||||
_delta = _delta_time(&_start, &_stop); \
|
||||
_floatprint((float) _count / _delta, _str1); \
|
||||
_floatprint((float) _count / _delta * (_scale_), _str2); \
|
||||
printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", \
|
||||
_prefix, _count, _delta, _str1, _str2, _label_); \
|
||||
free(_prefix); \
|
||||
}
|
||||
|
||||
#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \
|
||||
{ float _r; \
|
||||
MEASURE_LOOP_START(_label_, _init_iter_); \
|
||||
_call_; \
|
||||
MEASURE_LOOP_STOP; \
|
||||
MEASURE_GET_RESULTS(_r); \
|
||||
MEASURE_LOOP_START(_label_, _r * _test_time_); \
|
||||
_call_; \
|
||||
MEASURE_LOOP_STOP; \
|
||||
MEASURE_SHOW_RESULTS(_result_); \
|
||||
}
|
||||
|
||||
#endif // __MEASURE_H_INCLUDED__
|
|
@ -0,0 +1,414 @@
|
|||
/* prim_test.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
|
||||
int Quiet = 0;
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static void get_random_data_lrand(
|
||||
void *buffer,
|
||||
size_t size)
|
||||
{
|
||||
static int seeded = 0;
|
||||
long int *ptr = (long int *) buffer;
|
||||
unsigned char *cptr;
|
||||
|
||||
if (!seeded)
|
||||
{
|
||||
seeded = 1;
|
||||
srand48(time(NULL));
|
||||
}
|
||||
/* This isn't the perfect random number generator, but that's okay. */
|
||||
while (size >= sizeof(long int))
|
||||
{
|
||||
*ptr++ = lrand48();
|
||||
size -= sizeof(long int);
|
||||
}
|
||||
cptr = (unsigned char *) ptr;
|
||||
while (size > 0)
|
||||
{
|
||||
*cptr++ = lrand48() & 0xff;
|
||||
--size;
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void get_random_data(
|
||||
void *buffer,
|
||||
size_t size)
|
||||
{
|
||||
#ifdef linux
|
||||
size_t offset = 0;
|
||||
int fd = open("/dev/urandom", O_RDONLY);
|
||||
if (fd < 0)
|
||||
{
|
||||
get_random_data_lrand(buffer, size);
|
||||
return;
|
||||
}
|
||||
|
||||
while (size > 0)
|
||||
{
|
||||
ssize_t count = read(fd, buffer+offset, size);
|
||||
size -= count;
|
||||
offset += count;
|
||||
}
|
||||
close(fd);
|
||||
#else
|
||||
get_random_data_lrand(buffer, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
float _delta_time(
|
||||
const struct timespec *t0,
|
||||
const struct timespec *t1)
|
||||
{
|
||||
INT64 secs = (INT64) (t1->tv_sec) - (INT64) (t0->tv_sec);
|
||||
long nsecs = t1->tv_nsec - t0->tv_nsec;
|
||||
double retval;
|
||||
|
||||
if (nsecs < 0)
|
||||
{
|
||||
--secs;
|
||||
nsecs += 1000000000;
|
||||
}
|
||||
retval = (double) secs + (double) nsecs / (double) 1000000000.0;
|
||||
return (retval < 0.0) ? 0.0 : (float) retval;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
void _floatprint(
|
||||
float t,
|
||||
char *output)
|
||||
{
|
||||
/* I don't want to link against -lm, so avoid log,exp,... */
|
||||
float f = 10.0;
|
||||
int i;
|
||||
while (t > f) f *= 10.0;
|
||||
f /= 1000.0;
|
||||
i = ((int) (t/f+0.5)) * (int) f;
|
||||
if (t < 0.0) sprintf(output, "%f", t);
|
||||
else if (i == 0) sprintf(output, "%d", (int) (t+0.5));
|
||||
else if (t < 1e+3) sprintf(output, "%3d", i);
|
||||
else if (t < 1e+6) sprintf(output, "%3d,%03d",
|
||||
i/1000, i % 1000);
|
||||
else if (t < 1e+9) sprintf(output, "%3d,%03d,000",
|
||||
i/1000000, (i % 1000000) / 1000);
|
||||
else if (t < 1e+12) sprintf(output, "%3d,%03d,000,000",
|
||||
i/1000000000, (i % 1000000000) / 1000000);
|
||||
else sprintf(output, "%f", t);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Specific areas to test: */
|
||||
#define TEST_COPY8 (1<<0)
|
||||
#define TEST_SET8 (1<<1)
|
||||
#define TEST_SET32S (1<<2)
|
||||
#define TEST_SET32U (1<<3)
|
||||
#define TEST_SIGN16S (1<<4)
|
||||
#define TEST_ADD16S (1<<5)
|
||||
#define TEST_LSHIFT16S (1<<6)
|
||||
#define TEST_LSHIFT16U (1<<7)
|
||||
#define TEST_RSHIFT16S (1<<8)
|
||||
#define TEST_RSHIFT16U (1<<9)
|
||||
#define TEST_RGB (1<<10)
|
||||
#define TEST_ALPHA (1<<11)
|
||||
#define TEST_AND (1<<12)
|
||||
#define TEST_OR (1<<13)
|
||||
|
||||
/* Specific types of testing: */
|
||||
#define TEST_FUNCTIONALITY (1<<0)
|
||||
#define TEST_PERFORMANCE (1<<1)
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int main(
|
||||
int argc,
|
||||
char **argv)
|
||||
{
|
||||
typedef struct
|
||||
{
|
||||
const char *testStr;
|
||||
UINT32 bits;
|
||||
} test_t;
|
||||
static const test_t testList[] =
|
||||
{
|
||||
{ "all", 0xFFFFFFFFU },
|
||||
{ "copy", TEST_COPY8 },
|
||||
{ "copy8", TEST_COPY8 },
|
||||
{ "set", TEST_SET8|TEST_SET32S|TEST_SET32U },
|
||||
{ "set8", TEST_SET8 },
|
||||
{ "set32", TEST_SET32S|TEST_SET32U },
|
||||
{ "set32s", TEST_SET32S },
|
||||
{ "set32u", TEST_SET32U },
|
||||
{ "sign", TEST_SIGN16S },
|
||||
{ "sign16s", TEST_SIGN16S },
|
||||
{ "add", TEST_ADD16S },
|
||||
{ "add16s", TEST_ADD16S },
|
||||
{ "lshift", TEST_LSHIFT16S|TEST_LSHIFT16U },
|
||||
{ "rshift", TEST_RSHIFT16S|TEST_RSHIFT16U },
|
||||
{ "shift", TEST_LSHIFT16S|TEST_LSHIFT16U|TEST_RSHIFT16S|TEST_RSHIFT16U },
|
||||
{ "lshift16s", TEST_LSHIFT16S },
|
||||
{ "lshift16u", TEST_LSHIFT16U },
|
||||
{ "rshift16s", TEST_RSHIFT16S },
|
||||
{ "rshift16u", TEST_RSHIFT16U },
|
||||
{ "rgb", TEST_RGB },
|
||||
{ "color", TEST_RGB },
|
||||
{ "colors", TEST_RGB },
|
||||
{ "alpha", TEST_ALPHA },
|
||||
{ "and", TEST_AND },
|
||||
{ "or", TEST_OR },
|
||||
};
|
||||
#define NUMTESTS (sizeof(testList)/sizeof(test_t))
|
||||
static const test_t testTypeList[] =
|
||||
{
|
||||
{ "functionality", TEST_FUNCTIONALITY },
|
||||
{ "performance", TEST_PERFORMANCE },
|
||||
};
|
||||
#define NUMTESTTYPES (sizeof(testTypeList)/sizeof(test_t))
|
||||
char hints[256];
|
||||
UINT32 testSet = 0;
|
||||
UINT32 testTypes = 0;
|
||||
int i;
|
||||
int results = SUCCESS;
|
||||
|
||||
/* Parse command line for the test set. */
|
||||
for (i=1; i<argc; ++i)
|
||||
{
|
||||
BOOL found = 0;
|
||||
int j;
|
||||
for (j=0; j<NUMTESTS; ++j)
|
||||
{
|
||||
if (strcasecmp(argv[i], testList[j].testStr) == 0)
|
||||
{
|
||||
testSet |= testList[j].bits;
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (j=0; j<NUMTESTTYPES; ++j)
|
||||
{
|
||||
if (strcasecmp(argv[i], testTypeList[j].testStr) == 0)
|
||||
{
|
||||
testTypes |= testTypeList[j].bits;
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
{
|
||||
if (strstr(argv[i], "help") != NULL)
|
||||
{
|
||||
printf("Available tests:\n");
|
||||
for (j=0; j<NUMTESTS; ++j)
|
||||
{
|
||||
printf(" %s\n", testList[j].testStr);
|
||||
}
|
||||
for (j=0; j<NUMTESTTYPES; ++j)
|
||||
{
|
||||
printf(" %s\n", testTypeList[j].testStr);
|
||||
}
|
||||
}
|
||||
else fprintf(stderr, "Unknown parameter '%s'!\n", argv[i]);
|
||||
}
|
||||
}
|
||||
if (testSet == 0) testSet = 0xffffffff;
|
||||
if (testTypes == 0) testTypes = 0xffffffff;
|
||||
|
||||
primitives_init();
|
||||
|
||||
primitives_flags_str(primitives_get(), hints, sizeof(hints));
|
||||
printf("Hints: %s\n", hints);
|
||||
|
||||
/* COPY */
|
||||
if (testSet & TEST_COPY8)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_copy8u_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_copy8u_speed();
|
||||
}
|
||||
}
|
||||
/* SET */
|
||||
if (testSet & TEST_SET8)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_set8u_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_set8u_speed();
|
||||
}
|
||||
}
|
||||
if (testSet & TEST_SET32S)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_set32s_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_set32s_speed();
|
||||
}
|
||||
}
|
||||
if (testSet & TEST_SET32U)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_set32u_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_set32u_speed();
|
||||
}
|
||||
}
|
||||
/* SIGN */
|
||||
if (testSet & TEST_SIGN16S)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_sign16s_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_sign16s_speed();
|
||||
}
|
||||
}
|
||||
/* ADD */
|
||||
if (testSet & TEST_ADD16S)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_add16s_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_add16s_speed();
|
||||
}
|
||||
}
|
||||
/* SHIFTS */
|
||||
if (testSet & TEST_LSHIFT16S)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_lShift_16s_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_lShift_16s_speed();
|
||||
}
|
||||
}
|
||||
if (testSet & TEST_LSHIFT16U)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_lShift_16u_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_lShift_16u_speed();
|
||||
}
|
||||
}
|
||||
if (testSet & TEST_RSHIFT16S)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_rShift_16s_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_rShift_16s_speed();
|
||||
}
|
||||
}
|
||||
if (testSet & TEST_RSHIFT16U)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_rShift_16u_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_rShift_16u_speed();
|
||||
}
|
||||
}
|
||||
/* COLORS */
|
||||
if (testSet & TEST_RGB)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_RGBToRGB_16s8u_P3AC4R_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_RGBToRGB_16s8u_P3AC4R_speed();
|
||||
}
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_yCbCrToRGB_16s16s_P3P3_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_yCbCrToRGB_16s16s_P3P3_speed();
|
||||
}
|
||||
}
|
||||
/* ALPHA COMPOSITION */
|
||||
if (testSet & TEST_ALPHA)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_alphaComp_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_alphaComp_speed();
|
||||
}
|
||||
}
|
||||
/* AND & OR */
|
||||
if (testSet & TEST_AND)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_and_32u_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_and_32u_speed();
|
||||
}
|
||||
}
|
||||
if (testSet & TEST_OR)
|
||||
{
|
||||
if (testTypes & TEST_FUNCTIONALITY)
|
||||
{
|
||||
results |= test_or_32u_func();
|
||||
}
|
||||
if (testTypes & TEST_PERFORMANCE)
|
||||
{
|
||||
results |= test_or_32u_speed();
|
||||
}
|
||||
}
|
||||
|
||||
primitives_deinit();
|
||||
return results;
|
||||
}
|
|
@ -0,0 +1,247 @@
|
|||
/* primtest.h
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License. Algorithms used by
|
||||
* this code may be covered by patents by HP, Microsoft, or other parties.
|
||||
*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
# pragma once
|
||||
#endif
|
||||
|
||||
#ifndef __PRIMTEST_H_INCLUDED__
|
||||
#define __PRIMTEST_H_INCLUDED__
|
||||
|
||||
#include <config.h>
|
||||
#include <stdint.h>
|
||||
#include <winpr/wtypes.h>
|
||||
|
||||
#include <measure.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <freerdp/primitives.h>
|
||||
#ifdef WITH_IPP
|
||||
#include <ipps.h>
|
||||
#include <ippi.h>
|
||||
#endif
|
||||
|
||||
#define BLOCK_ALIGNMENT 16
|
||||
#ifdef __GNUC__
|
||||
#define ALIGN(x) x __attribute((aligned(BLOCK_ALIGNMENT)))
|
||||
#define POSSIBLY_UNUSED(x) x __attribute((unused))
|
||||
#else
|
||||
/* TODO: Someone needs to finish this for non-GNU C */
|
||||
#define ALIGN(x) x
|
||||
#define POSSIBLY_UNUSED(x) x
|
||||
#endif
|
||||
#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_))
|
||||
#define MAX_TEST_SIZE 4096
|
||||
|
||||
extern int test_sizes[];
|
||||
#define NUM_TEST_SIZES 10
|
||||
|
||||
extern void get_random_data(void *buffer, size_t size);
|
||||
|
||||
#ifndef SUCCESS
|
||||
#define SUCCESS 0
|
||||
#endif
|
||||
#ifndef FAILURE
|
||||
#define FAILURE 1
|
||||
#endif
|
||||
|
||||
extern int test_copy8u_func(void);
|
||||
extern int test_copy8u_speed(void);
|
||||
|
||||
extern int test_set8u_func(void);
|
||||
extern int test_set32s_func(void);
|
||||
extern int test_set8u_speed(void);
|
||||
extern int test_set32s_speed(void);
|
||||
extern int test_set32u_speed(void);
|
||||
|
||||
extern int test_sign16s_func(void);
|
||||
extern int test_sign16s_speed(void);
|
||||
|
||||
extern int test_add16s_func(void);
|
||||
extern int test_add16s_speed(void);
|
||||
|
||||
extern int test_lShift_16s_func(void);
|
||||
extern int test_lShift_16u_func(void);
|
||||
extern int test_rShift_16s_func(void);
|
||||
extern int test_rShift_16u_func(void);
|
||||
extern int test_lShift_16s_speed(void);
|
||||
extern int test_lShift_16u_speed(void);
|
||||
extern int test_rShift_16s_speed(void);
|
||||
extern int test_rShift_16u_speed(void);
|
||||
|
||||
extern int test_RGBToRGB_16s8u_P3AC4R_func(void);
|
||||
extern int test_RGBToRGB_16s8u_P3AC4R_speed(void);
|
||||
extern int test_yCbCrToRGB_16s16s_P3P3_func(void);
|
||||
extern int test_yCbCrToRGB_16s16s_P3P3_speed(void);
|
||||
|
||||
extern int test_alphaComp_func(void);
|
||||
extern int test_alphaComp_speed(void);
|
||||
|
||||
extern int test_and_32u_func(void);
|
||||
extern int test_and_32u_speed(void);
|
||||
extern int test_or_32u_func(void);
|
||||
extern int test_or_32u_speed(void);
|
||||
|
||||
/* Since so much of this code is repeated, define a macro to build
|
||||
* functions to do speed tests.
|
||||
*/
|
||||
#ifdef armel
|
||||
#define SIMD_TYPE "Neon"
|
||||
#else
|
||||
#define SIMD_TYPE "SSE"
|
||||
#endif
|
||||
|
||||
#define DO_NORMAL_MEASUREMENTS(_funcNormal_, _prework_) \
|
||||
do { \
|
||||
for (s=0; s<num_sizes; ++s) \
|
||||
{ \
|
||||
int iter; \
|
||||
char label[256]; \
|
||||
int size = size_array[s]; \
|
||||
_prework_; \
|
||||
iter = iterations/size; \
|
||||
sprintf(label, "%s-%-4d", oplabel, size); \
|
||||
MEASURE_TIMED(label, iter, test_time, resultNormal[s], \
|
||||
_funcNormal_); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#if defined(i386) && defined(WITH_SSE2)
|
||||
#define DO_SSE_MEASUREMENTS(_funcSSE_, _prework_) \
|
||||
do { \
|
||||
for (s=0; s<num_sizes; ++s) \
|
||||
{ \
|
||||
int iter; \
|
||||
char label[256]; \
|
||||
int size = size_array[s]; \
|
||||
_prework_; \
|
||||
iter = iterations/size; \
|
||||
sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \
|
||||
MEASURE_TIMED(label, iter, test_time, resultSSENeon[s], \
|
||||
_funcSSE_); \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
#define DO_SSE_MEASUREMENTS(_funcSSE_, _prework_)
|
||||
#endif
|
||||
|
||||
#if defined(armel) && defined(INCLUDE_NEON_MEASUREMENTS)
|
||||
#define DO_NEON_MEASUREMENTS(_funcNeon_, _prework_) \
|
||||
do { \
|
||||
for (s=0; s<num_sizes; ++s) \
|
||||
{ \
|
||||
int iter; \
|
||||
char label[256]; \
|
||||
int size = size_array[s]; \
|
||||
_prework_; \
|
||||
iter = iterations/size; \
|
||||
sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \
|
||||
MEASURE_TIMED(label, iter, test_time, resultSSENeon[s], \
|
||||
_funcNeon_); \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
#define DO_NEON_MEASUREMENTS(_funcNeon_, _prework_)
|
||||
#endif
|
||||
|
||||
#if defined(i386) && defined(WITH_IPP)
|
||||
#define DO_IPP_MEASUREMENTS(_funcIPP_, _prework_) \
|
||||
do { \
|
||||
for (s=0; s<num_sizes; ++s) \
|
||||
{ \
|
||||
int iter; \
|
||||
char label[256]; \
|
||||
int size = size_array[s]; \
|
||||
_prework_; \
|
||||
iter = iterations/size; \
|
||||
sprintf(label, "IPP-%s-%-4d", oplabel, size); \
|
||||
MEASURE_TIMED(label, iter, test_time, resultIPP[s], \
|
||||
_funcIPP_); \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
#define DO_IPP_MEASUREMENTS(_funcIPP_, _prework_)
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
#define STD_SPEED_TEST( \
|
||||
_name_, _srctype_, _dsttype_, _prework_, \
|
||||
_doNormal_, _funcNormal_, \
|
||||
_doSSE_, _funcSSE_, _flagsSSE_, \
|
||||
_doNeon_, _funcNeon_, _flagsNeon_, \
|
||||
_doIPP_, _funcIPP_) \
|
||||
static void _name_( \
|
||||
const char *oplabel, const char *type, \
|
||||
const _srctype_ *src1, const _srctype_ *src2, _srctype_ constant, \
|
||||
_dsttype_ *dst, \
|
||||
const int *size_array, int num_sizes, \
|
||||
int iterations, float test_time) \
|
||||
{ \
|
||||
int s; \
|
||||
float *resultNormal, *resultSSENeon, *resultIPP; \
|
||||
UINT32 pflags = primitives_get_flags(primitives_get()); \
|
||||
resultNormal = (float *) calloc(num_sizes, sizeof(float)); \
|
||||
resultSSENeon = (float *) calloc(num_sizes, sizeof(float)); \
|
||||
resultIPP = (float *) calloc(num_sizes, sizeof(float)); \
|
||||
printf("******************** %s %s ******************\n", \
|
||||
oplabel, type); \
|
||||
if (_doNormal_) { DO_NORMAL_MEASUREMENTS(_funcNormal_, _prework_); } \
|
||||
if (_doSSE_) { \
|
||||
if ((pflags & (_flagsSSE_)) == (_flagsSSE_)) \
|
||||
{ \
|
||||
DO_SSE_MEASUREMENTS(_funcSSE_, _prework_); \
|
||||
} \
|
||||
} \
|
||||
if (_doNeon_) { \
|
||||
if ((pflags & (_flagsNeon_)) == (_flagsNeon_)) \
|
||||
{ \
|
||||
DO_NEON_MEASUREMENTS(_funcNeon_, _prework_); \
|
||||
} \
|
||||
} \
|
||||
if (_doIPP_) { DO_IPP_MEASUREMENTS(_funcIPP_, _prework_); } \
|
||||
printf("----------------------- SUMMARY ----------------------------\n"); \
|
||||
printf("%8s: %15s %15s %5s %15s %5s\n", \
|
||||
"size", "general", SIMD_TYPE, "%", "IPP", "%"); \
|
||||
for (s=0; s<num_sizes; ++s) \
|
||||
{ \
|
||||
char sN[32], sSN[32], sSNp[8], sIPP[32], sIPPp[8]; \
|
||||
strcpy(sN, "N/A"); strcpy(sSN, "N/A"); strcpy(sSNp, "N/A"); \
|
||||
strcpy(sIPP, "N/A"); strcpy(sIPPp, "N/A"); \
|
||||
if (resultNormal[s] > 0.0) _floatprint(resultNormal[s], sN); \
|
||||
if (resultSSENeon[s] > 0.0) \
|
||||
{ \
|
||||
_floatprint(resultSSENeon[s], sSN); \
|
||||
if (resultNormal[s] > 0.0) \
|
||||
{ \
|
||||
sprintf(sSNp, "%d%%", \
|
||||
(int) (resultSSENeon[s] / resultNormal[s] * 100.0 + 0.5)); \
|
||||
} \
|
||||
} \
|
||||
if (resultIPP[s] > 0.0) \
|
||||
{ \
|
||||
_floatprint(resultIPP[s], sIPP); \
|
||||
if (resultNormal[s] > 0.0) \
|
||||
{ \
|
||||
sprintf(sIPPp, "%d%%", \
|
||||
(int) (resultIPP[s] / resultNormal[s] * 100.0 + 0.5)); \
|
||||
} \
|
||||
} \
|
||||
printf("%8d: %15s %15s %5s %15s %5s\n", \
|
||||
size_array[s], sN, sSN, sSNp, sIPP, sIPPp); \
|
||||
} \
|
||||
free(resultNormal); free(resultSSENeon); free(resultIPP); \
|
||||
}
|
||||
|
||||
#endif // !__PRIMTEST_H_INCLUDED__
|
|
@ -0,0 +1,105 @@
|
|||
/* test_add.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
#define FUNC_TEST_SIZE 65536
|
||||
static const int ADD16S_PRETEST_ITERATIONS = 300000*64;
|
||||
static const int TEST_TIME = 2.0; // seconds
|
||||
|
||||
extern pstatus_t general_add_16s(
|
||||
const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len);
|
||||
extern pstatus_t sse3_add_16s(
|
||||
const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len);
|
||||
|
||||
/* ========================================================================= */
|
||||
int test_add16s_func(void)
|
||||
{
|
||||
INT16 ALIGN(src1[FUNC_TEST_SIZE+3]), ALIGN(src2[FUNC_TEST_SIZE+3]),
|
||||
ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]);
|
||||
int failed = 0;
|
||||
int i;
|
||||
char testStr[256];
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
|
||||
testStr[0] = '\0';
|
||||
get_random_data(src1, sizeof(src1));
|
||||
get_random_data(src2, sizeof(src2));
|
||||
memset(d1, 0, sizeof(d1));
|
||||
memset(d2, 0, sizeof(d2));
|
||||
general_add_16s(src1+1, src2+1, d1+1, FUNC_TEST_SIZE);
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSE3_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE3");
|
||||
/* Aligned */
|
||||
sse3_add_16s(src1+1, src2+1, d2+1, FUNC_TEST_SIZE);
|
||||
for (i=1; i<FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (d1[i] != d2[i])
|
||||
{
|
||||
printf("ADD16S-SSE-aligned FAIL[%d] %d+%d=%d, got %d\n",
|
||||
i, src1[i], src2[i], d1[i], d2[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
/* Unaligned */
|
||||
sse3_add_16s(src1+1, src2+1, d2+2, FUNC_TEST_SIZE);
|
||||
for (i=1; i<FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (d1[i] != d2[i+1])
|
||||
{
|
||||
printf("ADD16S-SSE-unaligned FAIL[%d] %d+%d=%d, got %d\n",
|
||||
i, src1[i], src2[i], d1[i], d2[i+1]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
#ifdef WITH_IPP
|
||||
strcat(testStr, " IPP");
|
||||
ippsAdd_16s(src1+1, src2+1, d2+1, FUNC_TEST_SIZE);
|
||||
for (i=1; i<FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (d1[i] != d2[i])
|
||||
{
|
||||
printf("ADD16S-IPP FAIL[%d] %d+%d=%d, got %d\n",
|
||||
i, src1[i], src2[i], d1[i], d2[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
#endif /* WITH_IPP */
|
||||
if (!failed) printf("All add16s tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(add16s_speed_test, INT16, INT16, dst=dst,
|
||||
TRUE, general_add_16s(src1, src2, dst, size),
|
||||
TRUE, sse3_add_16s(src1, src2, dst, size), PRIM_X86_SSE3_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsAdd_16s(src1, src2, dst, size));
|
||||
|
||||
int test_add16s_speed(void)
|
||||
{
|
||||
INT16 ALIGN(src1[MAX_TEST_SIZE+3]), ALIGN(src2[MAX_TEST_SIZE+3]),
|
||||
ALIGN(dst[MAX_TEST_SIZE+3]);
|
||||
get_random_data(src1, sizeof(src1));
|
||||
get_random_data(src2, sizeof(src2));
|
||||
add16s_speed_test("add16s", "aligned", src1, src2, 0, dst,
|
||||
test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME);
|
||||
add16s_speed_test("add16s", "unaligned", src1+1, src2+2, 0, dst,
|
||||
test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
|
@ -0,0 +1,225 @@
|
|||
/* test_alphaComp.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
static const int ALPHA_PRETEST_ITERATIONS = 5000000;
|
||||
static const float TEST_TIME = 5.0;
|
||||
|
||||
static const int block_size[] = { 4, 64, 256 };
|
||||
#define NUM_BLOCK_SIZES (sizeof(block_size)/sizeof(int))
|
||||
#define MAX_BLOCK_SIZE 256
|
||||
#define SIZE_SQUARED (MAX_BLOCK_SIZE*MAX_BLOCK_SIZE)
|
||||
|
||||
extern pstatus_t general_alphaComp_argb(
|
||||
const BYTE *pSrc1, int src1Step,
|
||||
const BYTE *pSrc2, int src2Step,
|
||||
BYTE *pDst, int dstStep,
|
||||
int width, int height);
|
||||
extern pstatus_t sse2_alphaComp_argb(
|
||||
const BYTE *pSrc1, int src1Step,
|
||||
const BYTE *pSrc2, int src2Step,
|
||||
BYTE *pDst, int dstStep,
|
||||
int width, int height);
|
||||
extern pstatus_t ipp_alphaComp_argb(
|
||||
const BYTE *pSrc1, int src1Step,
|
||||
const BYTE *pSrc2, int src2Step,
|
||||
BYTE *pDst, int dstStep,
|
||||
int width, int height);
|
||||
|
||||
/* ========================================================================= */
|
||||
#define ALF(_c_) (((_c_) & 0xFF000000U) >> 24)
|
||||
#define RED(_c_) (((_c_) & 0x00FF0000U) >> 16)
|
||||
#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
|
||||
#define BLU(_c_) ((_c_) & 0x000000FFU)
|
||||
#define TOLERANCE 1
|
||||
#define PIXEL(_addr_, _bytes_, _x_, _y_) \
|
||||
((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_)))
|
||||
#define SRC1_WIDTH 6
|
||||
#define SRC1_HEIGHT 6
|
||||
#define SRC2_WIDTH 7
|
||||
#define SRC2_HEIGHT 7
|
||||
#define DST_WIDTH 9
|
||||
#define DST_HEIGHT 9
|
||||
#define TEST_WIDTH 4
|
||||
#define TEST_HEIGHT 5
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static UINT32 alpha_add(
|
||||
UINT32 c1,
|
||||
UINT32 c2)
|
||||
{
|
||||
UINT32 a1 = ALF(c1);
|
||||
UINT32 r1 = RED(c1);
|
||||
UINT32 g1 = GRN(c1);
|
||||
UINT32 b1 = BLU(c1);
|
||||
|
||||
UINT32 a2 = ALF(c2);
|
||||
UINT32 r2 = RED(c2);
|
||||
UINT32 g2 = GRN(c2);
|
||||
UINT32 b2 = BLU(c2);
|
||||
|
||||
UINT32 a3 = ((a1 * a1 + (255-a1) * a2) / 255) & 0xff;
|
||||
UINT32 r3 = ((a1 * r1 + (255-a1) * r2) / 255) & 0xff;
|
||||
UINT32 g3 = ((a1 * g1 + (255-a1) * g2) / 255) & 0xff;
|
||||
UINT32 b3 = ((a1 * b1 + (255-a1) * b2) / 255) & 0xff;
|
||||
|
||||
return (a3 << 24) | (r3 << 16) | (g3 << 8) | b3;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static UINT32 colordist(
|
||||
UINT32 c1,
|
||||
UINT32 c2)
|
||||
{
|
||||
int d, maxd = 0;
|
||||
|
||||
d = ABS(ALF(c1) - ALF(c2));
|
||||
if (d > maxd) maxd = d;
|
||||
d = ABS(RED(c1) - RED(c2));
|
||||
if (d > maxd) maxd = d;
|
||||
d = ABS(GRN(c1) - GRN(c2));
|
||||
if (d > maxd) maxd = d;
|
||||
d = ABS(BLU(c1) - BLU(c2));
|
||||
if (d > maxd) maxd = d;
|
||||
return maxd;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_alphaComp_func(void)
|
||||
{
|
||||
UINT32 ALIGN(src1[SRC1_WIDTH*SRC1_HEIGHT]);
|
||||
UINT32 ALIGN(src2[SRC2_WIDTH*SRC2_HEIGHT]);
|
||||
UINT32 ALIGN(dst1[DST_WIDTH*DST_HEIGHT]);
|
||||
UINT32 ALIGN(dst2a[DST_WIDTH*DST_HEIGHT]);
|
||||
UINT32 ALIGN(dst2u[DST_WIDTH*DST_HEIGHT+1]);
|
||||
UINT32 ALIGN(dst3[DST_WIDTH*DST_HEIGHT]);
|
||||
int error = 0;
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
char testStr[256];
|
||||
UINT32 *ptr;
|
||||
int i, x, y;
|
||||
|
||||
testStr[0] = '\0';
|
||||
get_random_data(src1, sizeof(src1));
|
||||
/* Special-case the first two values */
|
||||
src1[0] &= 0x00FFFFFFU;
|
||||
src1[1] |= 0xFF000000U;
|
||||
get_random_data(src2, sizeof(src2));
|
||||
/* Set the second operand to fully-opaque. */
|
||||
ptr = src2;
|
||||
for (i=0; i<sizeof(src2)/4; ++i) *ptr++ |= 0xFF000000U;
|
||||
memset(dst1, 0, sizeof(dst1));
|
||||
memset(dst2a, 0, sizeof(dst2a));
|
||||
memset(dst2u, 0, sizeof(dst2u));
|
||||
memset(dst3, 0, sizeof(dst3));
|
||||
|
||||
general_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
|
||||
(const BYTE *) src2, 4*SRC2_WIDTH,
|
||||
(BYTE *) dst1, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE2");
|
||||
sse2_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
|
||||
(const BYTE *) src2, 4*SRC2_WIDTH,
|
||||
(BYTE *) dst2a, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
||||
sse2_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
|
||||
(const BYTE *) src2, 4*SRC2_WIDTH,
|
||||
(BYTE *) (dst2u+1), 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
||||
}
|
||||
#endif /* i386 */
|
||||
#ifdef WITH_IPP
|
||||
strcat(testStr, " IPP");
|
||||
ipp_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
|
||||
(const BYTE *) src2, 4*SRC2_WIDTH,
|
||||
(BYTE *) dst3, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
||||
#endif
|
||||
|
||||
for (y=0; y<TEST_HEIGHT; ++y)
|
||||
{
|
||||
for (x=0; x<TEST_WIDTH; ++x)
|
||||
{
|
||||
UINT32 s1 = *PIXEL(src1, 4*SRC1_WIDTH, x, y);
|
||||
UINT32 s2 = *PIXEL(src2, 4*SRC2_WIDTH, x, y);
|
||||
UINT32 c0 = alpha_add(s1, s2);
|
||||
UINT32 c1 = *PIXEL(dst1, 4*DST_WIDTH, x, y);
|
||||
if (colordist(c0, c1) > TOLERANCE)
|
||||
{
|
||||
printf("alphaComp-general: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
|
||||
x, y, s1, s2, c0, c1);
|
||||
error = 1;
|
||||
}
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
UINT32 c2 = *PIXEL(dst2a, 4*DST_WIDTH, x, y);
|
||||
if (colordist(c0, c2) > TOLERANCE)
|
||||
{
|
||||
printf("alphaComp-SSE-aligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
|
||||
x, y, s1, s2, c0, c2);
|
||||
error = 1;
|
||||
}
|
||||
c2 = *PIXEL(dst2u+1, 4*DST_WIDTH, x, y);
|
||||
if (colordist(c0, c2) > TOLERANCE)
|
||||
{
|
||||
printf("alphaComp-SSE-unaligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
|
||||
x, y, s1, s2, c0, c2);
|
||||
error = 1;
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
#ifdef WITH_IPP
|
||||
UINT32 c3 = *PIXEL(dst3, 4*DST_WIDTH, x, y);
|
||||
if (colordist(c0, c3) > TOLERANCE)
|
||||
{
|
||||
printf("alphaComp-IPP: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
|
||||
x, y, s1, s2, c0, c3);
|
||||
error = 1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if (!error) printf("All alphaComp tests passed (%s).\n", testStr);
|
||||
return (error > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(alphaComp_speed, BYTE, BYTE, int bytes = size*4,
|
||||
TRUE, general_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
|
||||
size, size),
|
||||
TRUE, sse2_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
|
||||
size, size), PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ipp_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
|
||||
size, size));
|
||||
|
||||
int test_alphaComp_speed(void)
|
||||
{
|
||||
INT32 ALIGN(src1[MAX_BLOCK_SIZE*(MAX_BLOCK_SIZE+1)]),
|
||||
ALIGN(src2[SIZE_SQUARED]),
|
||||
ALIGN(dst[SIZE_SQUARED]);
|
||||
|
||||
get_random_data(src1, sizeof(src1));
|
||||
get_random_data(src2, sizeof(src2));
|
||||
|
||||
alphaComp_speed("alphaComp", "aligned",
|
||||
(BYTE *) src1, (BYTE *) src2, 0, (BYTE *) dst,
|
||||
block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME);
|
||||
alphaComp_speed("alphaComp", "unaligned",
|
||||
(BYTE *) src1+1, (BYTE *) src2, 0, (BYTE *) dst,
|
||||
block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
|
@ -0,0 +1,178 @@
|
|||
/* test_andor.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
#define FUNC_TEST_SIZE 65536
|
||||
static const int ANDOR_PRETEST_ITERATIONS = 100000;
|
||||
static const int TEST_TIME = 2.0; // seconds
|
||||
|
||||
extern pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val,
|
||||
UINT32 *pDst, int len);
|
||||
extern pstatus_t sse3_andC_32u(const UINT32 *pSrc, UINT32 val,
|
||||
UINT32 *pDst, int len);
|
||||
extern pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val,
|
||||
UINT32 *pDst, int len);
|
||||
extern pstatus_t sse3_orC_32u(const UINT32 *pSrc, UINT32 val,
|
||||
UINT32 *pDst, int len);
|
||||
|
||||
#define VALUE (0xA5A5A5A5U)
|
||||
|
||||
/* ========================================================================= */
|
||||
int test_and_32u_func(void)
|
||||
{
|
||||
UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]);
|
||||
int failed = 0;
|
||||
int i;
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
char testStr[256];
|
||||
|
||||
testStr[0] = '\0';
|
||||
get_random_data(src, sizeof(src));
|
||||
general_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
|
||||
strcat(testStr, " general");
|
||||
for (i=1; i<=FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (dst[i] != (src[i] & VALUE))
|
||||
{
|
||||
printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||
i, src[i], VALUE, src[i] & VALUE, dst[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSE3_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE3");
|
||||
/* Aligned */
|
||||
memset(dst, 0, sizeof(dst));
|
||||
sse3_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
|
||||
for (i=1; i<=FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (dst[i] != (src[i] & VALUE))
|
||||
{
|
||||
printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||
i, src[i], VALUE, src[i] & VALUE, dst[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
/* Unaligned */
|
||||
memset(dst, 0, sizeof(dst));
|
||||
sse3_andC_32u(src+1, VALUE, dst+2, FUNC_TEST_SIZE);
|
||||
for (i=1; i<=FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (dst[i+1] != (src[i] & VALUE))
|
||||
{
|
||||
printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||
i, src[i], VALUE, src[i] & VALUE, dst[i+1]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
if (!failed) printf("All and_32u tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(andC_32u_speed_test, UINT32, UINT32, dst=dst,
|
||||
TRUE, general_andC_32u(src1, constant, dst, size),
|
||||
TRUE, sse3_andC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsAndC_32u(src1, constant, dst, size))
|
||||
|
||||
int test_and_32u_speed(void)
|
||||
{
|
||||
UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
|
||||
get_random_data(src, sizeof(src));
|
||||
andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst,
|
||||
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
|
||||
andC_32u_speed_test("and32u", "unaligned", src+1, NULL, VALUE, dst,
|
||||
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int test_or_32u_func(void)
|
||||
{
|
||||
UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]);
|
||||
int failed = 0;
|
||||
int i;
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
char testStr[256];
|
||||
|
||||
testStr[0] = '\0';
|
||||
get_random_data(src, sizeof(src));
|
||||
general_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
|
||||
strcat(testStr, " general");
|
||||
for (i=1; i<=FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (dst[i] != (src[i] | VALUE))
|
||||
{
|
||||
printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||
i, src[i], VALUE, src[i] | VALUE, dst[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSE3_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE3");
|
||||
/* Aligned */
|
||||
memset(dst, 0, sizeof(dst));
|
||||
sse3_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
|
||||
for (i=1; i<FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (dst[i] != (src[i] | VALUE))
|
||||
{
|
||||
printf("OR-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||
i, src[i], VALUE, src[i] | VALUE, dst[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
/* Unaligned */
|
||||
memset(dst, 0, sizeof(dst));
|
||||
sse3_orC_32u(src+1, VALUE, dst+2, FUNC_TEST_SIZE);
|
||||
for (i=1; i<FUNC_TEST_SIZE; ++i)
|
||||
{
|
||||
if (dst[i+1] != (src[i] | VALUE))
|
||||
{
|
||||
printf("OR-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||
i, src[i], VALUE, src[i] | VALUE, dst[i+1]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
if (!failed) printf("All or_32u tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(orC_32u_speed_test, UINT32, UINT32, dst=dst,
|
||||
TRUE, general_orC_32u(src1, constant, dst, size),
|
||||
TRUE, sse3_orC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsOrC_32u(src1, constant, dst, size))
|
||||
|
||||
int test_or_32u_speed(void)
|
||||
{
|
||||
UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
|
||||
get_random_data(src, sizeof(src));
|
||||
orC_32u_speed_test("or32u", "aligned", src, NULL, VALUE, dst,
|
||||
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
|
||||
orC_32u_speed_test("or32u", "unaligned", src+1, NULL, VALUE, dst,
|
||||
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
|
@ -0,0 +1,226 @@
|
|||
/* test_colors.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
static const int RGB_TRIAL_ITERATIONS = 1000;
|
||||
static const int YCBCR_TRIAL_ITERATIONS = 1000;
|
||||
static const float TEST_TIME = 4.0;
|
||||
|
||||
extern pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3],
|
||||
int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
|
||||
extern pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3],
|
||||
int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
|
||||
extern pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3],
|
||||
int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi);
|
||||
extern pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3],
|
||||
int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_RGBToRGB_16s8u_P3AC4R_func(void)
|
||||
{
|
||||
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
||||
UINT32 ALIGN(out1[4096]), ALIGN(out2[4096]);
|
||||
int i;
|
||||
int failed = 0;
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
char testStr[256];
|
||||
INT16 *ptrs[3];
|
||||
prim_size_t roi = { 64, 64 };
|
||||
|
||||
testStr[0] = '\0';
|
||||
get_random_data(r, sizeof(r));
|
||||
get_random_data(g, sizeof(g));
|
||||
get_random_data(b, sizeof(b));
|
||||
/* clear upper bytes */
|
||||
for (i=0; i<4096; ++i)
|
||||
{
|
||||
r[i] &= 0x00FFU;
|
||||
g[i] &= 0x00FFU;
|
||||
b[i] &= 0x00FFU;
|
||||
}
|
||||
|
||||
ptrs[0] = r;
|
||||
ptrs[1] = g;
|
||||
ptrs[2] = b;
|
||||
|
||||
general_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2,
|
||||
(BYTE *) out1, 64*4, &roi);
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE2");
|
||||
sse2_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2,
|
||||
(BYTE *) out2, 64*4, &roi);
|
||||
for (i=0; i<4096; ++i)
|
||||
{
|
||||
if (out1[i] != out2[i])
|
||||
{
|
||||
printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
|
||||
i, out1[i], i, out2[i]);
|
||||
failed = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static const prim_size_t roi64x64 = { 64, 64 };
|
||||
STD_SPEED_TEST(
|
||||
rgb_to_argb_speed, INT16*, UINT32, dst=dst,
|
||||
TRUE, general_RGBToRGB_16s8u_P3AC4R(
|
||||
(const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64),
|
||||
TRUE, sse2_RGBToRGB_16s8u_P3AC4R(
|
||||
(const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64),
|
||||
PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
FALSE, dst=dst);
|
||||
|
||||
int test_RGBToRGB_16s8u_P3AC4R_speed(void)
|
||||
{
|
||||
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
||||
UINT32 ALIGN(dst[4096]);
|
||||
int i;
|
||||
INT16 *ptrs[3];
|
||||
int size_array[] = { 64 };
|
||||
|
||||
get_random_data(r, sizeof(r));
|
||||
get_random_data(g, sizeof(g));
|
||||
get_random_data(b, sizeof(b));
|
||||
/* clear upper bytes */
|
||||
for (i=0; i<4096; ++i)
|
||||
{
|
||||
r[i] &= 0x00FFU;
|
||||
g[i] &= 0x00FFU;
|
||||
b[i] &= 0x00FFU;
|
||||
}
|
||||
|
||||
ptrs[0] = r;
|
||||
ptrs[1] = g;
|
||||
ptrs[2] = b;
|
||||
|
||||
rgb_to_argb_speed("RGBToARGB", "aligned",
|
||||
(const INT16 **) ptrs, NULL, 0, dst,
|
||||
size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int test_yCbCrToRGB_16s16s_P3P3_func(void)
|
||||
{
|
||||
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
|
||||
INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
|
||||
INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
|
||||
int i;
|
||||
int failed = 0;
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
char testStr[256];
|
||||
const INT16 *in[3];
|
||||
INT16 *out1[3];
|
||||
INT16 *out2[3];
|
||||
prim_size_t roi = { 64, 64 };
|
||||
|
||||
testStr[0] = '\0';
|
||||
get_random_data(y, sizeof(y));
|
||||
get_random_data(cb, sizeof(cb));
|
||||
get_random_data(cr, sizeof(cr));
|
||||
/* Normalize to 11.5 fixed radix */
|
||||
for (i=0; i<4096; ++i)
|
||||
{
|
||||
y[i] &= 0x1FE0U;
|
||||
cb[i] &= 0x1FE0U;
|
||||
cr[i] &= 0x1FE0U;
|
||||
}
|
||||
memset(r1, 0, sizeof(r1));
|
||||
memset(g1, 0, sizeof(g1));
|
||||
memset(b1, 0, sizeof(b1));
|
||||
memset(r2, 0, sizeof(r2));
|
||||
memset(g2, 0, sizeof(g2));
|
||||
memset(b2, 0, sizeof(b2));
|
||||
|
||||
in[0] = y;
|
||||
in[1] = cb;
|
||||
in[2] = cr;
|
||||
out1[0] = r1;
|
||||
out1[1] = g1;
|
||||
out1[2] = b1;
|
||||
out2[0] = r2;
|
||||
out2[1] = g2;
|
||||
out2[2] = b2;
|
||||
|
||||
general_yCbCrToRGB_16s16s_P3P3(in, 64*2, out1, 64*2, &roi);
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE2");
|
||||
sse2_yCbCrToRGB_16s16s_P3P3(in, 64*2, out2, 64*2, &roi);
|
||||
for (i=0; i<4096; ++i)
|
||||
{
|
||||
if ((ABS(r1[i]-r2[i]) > 1)
|
||||
|| (ABS(g1[i]-g2[i]) > 1)
|
||||
|| (ABS(b1[i]-b2[i]) > 1)) {
|
||||
printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
|
||||
r1[i],g1[i],b1[i], r2[i],g2[i],b2[i]);
|
||||
failed = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(
|
||||
ycbcr_to_rgb_speed, INT16*, INT16*, dst=dst,
|
||||
TRUE, general_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64),
|
||||
TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64),
|
||||
PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
FALSE, dst=dst);
|
||||
|
||||
int test_yCbCrToRGB_16s16s_P3P3_speed(void)
|
||||
{
|
||||
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
|
||||
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
||||
int i;
|
||||
const INT16 *input[3];
|
||||
INT16 *output[3];
|
||||
int size_array[] = { 64 };
|
||||
|
||||
get_random_data(y, sizeof(y));
|
||||
get_random_data(cb, sizeof(cb));
|
||||
get_random_data(cr, sizeof(cr));
|
||||
/* Normalize to 11.5 fixed radix */
|
||||
for (i=0; i<4096; ++i)
|
||||
{
|
||||
y[i] &= 0x1FE0U;
|
||||
cb[i] &= 0x1FE0U;
|
||||
cr[i] &= 0x1FE0U;
|
||||
}
|
||||
|
||||
input[0] = y;
|
||||
input[1] = cb;
|
||||
input[2] = cr;
|
||||
output[0] = r;
|
||||
output[1] = g;
|
||||
output[2] = b;
|
||||
|
||||
ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output,
|
||||
size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
/* test_copy.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
static const int MEMCPY_PRETEST_ITERATIONS = 1000000;
|
||||
static const int TEST_TIME = 1.0; // seconds
|
||||
#define COPY_TESTSIZE (256*2+16*2+15+15)
|
||||
#if 0
|
||||
extern pstatus_t sse3_copy_8u(const BYTE *pSrc, BYTE *pDst, int len);
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_copy8u_func(void)
|
||||
{
|
||||
primitives_t *prims = primitives_get();
|
||||
BYTE ALIGN(data[COPY_TESTSIZE+15]);
|
||||
int i, soff;
|
||||
int failed = 0;
|
||||
char testStr[256];
|
||||
BYTE ALIGN(dest[COPY_TESTSIZE+15]);
|
||||
|
||||
testStr[0] = '\0';
|
||||
get_random_data(data, sizeof(data));
|
||||
|
||||
strcat(testStr, " ptr");
|
||||
for (soff=0; soff<16; ++soff)
|
||||
{
|
||||
int doff;
|
||||
for (doff=0; doff<16; ++doff)
|
||||
{
|
||||
int length;
|
||||
for (length=1; length<=COPY_TESTSIZE-doff; ++length)
|
||||
{
|
||||
memset(dest, 0, sizeof(dest));
|
||||
prims->copy_8u(data+soff, dest+doff, length);
|
||||
for (i=0; i<length; ++i)
|
||||
{
|
||||
if (dest[i+doff] != data[i+soff])
|
||||
{
|
||||
printf("COPY8U FAIL: off=%d len=%d, dest[%d]=0x%02x"
|
||||
"data[%d]=0x%02x\n",
|
||||
doff, length, i+doff, dest[i+doff],
|
||||
i+soff, data[i+soff]);
|
||||
failed = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!failed) printf("All copy8 tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst=dst,
|
||||
TRUE, memcpy(dst, src1, size),
|
||||
FALSE, NULL, 0,
|
||||
FALSE, NULL, 0,
|
||||
TRUE, ippsCopy_8u(src1, dst, size));
|
||||
|
||||
int test_copy8u_speed(void)
|
||||
{
|
||||
BYTE ALIGN(src[MAX_TEST_SIZE+4]);
|
||||
BYTE ALIGN(intervening[MAX_TEST_SIZE*7]);
|
||||
BYTE ALIGN(dst[MAX_TEST_SIZE+4]);
|
||||
copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst,
|
||||
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
|
||||
copy8u_speed_test("copy8u", "unaligned", src+1, NULL, 0, dst,
|
||||
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
|
@ -0,0 +1,294 @@
|
|||
/* test_set.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
static const int MEMSET8_PRETEST_ITERATIONS = 100000000;
|
||||
static const int MEMSET32_PRETEST_ITERATIONS = 40000000;
|
||||
static const float TEST_TIME = 1.0;
|
||||
|
||||
extern pstatus_t general_set_8u(BYTE val, BYTE *pDst, int len);
|
||||
extern pstatus_t sse2_set_8u(BYTE val, BYTE *pDst, int len);
|
||||
extern pstatus_t general_set_32s(INT32 val, INT32 *pDst, int len);
|
||||
extern pstatus_t sse2_set_32s(INT32 val, INT32 *pDst, int len);
|
||||
extern pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, int len);
|
||||
extern pstatus_t sse2_set_32u(UINT32 val, UINT32 *pDst, int len);
|
||||
extern pstatus_t ipp_wrapper_set_32u(UINT32 val, UINT32 *pDst, int len);
|
||||
|
||||
static const int set_sizes[] = { 1, 4, 16, 32, 64, 256, 1024, 4096 };
|
||||
#define NUM_SET_SIZES (sizeof(set_sizes)/sizeof(int))
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_set8u_func(void)
|
||||
{
|
||||
BYTE ALIGN(dest[48]);
|
||||
int failed = 0;
|
||||
int off;
|
||||
char testStr[256];
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
testStr[0] = '\0';
|
||||
|
||||
#ifdef i386
|
||||
/* Test SSE under various alignments */
|
||||
if (pflags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE2");
|
||||
for (off=0; off<16; ++off)
|
||||
{
|
||||
int len;
|
||||
for (len=1; len<48-off; ++len)
|
||||
{
|
||||
int i;
|
||||
memset(dest, 0, sizeof(dest));
|
||||
sse2_set_8u(0xa5, dest+off, len);
|
||||
for (i=0; i<len; ++i)
|
||||
{
|
||||
if (dest[off+i] != 0xa5)
|
||||
{
|
||||
printf("SET8U-SSE FAILED: off=%d len=%d dest[%d]=0x%02x\n",
|
||||
off, len, i+off, dest[i+off]);
|
||||
failed=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
|
||||
#ifdef WITH_IPP
|
||||
/* Test IPP under various alignments */
|
||||
strcat(testStr, " IPP");
|
||||
for (off=0; off<16; ++off)
|
||||
{
|
||||
int len;
|
||||
for (len=1; len<48-off; ++len)
|
||||
{
|
||||
int i;
|
||||
memset(dest, 0, sizeof(dest));
|
||||
ippsSet_8u(0xa5, dest+off, len);
|
||||
for (i=0; i<len; ++i)
|
||||
{
|
||||
if (dest[off+i] != 0xa5)
|
||||
{
|
||||
printf("SET8U-IPP FAILED: off=%d len=%d dest[%d]=0x%02x\n",
|
||||
off, len, i+off, dest[i+off]);
|
||||
failed=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* WITH_IPP */
|
||||
|
||||
if (!failed) printf("All set8u tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(set8u_speed_test, BYTE, BYTE, dst=dst,
|
||||
TRUE, memset(dst, constant, size),
|
||||
FALSE, NULL, 0,
|
||||
FALSE, NULL, 0,
|
||||
TRUE, ippsSet_8u(constant, dst, size));
|
||||
|
||||
int test_set8u_speed(void)
|
||||
{
|
||||
BYTE ALIGN(dst[MAX_TEST_SIZE]);
|
||||
set8u_speed_test("set8u", "aligned", NULL, NULL, 0xA5, dst,
|
||||
set_sizes, NUM_SET_SIZES, MEMSET8_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_set32s_func(void)
|
||||
{
|
||||
primitives_t *prims = primitives_get();
|
||||
INT32 ALIGN(dest[512]);
|
||||
int failed = 0;
|
||||
int off;
|
||||
char testStr[256];
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
testStr[0] = '\0';
|
||||
|
||||
#ifdef i386
|
||||
/* Test SSE under various alignments */
|
||||
if (pflags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE2");
|
||||
for (off=0; off<16; ++off) {
|
||||
int len;
|
||||
for (len=1; len<512-off; ++len)
|
||||
{
|
||||
int i;
|
||||
memset(dest, 0, sizeof(dest));
|
||||
sse2_set_32s(0xdeadbeef, dest+off, len);
|
||||
for (i=0; i<len; ++i)
|
||||
{
|
||||
if (dest[off+i] != 0xdeadbeef)
|
||||
{
|
||||
printf("set32s-SSE FAIL: off=%d len=%d dest[%d]=0x%08x\n",
|
||||
off, len, i+off, dest[i+off]);
|
||||
failed=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
|
||||
#ifdef WITH_IPP
|
||||
strcat(testStr, " IPP");
|
||||
for (off=0; off<16; ++off) {
|
||||
int len;
|
||||
for (len=1; len<512-off; ++len)
|
||||
{
|
||||
int i;
|
||||
memset(dest, 0, sizeof(dest));
|
||||
ippsSet_32s(0xdeadbeef, dest+off, len);
|
||||
for (i=0; i<len; ++i)
|
||||
{
|
||||
if (dest[off+i] != 0xdeadbeef)
|
||||
{
|
||||
printf("set32s-IPP FAIL: off=%d len=%d dest[%d]=0x%08x\n",
|
||||
off, len, i+off, dest[i+off]);
|
||||
failed=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* WITH_IPP */
|
||||
|
||||
if (!failed) printf("All set32s tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_set32u_func(void)
|
||||
{
|
||||
primitives_t *prims = primitives_get();
|
||||
UINT32 ALIGN(dest[512]);
|
||||
int failed = 0;
|
||||
int off;
|
||||
char testStr[256];
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
testStr[0] = '\0';
|
||||
|
||||
#ifdef i386
|
||||
/* Test SSE under various alignments */
|
||||
if (pflags & PRIM_X86_SSE2_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSE2");
|
||||
for (off=0; off<16; ++off) {
|
||||
int len;
|
||||
for (len=1; len<512-off; ++len)
|
||||
{
|
||||
int i;
|
||||
memset(dest, 0, sizeof(dest));
|
||||
sse2_set_32u(0xdeadbeefU, dest+off, len);
|
||||
for (i=0; i<len; ++i)
|
||||
{
|
||||
if (dest[off+i] != 0xdeadbeefU)
|
||||
{
|
||||
printf("set32u-SSE FAIL: off=%d len=%d dest[%d]=0x%08x\n",
|
||||
off, len, i+off, dest[i+off]);
|
||||
failed=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
|
||||
#ifdef WITH_IPP
|
||||
strcat(testStr, " IPP");
|
||||
for (off=0; off<16; ++off) {
|
||||
int len;
|
||||
for (len=1; len<512-off; ++len)
|
||||
{
|
||||
memset(dest, 0, sizeof(dest));
|
||||
ipp_wrapper_set_32u(0xdeadbeefU, dest+off, len);
|
||||
int i;
|
||||
for (i=0; i<len; ++i)
|
||||
{
|
||||
if (dest[off+i] != 0xdeadbeefU)
|
||||
{
|
||||
printf("set32u-IPP FAIL: off=%d len=%d dest[%d]=0x%08x\n",
|
||||
off, len, i+off, dest[i+off]);
|
||||
failed=1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* WITH_IPP */
|
||||
|
||||
if (!failed) printf("All set32u tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static inline void memset32u_naive(
|
||||
UINT32 val,
|
||||
UINT32 *dst,
|
||||
size_t count)
|
||||
{
|
||||
while (count--) *dst++ = val;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(set32u_speed_test, UINT32, UINT32, dst=dst,
|
||||
TRUE, memset32u_naive(constant, dst, size),
|
||||
TRUE, sse2_set_32u(constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ipp_wrapper_set_32u(constant, dst, size));
|
||||
|
||||
int test_set32u_speed(void)
|
||||
{
|
||||
UINT32 ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||
set32u_speed_test("set32u", "aligned", NULL, NULL, 0xdeadbeef, dst,
|
||||
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
|
||||
#if 0
|
||||
/* Not really necessary; should be almost as fast. */
|
||||
set32u_speed_test("set32u", "unaligned", NULL, NULL, dst+1,
|
||||
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
|
||||
#endif
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
static inline void memset32s_naive(
|
||||
INT32 val,
|
||||
INT32 *dst,
|
||||
size_t count)
|
||||
{
|
||||
while (count--) *dst++ = val;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(set32s_speed_test, INT32, INT32, dst=dst,
|
||||
TRUE, memset32s_naive(constant, dst, size),
|
||||
TRUE, sse2_set_32s(constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsSet_32s(constant, dst, size));
|
||||
|
||||
int test_set32s_speed(void)
|
||||
{
|
||||
INT32 ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||
set32s_speed_test("set32s", "aligned", NULL, NULL, 0xdeadbeef, dst,
|
||||
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
|
||||
#if 0
|
||||
/* Not really necessary; should be almost as fast. */
|
||||
set32s_speed_test("set32s", "unaligned", NULL, NULL, dst+1,
|
||||
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
|
||||
#endif
|
||||
return SUCCESS;
|
||||
}
|
|
@ -0,0 +1,173 @@
|
|||
/* test_shift.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
#define FUNC_TEST_SIZE 65536
|
||||
static const int SHIFT_PRETEST_ITERATIONS = 50000;
|
||||
static const float TEST_TIME = 1.0;
|
||||
|
||||
extern pstatus_t general_lShiftC_16s(
|
||||
const INT16 *pSrc, int val, INT16 *pDst, int len);
|
||||
extern pstatus_t general_rShiftC_16s(
|
||||
const INT16 *pSrc, int val, INT16 *pDst, int len);
|
||||
extern pstatus_t general_shiftC_16s(
|
||||
const INT16 *pSrc, int val, INT16 *pDst, int len);
|
||||
extern pstatus_t general_lShiftC_16u(
|
||||
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
|
||||
extern pstatus_t general_rShiftC_16u(
|
||||
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
|
||||
extern pstatus_t general_shiftC_16u(
|
||||
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
|
||||
extern pstatus_t sse2_lShiftC_16s(
|
||||
const INT16 *pSrc, int val, INT16 *pDst, int len);
|
||||
extern pstatus_t sse2_rShiftC_16s(
|
||||
const INT16 *pSrc, int val, INT16 *pDst, int len);
|
||||
extern pstatus_t sse2_shiftC_16s(
|
||||
const INT16 *pSrc, int val, INT16 *pDst, int len);
|
||||
extern pstatus_t sse2_lShiftC_16u(
|
||||
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
|
||||
extern pstatus_t sse2_rShiftC_16u(
|
||||
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
|
||||
extern pstatus_t sse2_shiftC_16u(
|
||||
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
|
||||
|
||||
#ifdef i386
|
||||
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
|
||||
int _name_(void) \
|
||||
{ \
|
||||
_type_ ALIGN(src[FUNC_TEST_SIZE+3]), \
|
||||
ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \
|
||||
int failed = 0; \
|
||||
int i; \
|
||||
UINT32 pflags = primitives_get_flags(primitives_get()); \
|
||||
char testStr[256]; \
|
||||
testStr[0] = '\0'; \
|
||||
get_random_data(src, sizeof(src)); \
|
||||
_f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \
|
||||
if (pflags & PRIM_X86_SSE3_AVAILABLE) \
|
||||
{ \
|
||||
strcat(testStr, " SSE3"); \
|
||||
/* Aligned */ \
|
||||
_f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \
|
||||
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
|
||||
{ \
|
||||
if (d1[i] != d2[i]) \
|
||||
{ \
|
||||
printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
|
||||
_str_, i, src[i], d1[i], d2[i]); \
|
||||
++failed; \
|
||||
} \
|
||||
} \
|
||||
/* Unaligned */ \
|
||||
_f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \
|
||||
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
|
||||
{ \
|
||||
if (d1[i] != d2[i+1]) \
|
||||
{ \
|
||||
printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
|
||||
_str_, i, src[i], d1[i], d2[i+1]); \
|
||||
++failed; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \
|
||||
return (failed > 0) ? FAILURE : SUCCESS; \
|
||||
}
|
||||
#else
|
||||
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
|
||||
int _name_(void) \
|
||||
{ \
|
||||
return SUCCESS; \
|
||||
}
|
||||
#endif /* i386 */
|
||||
|
||||
SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s,
|
||||
sse2_lShiftC_16s)
|
||||
SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u,
|
||||
sse2_lShiftC_16u)
|
||||
SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s,
|
||||
sse2_rShiftC_16s)
|
||||
SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u,
|
||||
sse2_rShiftC_16u)
|
||||
|
||||
/* ========================================================================= */
|
||||
STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst=dst,
|
||||
TRUE, general_lShiftC_16s(src1, constant, dst, size),
|
||||
TRUE, sse2_lShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsLShiftC_16s(src1, constant, dst, size));
|
||||
STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst=dst,
|
||||
TRUE, general_lShiftC_16u(src1, constant, dst, size),
|
||||
TRUE, sse2_lShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsLShiftC_16u(src1, constant, dst, size));
|
||||
STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst=dst,
|
||||
TRUE, general_rShiftC_16s(src1, constant, dst, size),
|
||||
TRUE, sse2_rShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsRShiftC_16s(src1, constant, dst, size));
|
||||
STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst=dst,
|
||||
TRUE, general_rShiftC_16u(src1, constant, dst, size),
|
||||
TRUE, sse2_rShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
TRUE, ippsRShiftC_16u(src1, constant, dst, size));
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_lShift_16s_speed(void)
|
||||
{
|
||||
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||
get_random_data(src, sizeof(src));
|
||||
speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
speed_lShift_16s("lShift_16s", "unaligned", src+1, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_lShift_16u_speed(void)
|
||||
{
|
||||
UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||
get_random_data(src, sizeof(src));
|
||||
speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
speed_lShift_16u("lShift_16u", "unaligned", src+1, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_rShift_16s_speed(void)
|
||||
{
|
||||
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||
get_random_data(src, sizeof(src));
|
||||
speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
speed_rShift_16s("rShift_16s", "unaligned", src+1, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_rShift_16u_speed(void)
|
||||
{
|
||||
UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||
get_random_data(src, sizeof(src));
|
||||
speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
speed_rShift_16u("rShift_16u", "unaligned", src+1, NULL, 3, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
/* test_sign.c
|
||||
* vi:ts=4 sw=4
|
||||
*
|
||||
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License. You may obtain
|
||||
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
* or implied. See the License for the specific language governing
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include "prim_test.h"
|
||||
|
||||
static const int SIGN_PRETEST_ITERATIONS = 100000;
|
||||
static const float TEST_TIME = 1.0;
|
||||
|
||||
extern pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, int len);
|
||||
extern pstatus_t ssse3_sign_16s(const INT16 *pSrc, INT16 *pDst, int len);
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
int test_sign16s_func(void)
|
||||
{
|
||||
INT16 ALIGN(src[65535]), ALIGN(d1[65535]), ALIGN(d2[65535]);
|
||||
int failed = 0;
|
||||
int i;
|
||||
UINT32 pflags = primitives_get_flags(primitives_get());
|
||||
char testStr[256];
|
||||
|
||||
/* Test when we can reach 16-byte alignment */
|
||||
testStr[0] = '\0';
|
||||
get_random_data(src, sizeof(src));
|
||||
general_sign_16s(src+1, d1+1, 65535);
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSSE3_AVAILABLE)
|
||||
{
|
||||
strcat(testStr, " SSSE3");
|
||||
ssse3_sign_16s(src+1, d2+1, 65535);
|
||||
for (i=1; i<65535; ++i)
|
||||
{
|
||||
if (d1[i] != d2[i])
|
||||
{
|
||||
printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n",
|
||||
i, src[i], d1[i], d2[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
|
||||
/* Test when we cannot reach 16-byte alignment */
|
||||
get_random_data(src, sizeof(src));
|
||||
general_sign_16s(src+1, d1+2, 65535);
|
||||
#ifdef i386
|
||||
if (pflags & PRIM_X86_SSSE3_AVAILABLE)
|
||||
{
|
||||
ssse3_sign_16s(src+1, d2+2, 65535);
|
||||
for (i=2; i<65535; ++i)
|
||||
{
|
||||
if (d1[i] != d2[i])
|
||||
{
|
||||
printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n",
|
||||
i, src[i-1], d1[i], d2[i]);
|
||||
++failed;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* i386 */
|
||||
if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
|
||||
return (failed > 0) ? FAILURE : SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst=dst,
|
||||
TRUE, general_sign_16s(src1, dst, size),
|
||||
TRUE, ssse3_sign_16s(src1, dst, size), PRIM_X86_SSSE3_AVAILABLE,
|
||||
FALSE, dst=dst, 0,
|
||||
FALSE, dst=dst);
|
||||
|
||||
int test_sign16s_speed(void)
|
||||
{
|
||||
INT16 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
|
||||
get_random_data(src, sizeof(src));
|
||||
sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
|
||||
sign16s_speed_test("sign16s", "unaligned", src+1, NULL, 0, dst,
|
||||
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
|
||||
return SUCCESS;
|
||||
}
|
|
@ -64,7 +64,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS}
|
|||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-utils freerdp-codec)
|
||||
MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
|
||||
|
||||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
|
|
|
@ -33,7 +33,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server)
|
|||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-utils freerdp-codec)
|
||||
MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
|
||||
|
||||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
|
|
|
@ -59,7 +59,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server)
|
|||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-utils freerdp-codec)
|
||||
MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
|
||||
|
||||
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
|
||||
|
||||
|
|
|
@ -90,7 +90,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} ${X11_LIBRARIES})
|
|||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
MODULE freerdp
|
||||
MODULES freerdp-core freerdp-common freerdp-codec freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale)
|
||||
MODULES freerdp-core freerdp-common freerdp-codec freerdp-primitives freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale)
|
||||
|
||||
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
|
||||
MONOLITHIC ${MONOLITHIC_BUILD}
|
||||
|
|
Loading…
Reference in New Issue