This commit is contained in:
Marc-André Moreau 2013-01-18 19:28:55 -05:00
commit c6e6956a43
49 changed files with 5696 additions and 407 deletions

View File

@ -231,6 +231,10 @@ set(GSTREAMER_FEATURE_TYPE "RECOMMENDED")
set(GSTREAMER_FEATURE_PURPOSE "multimedia")
set(GSTREAMER_FEATURE_DESCRIPTION "multimedia redirection, audio and video playback")
set(IPP_FEATURE_TYPE "OPTIONAL")
set(IPP_FEATURE_PURPOSE "performance")
set(IPP_FEATURE_DESCRIPTION "Intel Integrated Performance Primitives library")
if(WIN32)
set(X11_FEATURE_TYPE "DISABLED")
set(ZLIB_FEATURE_TYPE "DISABLED")
@ -285,6 +289,9 @@ find_feature(PCSC ${PCSC_FEATURE_TYPE} ${PCSC_FEATURE_PURPOSE} ${PCSC_FEATURE_DE
find_feature(FFmpeg ${FFMPEG_FEATURE_TYPE} ${FFMPEG_FEATURE_PURPOSE} ${FFMPEG_FEATURE_DESCRIPTION})
find_feature(Gstreamer ${GSTREAMER_FEATURE_TYPE} ${GSTREAMER_FEATURE_PURPOSE} ${GSTREAMER_FEATURE_DESCRIPTION})
# Intel Performance Primitives
find_feature(IPP ${IPP_FEATURE_TYPE} ${IPP_FEATURE_PURPOSE} ${IPP_FEATURE_DESCRIPTION})
# Installation Paths
if(WIN32)
set(CMAKE_INSTALL_BINDIR ".")

View File

@ -36,7 +36,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-utils)
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-primitives freerdp-utils)
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})

View File

@ -78,7 +78,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-rail freerdp-utils)
MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-primitives freerdp-rail freerdp-utils)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
MODULE winpr

View File

@ -46,7 +46,7 @@ set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-utils)
MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-primitives freerdp-utils)
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})

View File

@ -120,10 +120,14 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-rail freerdp-utils)
MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-primitives freerdp-rail freerdp-utils)
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
if(WITH_IPP)
target_link_libraries(xfreerdp ${IPP_LIBRARY_LIST})
endif()
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "Client/X11")

View File

@ -9,6 +9,7 @@ endif()
option(WITH_MANPAGES "Generate manpages." ON)
option(WITH_PROFILER "Compile profiler." OFF)
option(WITH_IPP "Use Intel Performance Primitives." OFF)
if((TARGET_ARCH MATCHES "x86|x64") AND (NOT DEFINED WITH_SSE2))
option(WITH_SSE2 "Enable SSE2 optimization." ON)

View File

@ -265,7 +265,7 @@ if(NOT IPP_FOUND)
# Note, if several IPP installations found the newest version will be
# selected
# ------------------------------------------------------------------------
foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH})
foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt)
set(curdir ${curdir}/intel)
file(TO_CMAKE_PATH ${curdir} CURDIR)
@ -336,3 +336,33 @@ if(WIN32 AND MINGW AND NOT IPP_LATEST_VERSION_MAJOR LESS 7)
set(MSV_NTDLL "ntdll")
set(IPP_LIBRARIES ${IPP_LIBRARIES} ${MSV_NTDLL}${IPP_LIB_SUFFIX})
endif()
# ------------------------------------------------------------------------
# This section will look for the IPP "compiler" dependent library
# libiomp5.
# ------------------------------------------------------------------------
foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt)
set(curdir ${curdir}/intel)
if(EXISTS ${curdir})
file(GLOB_RECURSE liblist FOLLOW_SYMLINKS ${curdir}/libiomp5.*)
foreach(lib ${liblist})
get_filename_component(libdir ${lib} REALPATH)
get_filename_component(libdir ${libdir} PATH)
set(IPP_COMPILER_LIBRARY_DIRS ${libdir})
set(IPP_COMPILER_LIBRARIES iomp5)
endforeach(lib)
endif()
endforeach(curdir)
# ------------------------------------------------------------------------
# Build fullpath library list.
# ------------------------------------------------------------------------
find_library(LIB_IPPI ippi PATHS ${IPP_LIBRARY_DIRS})
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPI})
find_library(LIB_IPPS ipps PATHS ${IPP_LIBRARY_DIRS})
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPS})
find_library(LIB_IPPCORE ippcore PATHS ${IPP_LIBRARY_DIRS})
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPCORE})
find_library(LIB_IOMP5 iomp5 PATHS ${IPP_COMPILER_LIBRARY_DIRS})
set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IOMP5})

View File

@ -36,6 +36,7 @@
#cmakedefine WITH_PROFILER
#cmakedefine WITH_SSE2
#cmakedefine WITH_NEON
#cmakedefine WITH_IPP
#cmakedefine WITH_NATIVE_SSPI
#cmakedefine WITH_JPEG
#cmakedefine WITH_WIN8

View File

@ -101,8 +101,6 @@ struct _RFX_CONTEXT
BYTE quant_idx_cr;
/* routines */
void (*decode_ycbcr_to_rgb)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
void (*encode_rgb_to_ycbcr)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
void (*quantization_decode)(INT16* buffer, const UINT32* quantization_values);
void (*quantization_encode)(INT16* buffer, const UINT32* quantization_values);
void (*dwt_2d_decode)(INT16* buffer, INT16* dwt_buffer);

View File

@ -0,0 +1,207 @@
/* primitives.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIMITIVES_H_INCLUDED__
#define __PRIMITIVES_H_INCLUDED__
#include <freerdp/api.h>
#include <freerdp/types.h>
typedef INT32 pstatus_t; /* match IppStatus. */
#define PRIMITIVES_SUCCESS (0) /* match ippStsNoErr */
/* Simple macro for address of an x,y location in 2d 4-byte memory block */
#define PIXMAP4_ADDR(_dst_, _x_, _y_, _span_) \
((void *) (((BYTE *) (_dst_)) + (((_x_) + (_y_)*(_span_)) << 2)))
#define PRIM_X86_MMX_AVAILABLE (1U<<0)
#define PRIM_X86_3DNOW_AVAILABLE (1U<<1)
#define PRIM_X86_3DNOW_PREFETCH_AVAILABLE (1U<<2)
#define PRIM_X86_SSE_AVAILABLE (1U<<3)
#define PRIM_X86_SSE2_AVAILABLE (1U<<4)
#define PRIM_X86_SSE3_AVAILABLE (1U<<5)
#define PRIM_X86_SSSE3_AVAILABLE (1U<<6)
#define PRIM_X86_SSE41_AVAILABLE (1U<<7)
#define PRIM_X86_SSE42_AVAILABLE (1U<<8)
#define PRIM_X86_AVX_AVAILABLE (1U<<9)
#define PRIM_X86_FMA_AVAILABLE (1U<<10)
#define PRIM_X86_AVX_AES_AVAILABLE (1U<<11)
#define PRIM_X86_AVX2_AVAILABLE (1U<<12)
#define PRIM_ARM_VFP1_AVAILABLE (1U<<0)
#define PRIM_ARM_VFP2_AVAILABLE (1U<<1)
#define PRIM_ARM_VFP3_AVAILABLE (1U<<2)
#define PRIM_ARM_VFP4_AVAILABLE (1U<<3)
#define PRIM_ARM_FPA_AVAILABLE (1U<<4)
#define PRIM_ARM_FPE_AVAILABLE (1U<<5)
#define PRIM_ARM_IWMMXT_AVAILABLE (1U<<6)
#define PRIM_ARM_NEON_AVAILABLE (1U<<7)
/* Structures compatible with IPP */
typedef struct
{
INT32 width;
INT32 height;
} prim_size_t; /* like IppiSize */
/* Function prototypes for all of the supported primitives. */
typedef pstatus_t (*__copy_t)(
const void *pSrc,
void *pDst,
INT32 bytes);
typedef pstatus_t (*__copy_8u_t)(
const BYTE *pSrc,
BYTE *pDst,
INT32 len);
typedef pstatus_t (*__copy_8u_AC4r_t)(
const BYTE *pSrc,
INT32 srcStep, /* bytes */
BYTE *pDst,
INT32 dstStep, /* bytes */
INT32 width, INT32 height); /* pixels */
typedef pstatus_t (*__set_8u_t)(
BYTE val,
BYTE *pDst,
INT32 len);
typedef pstatus_t (*__set_32s_t)(
INT32 val,
INT32 *pDst,
INT32 len);
typedef pstatus_t (*__set_32u_t)(
UINT32 val,
UINT32 *pDst,
INT32 len);
typedef pstatus_t (*__zero_t)(
void *pDst,
size_t bytes);
typedef pstatus_t (*__alphaComp_argb_t)(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height);
typedef pstatus_t (*__add_16s_t)(
const INT16 *pSrc1,
const INT16 *pSrc2,
INT16 *pDst,
INT32 len);
typedef pstatus_t (*__lShiftC_16s_t)(
const INT16 *pSrc,
INT32 val,
INT16 *pSrcDst,
INT32 len);
typedef pstatus_t (*__lShiftC_16u_t)(
const UINT16 *pSrc,
INT32 val,
UINT16 *pSrcDst,
INT32 len);
typedef pstatus_t (*__rShiftC_16s_t)(
const INT16 *pSrc,
INT32 val,
INT16 *pSrcDst,
INT32 len);
typedef pstatus_t (*__rShiftC_16u_t)(
const UINT16 *pSrc,
INT32 val,
UINT16 *pSrcDst,
INT32 len);
typedef pstatus_t (*__shiftC_16s_t)(
const INT16 *pSrc,
INT32 val,
INT16 *pSrcDst,
INT32 len);
typedef pstatus_t (*__shiftC_16u_t)(
const UINT16 *pSrc,
INT32 val,
UINT16 *pSrcDst,
INT32 len);
typedef pstatus_t (*__sign_16s_t)(
const INT16 *pSrc,
INT16 *pDst,
INT32 len);
typedef pstatus_t (*__yCbCrToRGB_16s16s_P3P3_t)(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi);
typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi);
typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
const INT16 *pSrc[3], INT32 srcStep,
BYTE *pDst, INT32 dstStep,
const prim_size_t *roi);
typedef pstatus_t (*__andC_32u_t)(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
INT32 len);
typedef pstatus_t (*__orC_32u_t)(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
INT32 len);
typedef struct
{
/* Memory-to-memory copy routines */
__copy_t copy; /* memcpy/memmove, basically */
__copy_8u_t copy_8u; /* more strongly typed */
__copy_8u_AC4r_t copy_8u_AC4r; /* pixel copy function */
/* Memory setting routines */
__set_8u_t set_8u; /* memset, basically */
__set_32s_t set_32s;
__set_32u_t set_32u;
__zero_t zero; /* bzero or faster */
/* Arithmetic functions */
__add_16s_t add_16s;
/* And/or */
__andC_32u_t andC_32u;
__orC_32u_t orC_32u;
/* Shifts */
__lShiftC_16s_t lShiftC_16s;
__lShiftC_16u_t lShiftC_16u;
__rShiftC_16s_t rShiftC_16s;
__rShiftC_16u_t rShiftC_16u;
__shiftC_16s_t shiftC_16s;
__shiftC_16u_t shiftC_16u;
/* Alpha Composition */
__alphaComp_argb_t alphaComp_argb;
/* Sign */
__sign_16s_t sign_16s;
/* Color conversions */
__yCbCrToRGB_16s16s_P3P3_t yCbCrToRGB_16s16s_P3P3;
__RGBToYCbCr_16s16s_P3P3_t RGBToYCbCr_16s16s_P3P3;
__RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R;
/* internal use for CPU flags and such. */
void *hints;
} primitives_t;
/* Prototypes for the externally-visible entrypoints. */
FREERDP_API void primitives_init(void);
FREERDP_API primitives_t *primitives_get(void);
FREERDP_API UINT32 primitives_get_flags(
const primitives_t *prims);
FREERDP_API void primitives_flags_str(
const primitives_t *prims,
char *str,
size_t len);
FREERDP_API void primitives_deinit(void);
#endif /* !__PRIMITIVES_H_INCLUDED__ */

View File

@ -31,6 +31,7 @@ set(${MODULE_PREFIX}_SUBMODULES
codec
crypto
locale
primitives
core)
foreach(${MODULE_PREFIX}_SUBMODULE ${${MODULE_PREFIX}_SUBMODULES})

View File

@ -79,7 +79,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context)
PROFILER_CREATE(context->priv->prof_rfx_differential_decode, "rfx_differential_decode");
PROFILER_CREATE(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode");
PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode");
PROFILER_CREATE(context->priv->prof_rfx_decode_ycbcr_to_rgb, "rfx_decode_ycbcr_to_rgb");
PROFILER_CREATE(context->priv->prof_rfx_ycbcr_to_rgb, "prims->yCbCrToRGB");
PROFILER_CREATE(context->priv->prof_rfx_decode_format_rgb, "rfx_decode_format_rgb");
PROFILER_CREATE(context->priv->prof_rfx_encode_rgb, "rfx_encode_rgb");
@ -88,7 +88,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context)
PROFILER_CREATE(context->priv->prof_rfx_differential_encode, "rfx_differential_encode");
PROFILER_CREATE(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode");
PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode");
PROFILER_CREATE(context->priv->prof_rfx_encode_rgb_to_ycbcr, "rfx_encode_rgb_to_ycbcr");
PROFILER_CREATE(context->priv->prof_rfx_rgb_to_ycbcr, "prims->RGBToYCbCr");
PROFILER_CREATE(context->priv->prof_rfx_encode_format_rgb, "rfx_encode_format_rgb");
}
@ -100,7 +100,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context)
PROFILER_FREE(context->priv->prof_rfx_differential_decode);
PROFILER_FREE(context->priv->prof_rfx_quantization_decode);
PROFILER_FREE(context->priv->prof_rfx_dwt_2d_decode);
PROFILER_FREE(context->priv->prof_rfx_decode_ycbcr_to_rgb);
PROFILER_FREE(context->priv->prof_rfx_ycbcr_to_rgb);
PROFILER_FREE(context->priv->prof_rfx_decode_format_rgb);
PROFILER_FREE(context->priv->prof_rfx_encode_rgb);
@ -109,7 +109,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context)
PROFILER_FREE(context->priv->prof_rfx_differential_encode);
PROFILER_FREE(context->priv->prof_rfx_quantization_encode);
PROFILER_FREE(context->priv->prof_rfx_dwt_2d_encode);
PROFILER_FREE(context->priv->prof_rfx_encode_rgb_to_ycbcr);
PROFILER_FREE(context->priv->prof_rfx_rgb_to_ycbcr);
PROFILER_FREE(context->priv->prof_rfx_encode_format_rgb);
}
@ -123,7 +123,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context)
PROFILER_PRINT(context->priv->prof_rfx_differential_decode);
PROFILER_PRINT(context->priv->prof_rfx_quantization_decode);
PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_decode);
PROFILER_PRINT(context->priv->prof_rfx_decode_ycbcr_to_rgb);
PROFILER_PRINT(context->priv->prof_rfx_ycbcr_to_rgb);
PROFILER_PRINT(context->priv->prof_rfx_decode_format_rgb);
PROFILER_PRINT(context->priv->prof_rfx_encode_rgb);
@ -132,7 +132,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context)
PROFILER_PRINT(context->priv->prof_rfx_differential_encode);
PROFILER_PRINT(context->priv->prof_rfx_quantization_encode);
PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_encode);
PROFILER_PRINT(context->priv->prof_rfx_encode_rgb_to_ycbcr);
PROFILER_PRINT(context->priv->prof_rfx_rgb_to_ycbcr);
PROFILER_PRINT(context->priv->prof_rfx_encode_format_rgb);
PROFILER_PRINT_FOOTER;
@ -164,8 +164,6 @@ RFX_CONTEXT* rfx_context_new(void)
rfx_profiler_create(context);
/* set up default routines */
context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb;
context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr;
context->quantization_decode = rfx_quantization_decode;
context->quantization_encode = rfx_quantization_encode;
context->dwt_2d_decode = rfx_dwt_2d_decode;
@ -414,7 +412,7 @@ static void rfx_process_message_tile(RFX_CONTEXT* context, RFX_TILE* tile, STREA
YLen, context->quants + (quantIdxY * 10),
CbLen, context->quants + (quantIdxCb * 10),
CrLen, context->quants + (quantIdxCr * 10),
tile->data);
tile->data, 64*sizeof(UINT32));
}
static void rfx_process_message_tileset(RFX_CONTEXT* context, RFX_MESSAGE* message, STREAM* s)

View File

@ -27,6 +27,7 @@
#include <string.h>
#include <freerdp/utils/stream.h>
#include <freerdp/primitives.h>
#include "rfx_types.h"
#include "rfx_rlgr.h"
@ -36,49 +37,55 @@
#include "rfx_decode.h"
/* stride is bytes between rows in the output buffer. */
static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf,
RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf)
RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf, int stride)
{
primitives_t *prims = primitives_get();
INT16* r = r_buf;
INT16* g = g_buf;
INT16* b = b_buf;
INT16* pSrc[3];
static const prim_size_t roi_64x64 = { 64, 64 };
BYTE* dst = dst_buf;
int i;
int x, y;
switch (pixel_format)
{
case RDP_PIXEL_FORMAT_B8G8R8A8:
for (i = 0; i < 4096; i++)
{
*dst++ = (BYTE) (*b++);
*dst++ = (BYTE) (*g++);
*dst++ = (BYTE) (*r++);
*dst++ = 0xFF;
}
pSrc[0] = r; pSrc[1] = g; pSrc[2] = b;
prims->RGBToRGB_16s8u_P3AC4R(
(const INT16 **) pSrc, 64*sizeof(INT16),
dst, stride, &roi_64x64);
break;
case RDP_PIXEL_FORMAT_R8G8B8A8:
for (i = 0; i < 4096; i++)
{
*dst++ = (BYTE) (*r++);
*dst++ = (BYTE) (*g++);
*dst++ = (BYTE) (*b++);
*dst++ = 0xFF;
}
pSrc[0] = b; pSrc[1] = g; pSrc[2] = r;
prims->RGBToRGB_16s8u_P3AC4R(
(const INT16 **) pSrc, 64*sizeof(INT16),
dst, stride, &roi_64x64);
break;
case RDP_PIXEL_FORMAT_B8G8R8:
for (i = 0; i < 4096; i++)
for (y=0; y<64; y++)
{
*dst++ = (BYTE) (*b++);
*dst++ = (BYTE) (*g++);
*dst++ = (BYTE) (*r++);
for (x=0; x<64; x++)
{
*dst++ = (BYTE) (*b++);
*dst++ = (BYTE) (*g++);
*dst++ = (BYTE) (*r++);
}
dst += stride - (64*3);
}
break;
case RDP_PIXEL_FORMAT_R8G8B8:
for (i = 0; i < 4096; i++)
for (y=0; y<64; y++)
{
*dst++ = (BYTE) (*r++);
*dst++ = (BYTE) (*g++);
*dst++ = (BYTE) (*b++);
for (x=0; x<64; x++)
{
*dst++ = (BYTE) (*r++);
*dst++ = (BYTE) (*g++);
*dst++ = (BYTE) (*b++);
}
dst += stride - (64*3);
}
break;
default:
@ -86,69 +93,6 @@ static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf,
}
}
#define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v)))
void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf)
{
/* INT32 is used intentionally because we calculate with shifted factors! */
INT32 y, cb, cr;
INT32 r, g, b;
int i;
/**
* The decoded YCbCr coeffectients are represented as 11.5 fixed-point numbers:
*
* 1 sign bit + 10 integer bits + 5 fractional bits
*
* However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
* In other words, the decoded coeffectients is scaled by << 5 when intepreted as INT16.
* It was scaled in the quantization phase, so we must scale it back here.
*/
for (i = 0; i < 4096; i++)
{
y = y_r_buf[i];
cb = cb_g_buf[i];
cr = cr_b_buf[i];
#if 0
/**
* This is the slow floating point version kept here for reference
*/
y = y + 4096; /* 128<<5=4096 so that we can scale the sum by >> 5 */
r = y + cr*1.403f;
g = y - cb*0.344f - cr*0.714f;
b = y + cb*1.770f;
y_r_buf[i] = MINMAX(r>>5, 0, 255);
cb_g_buf[i] = MINMAX(g>>5, 0, 255);
cr_b_buf[i] = MINMAX(b>>5, 0, 255);
#else
/**
* We scale the factors by << 16 into 32-bit integers in order to avoid slower
* floating point multiplications. Since the final result needs to be scaled
* by >> 5 we will extract only the upper 11 bits (>> 21) from the final sum.
* Hence we also have to scale the other terms of the sum by << 16.
*
* R: 1.403 << 16 = 91947
* G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
* B: 1.770 << 16 = 115998
*/
y = (y+4096)<<16;
r = y + cr*91947;
g = y - cb*22544 - cr*46792;
b = y + cb*115998;
y_r_buf[i] = MINMAX(r>>21, 0, 255);
cb_g_buf[i] = MINMAX(g>>21, 0, 255);
cr_b_buf[i] = MINMAX(b>>21, 0, 255);
#endif
}
}
static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantization_values,
const BYTE* data, int size, INT16* buffer)
{
@ -173,11 +117,18 @@ static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantizatio
PROFILER_EXIT(context->priv->prof_rfx_decode_component);
}
/* rfx_decode_ycbcr_to_rgb code now resides in the primitives library. */
/* stride is bytes between rows in the output buffer. */
void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
int y_size, const UINT32 * y_quants,
int cb_size, const UINT32 * cb_quants,
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer)
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer, int stride)
{
static const prim_size_t roi_64x64 = { 64, 64 };
const primitives_t *prims = primitives_get();
INT16 *pSrcDst[3];
PROFILER_ENTER(context->priv->prof_rfx_decode_rgb);
rfx_decode_component(context, y_quants, stream_get_tail(data_in), y_size, context->priv->y_r_buffer); /* YData */
@ -188,12 +139,16 @@ void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
stream_seek(data_in, cr_size);
PROFILER_ENTER(context->priv->prof_rfx_decode_ycbcr_to_rgb);
context->decode_ycbcr_to_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer);
pSrcDst[0] = context->priv->y_r_buffer;
pSrcDst[1] = context->priv->cb_g_buffer;
pSrcDst[2] = context->priv->cr_b_buffer;
prims->yCbCrToRGB_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16),
pSrcDst, 64*sizeof(INT16), &roi_64x64);
PROFILER_EXIT(context->priv->prof_rfx_decode_ycbcr_to_rgb);
PROFILER_ENTER(context->priv->prof_rfx_decode_format_rgb);
rfx_decode_format_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer,
context->pixel_format, rgb_buffer);
context->pixel_format, rgb_buffer, stride);
PROFILER_EXIT(context->priv->prof_rfx_decode_format_rgb);
PROFILER_EXIT(context->priv->prof_rfx_decode_rgb);

View File

@ -22,12 +22,12 @@
#include <freerdp/codec/rfx.h>
void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
/* stride is bytes between rows in the output buffer. */
void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
int y_size, const UINT32 * y_quants,
int cb_size, const UINT32 * cb_quants,
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer);
int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer,
int stride);
#endif /* __RFX_DECODE_H */

View File

@ -26,6 +26,8 @@
#include <stdlib.h>
#include <string.h>
#include <freerdp/primitives.h>
#include "rfx_types.h"
#include "rfx_rlgr.h"
#include "rfx_differential.h"
@ -180,47 +182,7 @@ static void rfx_encode_format_rgb(const BYTE* rgb_data, int width, int height, i
}
}
void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf)
{
/* INT32 is used intentionally because we calculate with shifted factors! */
int i;
INT32 r, g, b;
INT32 y, cb, cr;
/**
* The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers:
*
* 1 sign bit + 10 integer bits + 5 fractional bits
*
* However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
* In other words, the encoded coefficients is scaled by << 5 when interpreted as INT16.
* It will be scaled down to original during the quantization phase.
*/
for (i = 0; i < 4096; i++)
{
r = y_r_buf[i];
g = cb_g_buf[i];
b = cr_b_buf[i];
/*
* We scale the factors by << 15 into 32-bit integers in order to avoid slower
* floating point multiplications. Since the terms need to be scaled by << 5 we
* simply scale the final sum by >> 10
*
* Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235, 0.114000 << 15 = 3735
* Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868, 0.500590 << 15 = 16403
* Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, 0.081282 << 15 = 2663
*/
y = (r * 9798 + g * 19235 + b * 3735) >> 10;
cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
y_r_buf[i] = MINMAX(y - 4096, -4096, 4095);
cb_g_buf[i] = MINMAX(cb, -4096, 4095);
cr_b_buf[i] = MINMAX(cr, -4096, 4095);
}
}
/* rfx_encode_rgb_to_ycbcr code now resides in the primitives library. */
static void rfx_encode_component(RFX_CONTEXT* context, const UINT32* quantization_values,
INT16* data, BYTE* buffer, int buffer_size, int* size)
@ -250,6 +212,9 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h
const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants,
STREAM* data_out, int* y_size, int* cb_size, int* cr_size)
{
primitives_t *prims = primitives_get();
INT16* pSrcDst[3];
static const prim_size_t roi_64x64 = { 64, 64 };
INT16* y_r_buffer = context->priv->y_r_buffer;
INT16* cb_g_buffer = context->priv->cb_g_buffer;
INT16* cr_b_buffer = context->priv->cr_b_buffer;
@ -261,9 +226,13 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h
context->pixel_format, context->palette, y_r_buffer, cb_g_buffer, cr_b_buffer);
PROFILER_EXIT(context->priv->prof_rfx_encode_format_rgb);
PROFILER_ENTER(context->priv->prof_rfx_encode_rgb_to_ycbcr);
context->encode_rgb_to_ycbcr(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer);
PROFILER_EXIT(context->priv->prof_rfx_encode_rgb_to_ycbcr);
PROFILER_ENTER(context->priv->prof_rfx_rgb_to_ycbcr);
pSrcDst[0] = context->priv->y_r_buffer;
pSrcDst[1] = context->priv->cb_g_buffer;
pSrcDst[2] = context->priv->cr_b_buffer;
prims->RGBToYCbCr_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16),
pSrcDst, 64*sizeof(INT16), &roi_64x64);
PROFILER_EXIT(context->priv->prof_rfx_rgb_to_ycbcr);
/* Ensure the buffer is reasonably large enough */
stream_check_size(data_out, 4096);

View File

@ -22,8 +22,6 @@
#include <freerdp/codec/rfx.h>
void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int height, int rowstride,
const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants,
STREAM* data_out, int* y_size, int* cb_size, int* cr_size);

View File

@ -35,56 +35,7 @@
#include "cpu-features.h"
#endif
void rfx_decode_YCbCr_to_RGB_NEON(INT16 * y_r_buffer, INT16 * cb_g_buffer, INT16 * cr_b_buffer)
{
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t y_add = vdupq_n_s16(128);
int16x8_t* y_r_buf = (int16x8_t*) y_r_buffer;
int16x8_t* cb_g_buf = (int16x8_t*) cb_g_buffer;
int16x8_t* cr_b_buf = (int16x8_t*) cr_b_buffer;
int i;
for (i = 0; i < 4096 / 8; i++)
{
int16x8_t y = vld1q_s16((INT16*) &y_r_buf[i]);
y = vaddq_s16(y, y_add);
int16x8_t cr = vld1q_s16((INT16*) &cr_b_buf[i]);
// r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
int16x8_t r = vaddq_s16(y, cr);
r = vaddq_s16(r, vshrq_n_s16(cr, 2));
r = vaddq_s16(r, vshrq_n_s16(cr, 3));
r = vaddq_s16(r, vshrq_n_s16(cr, 5));
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*)&y_r_buf[i], r);
// cb = cb_g_buf[i];
int16x8_t cb = vld1q_s16((INT16*)&cb_g_buf[i]);
// g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
g = vsubq_s16(g, vshrq_n_s16(cb, 4));
g = vsubq_s16(g, vshrq_n_s16(cb, 5));
g = vsubq_s16(g, vshrq_n_s16(cr, 1));
g = vsubq_s16(g, vshrq_n_s16(cr, 3));
g = vsubq_s16(g, vshrq_n_s16(cr, 4));
g = vsubq_s16(g, vshrq_n_s16(cr, 5));
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*)&cb_g_buf[i], g);
// b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
int16x8_t b = vaddq_s16(y, cb);
b = vaddq_s16(b, vshrq_n_s16(cb, 1));
b = vaddq_s16(b, vshrq_n_s16(cb, 2));
b = vaddq_s16(b, vshrq_n_s16(cb, 6));
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*)&cr_b_buf[i], b);
}
}
/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const UINT32 factor)
@ -338,11 +289,10 @@ void rfx_init_neon(RFX_CONTEXT * context)
{
DEBUG_RFX("Using NEON optimizations");
IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
IF_PROFILER(context->priv->prof_rfx_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON");
IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON");
context->decode_ycbcr_to_rgb = rfx_decode_YCbCr_to_RGB_NEON;
context->quantization_decode = rfx_quantization_decode_NEON;
context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
}

View File

@ -21,36 +21,34 @@
#include "config.h"
#endif
#include <freerdp/primitives.h>
#include "rfx_quantization.h"
static void rfx_quantization_decode_block(INT16* buffer, int buffer_size, UINT32 factor)
static void rfx_quantization_decode_block(const primitives_t *prims, INT16* buffer, int buffer_size, UINT32 factor)
{
INT16* dst;
if (factor == 0)
return;
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
{
*dst <<= factor;
}
prims->lShiftC_16s(buffer, factor, buffer, buffer_size);
}
void rfx_quantization_decode(INT16* buffer, const UINT32* quantization_values)
{
/* Scale the values so that they are represented as 11.5 fixed-point number */
rfx_quantization_decode_block(buffer, 4096, 5);
const primitives_t *prims = primitives_get();
rfx_quantization_decode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */
rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
rfx_quantization_decode_block(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
/* Scale the values so that they are represented as 11.5 fixed-point number */
rfx_quantization_decode_block(prims, buffer, 4096, 5);
rfx_quantization_decode_block(prims, buffer, 1024, quantization_values[8] - 6); /* HL1 */
rfx_quantization_decode_block(prims, buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
rfx_quantization_decode_block(prims, buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
rfx_quantization_decode_block(prims, buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
rfx_quantization_decode_block(prims, buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
rfx_quantization_decode_block(prims, buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
rfx_quantization_decode_block(prims, buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
rfx_quantization_decode_block(prims, buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
rfx_quantization_decode_block(prims, buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
rfx_quantization_decode_block(prims, buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
}
static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32 factor)
@ -62,6 +60,7 @@ static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32
return;
half = (1 << (factor - 1));
/* Could probably use prims->rShiftC_16s(dst+half, factor, dst, buffer_size); */
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
{
*dst = (*dst + half) >> factor;

View File

@ -52,177 +52,8 @@ _mm_prefetch_buffer(char * buffer, int num_bytes)
}
}
static void rfx_decode_ycbcr_to_rgb_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer)
{
__m128i zero = _mm_setzero_si128();
__m128i max = _mm_set1_epi16(255);
__m128i* y_r_buf = (__m128i*) y_r_buffer;
__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
__m128i* cr_b_buf = (__m128i*) cr_b_buffer;
__m128i y;
__m128i cr;
__m128i cb;
__m128i r;
__m128i g;
__m128i b;
int i;
__m128i r_cr = _mm_set1_epi16(22986); // 1.403 << 14
__m128i g_cb = _mm_set1_epi16(-5636); // -0.344 << 14
__m128i g_cr = _mm_set1_epi16(-11698); // -0.714 << 14
__m128i b_cb = _mm_set1_epi16(28999); // 1.770 << 14
__m128i c4096 = _mm_set1_epi16(4096);
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
}
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++)
{
/*
In order to use SSE2 signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without loosing information.
The result of this multiplication is 32 bit and we have two SSE instructions
that return either the hi or lo word.
Thus we will multiply the factors by the highest possible 2^n, take the
upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this
result by multiplying it by 2^(16-n).
For the given factors in the conversion matrix the best possible n is 14.
Example for calculating r:
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
y = _mm_load_si128(&y_r_buf[i]);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
cb = _mm_load_si128(&cb_g_buf[i]);
/* cr = cr_b_buf[i]; */
cr = _mm_load_si128(&cr_b_buf[i]);
/* (y + HIWORD(cr*22986)) >> 3 */
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* y_r_buf[i] = MINMAX(r, 0, 255); */
_mm_between_epi16(r, zero, max);
_mm_store_si128(&y_r_buf[i], r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* cb_g_buf[i] = MINMAX(g, 0, 255); */
_mm_between_epi16(g, zero, max);
_mm_store_si128(&cb_g_buf[i], g);
/* (y + HIWORD(cb*28999)) >> 3 */
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* cr_b_buf[i] = MINMAX(b, 0, 255); */
_mm_between_epi16(b, zero, max);
_mm_store_si128(&cr_b_buf[i], b);
}
}
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
static void rfx_encode_rgb_to_ycbcr_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer)
{
__m128i min = _mm_set1_epi16(-128 << 5);
__m128i max = _mm_set1_epi16(127 << 5);
__m128i* y_r_buf = (__m128i*) y_r_buffer;
__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
__m128i* cr_b_buf = (__m128i*) cr_b_buffer;
__m128i y;
__m128i cr;
__m128i cb;
__m128i r;
__m128i g;
__m128i b;
__m128i y_r = _mm_set1_epi16(9798); // 0.299000 << 15
__m128i y_g = _mm_set1_epi16(19235); // 0.587000 << 15
__m128i y_b = _mm_set1_epi16(3735); // 0.114000 << 15
__m128i cb_r = _mm_set1_epi16(-5535); // -0.168935 << 15
__m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
__m128i cb_b = _mm_set1_epi16(16403); // 0.500590 << 15
__m128i cr_r = _mm_set1_epi16(16377); // 0.499813 << 15
__m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
__m128i cr_b = _mm_set1_epi16(-2663); // -0.081282 << 15
int i;
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
}
for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++)
{
/*
In order to use SSE2 signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without loosing information.
The result of this multiplication is 32 bit and using SSE2 we get either the
product's hi or lo word.
Thus we will multiply the factors by the highest possible 2^n and take the
upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
Since the final result needs to be scaled by << 5 and also in in order to keep
the precision within the upper 16 bits we will also have to scale the RGB
values used in the multiplication by << 5+(16-n).
*/
/* r = y_r_buf[i]; */
r = _mm_load_si128(&y_r_buf[i]);
/* g = cb_g_buf[i]; */
g = _mm_load_si128(&cb_g_buf[i]);
/* b = cr_b_buf[i]; */
b = _mm_load_si128(&cr_b_buf[i]);
/* r<<6; g<<6; b<<6 */
r = _mm_slli_epi16(r, 6);
g = _mm_slli_epi16(g, 6);
b = _mm_slli_epi16(b, 6);
/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
y = _mm_mulhi_epi16(r, y_r);
y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
y = _mm_add_epi16(y, min);
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
_mm_between_epi16(y, min, max);
_mm_store_si128(&y_r_buf[i], y);
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
cb = _mm_mulhi_epi16(r, cb_r);
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
_mm_between_epi16(cb, min, max);
_mm_store_si128(&cb_g_buf[i], cb);
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
cr = _mm_mulhi_epi16(r, cr_r);
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
_mm_between_epi16(cr, min, max);
_mm_store_si128(&cr_b_buf[i], cr);
}
}
/* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */
/* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_sse2(INT16* buffer, const int buffer_size, const UINT32 factor)
@ -658,15 +489,11 @@ void rfx_init_sse2(RFX_CONTEXT* context)
{
DEBUG_RFX("Using SSE2 optimizations");
IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");
context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;
context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;
context->quantization_decode = rfx_quantization_decode_sse2;
context->quantization_encode = rfx_quantization_encode_sse2;
context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;

View File

@ -60,7 +60,7 @@ struct _RFX_CONTEXT_PRIV
PROFILER_DEFINE(prof_rfx_differential_decode);
PROFILER_DEFINE(prof_rfx_quantization_decode);
PROFILER_DEFINE(prof_rfx_dwt_2d_decode);
PROFILER_DEFINE(prof_rfx_decode_ycbcr_to_rgb);
PROFILER_DEFINE(prof_rfx_ycbcr_to_rgb);
PROFILER_DEFINE(prof_rfx_decode_format_rgb);
PROFILER_DEFINE(prof_rfx_encode_rgb);
@ -69,7 +69,7 @@ struct _RFX_CONTEXT_PRIV
PROFILER_DEFINE(prof_rfx_differential_encode);
PROFILER_DEFINE(prof_rfx_quantization_encode);
PROFILER_DEFINE(prof_rfx_dwt_2d_encode);
PROFILER_DEFINE(prof_rfx_encode_rgb_to_ycbcr);
PROFILER_DEFINE(prof_rfx_rgb_to_ycbcr);
PROFILER_DEFINE(prof_rfx_encode_format_rgb);
};

View File

@ -0,0 +1,81 @@
# FreeRDP: A Remote Desktop Protocol Client
# libfreerdp-primitives cmake build script
# vi:ts=4 sw=4:
#
# (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing permissions
# and limitations under the License.
#
set(FREERDP_PRIMITIVE_CFILES
prim_add.c
prim_andor.c
prim_alphaComp.c
prim_colors.c
prim_copy.c
prim_set.c
prim_shift.c
prim_sign.c
primitives.c
)
set(FREERDP_PRIMITIVE_HEADERS
prim_internal.h
)
set(FREERDP_PRIMITIVE_SRCS
${FREERDP_PRIMITIVE_CFILES}
${FREERDP_PRIMITIVE_HEADERS}
)
add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
### IPP Variable debugging
if(WITH_IPP)
if(CMAKE_COMPILER_IS_GNUCC)
foreach(INCLDIR ${IPP_INCLUDE_DIRS})
set(OPTIMIZATION "${OPTIMIZATION} -I${INCLDIR}")
endforeach(INCLDIR)
endif()
endif()
if(WITH_SSE2)
if(CMAKE_COMPILER_IS_GNUCC)
set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3 -Wdeclaration-after-statement")
endif()
if(MSVC)
set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2")
endif()
elseif(WITH_NEON)
if(CMAKE_COMPILER_IS_GNUCC)
set(OPTIMZATION "${OPTIMIZATION} -mfpu=neon -mfloat-abi=softfp")
endif()
# TODO: Add MSVC equivalent
endif()
set_property(SOURCE ${FREERDP_PRIMITIVE_CFILES} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
if(WITH_MONOLITHIC_BUILD)
add_library(freerdp-primitives OBJECT ${FREERDP_PRIMITIVE_SRCS})
else()
add_library(freerdp-primitives ${FREERDP_PRIMITIVE_SRCS})
endif()
set_target_properties(freerdp-primitives PROPERTIES VERSION ${FREERDP_VERSION_FULL} SOVERSION ${FREERDP_VERSION} PREFIX "lib")
#set(FREERDP_PRIMITIVE_LIBS, rt)
if(WITH_MONOLITHIC_BUILD)
set(FREERDP_LIBS ${FREERDP_LIBS} ${FREERDP_PRIMITIVE_LIBS} PARENT_SCOPE)
else()
target_link_libraries(freerdp-primitives ${FREERDP_PRIMITIVE_LIBS})
install(TARGETS freerdp-primitives DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
if(BUILD_TESTING)
add_subdirectory(test)
endif(BUILD_TESTING)

View File

@ -0,0 +1,113 @@
The Primitives Library
Introduction
------------
The purpose of the primitives library is to give the freerdp code easy
access to *run-time* optimization via SIMD operations. When the library
is initialized, dynamic checks of processor features are run (such as
the support of SSE3 or Neon), and entrypoints are linked to through
function pointers to provide the fastest possible operations. All
routines offer generic C alternatives as fallbacks.
Run-time optimization has the advantage of allowing a single executable
to run fast on multiple platforms with different SIMD capabilities.
Use In Code
-----------
A singleton pointing to a structure containing the function pointers
is accessed through primitives_get(). The function pointers can then
be used from that structure, e.g.
primitives_t *prims = primitives_get();
prims->shiftC_16s(buffer, shifts, buffer, 256);
Of course, there is some overhead in calling through the function pointer
and setting up the SIMD operations, so it would be counterproductive to
call the primitives library for very small operation, e.g. initializing an
array of eight values to a constant. The primitives library is intended
for larger-scale operations, e.g. arrays of size 64 and larger.
Initialization and Cleanup
--------------------------
Library initialization is done the first time primitives_init() is called
or the first time primitives_get() is used. Cleanup (if any) is done by
primitives_deinit().
Intel Integrated Performance Primitives (IPP)
---------------------------------------------
If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
calls will be used (where available) to fill the function pointers.
Where possible, function names and parameter lists match IPP format so
that the IPP functions can be plugged into the function pointers without
a wrapper layer. Use of IPP is completely optional, and in many cases
the SSE operations in the primitives library itself are faster or similar
in performance.
Coverage
--------
The primitives library is not meant to be comprehensive, offering
entrypoints for every operation and operand type. Instead, the coverage
is focused on operations known to be performance bottlenecks in the code.
For instance, 16-bit signed operations are used widely in the RemoteFX
software, so you'll find 16s versions of several operations, but there
is no attempt to provide (unused) copies of the same code for 8u, 16u,
32s, etc.
New Optimizations
-----------------
As the need arises, new optimizations can be added to the library,
including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
The initialization routine is free to do any quick run-time test to
determine which features are available before hooking the operation's
function pointer, or it can simply look at the processor features list
from the hints passed to the initialization routine.
Adding Entrypoints
------------------
As the need for new operations or operands arises, new entrypoints can
be added.
1) Function prototypes and pointers are added to
include/freerdp/primitives.h
2) New module initialization and cleanup function prototypes are added
to prim_internal.h and called in primitives.c (primitives_init()
and primitives_deinit()).
3) Operation names and parameter lists should be compatible with the IPP.
IPP manuals are available online at software.intel.com.
4) A generic C entrypoint must be available as a fallback.
5) prim_templates.h contains macro-based templates for simple operations,
such as applying a single SSE operation to arrays of data.
The template functions can frequently be used to extend the
operations without writing a lot of new code.
Flags
-----
The entrypoint primitives_get_flags() returns a bitfield of processor flags
(as defined in primitives.h) and primitives_flag_str() returns a string
related to those processor flags, for debugging and information. The
bitfield can be used elsewhere in the code as needed.
Cache Management
----------------
I haven't found a lot of speed improvement by attempting prefetch, and
in fact it seems to have a negative impact in some cases. Done correctly
perhaps the routines could be further accelerated by proper use of prefetch,
fences, etc.
Testing
-------
In the test subdirectory is an executable (prim_test) that tests both
functionality and speed of primitives library operations. Any new
modules should be added to that test, following the conventions already
established in that directory. The program can be executed on various
target hardware to compare generic C, optimized, and IPP performance
with various array sizes.

View File

@ -0,0 +1,81 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Add operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
# include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
/* ----------------------------------------------------------------------------
* 16-bit signed add with saturation (under and over).
*/
PRIM_STATIC pstatus_t general_add_16s(
const INT16 *pSrc1,
const INT16 *pSrc2,
INT16 *pDst,
INT32 len)
{
while (len--)
{
INT32 k = (INT32) (*pSrc1++) + (INT32) (*pSrc2++);
if (k > 32767) *pDst++ = ((INT16) 32767);
else if (k < -32768) *pDst++ = ((INT16) -32768);
else *pDst++ = (INT16) k;
}
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_add(
const primitives_hints_t *hints,
primitives_t *prims)
{
prims->add_16s = general_add_16s;
#ifdef WITH_IPP
prims->add_16s = (__add_16s_t) ippsAdd_16s;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->add_16s = sse3_add_16s;
}
#endif
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_add(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,298 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Alpha blending routines.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Note: this code assumes the second operand is fully opaque,
* e.g.
* newval = alpha1*val1 + (1-alpha1)*val2
* rather than
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
* The IPP gives other options.
*/
#include <config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#ifdef WITH_SSE2
# include <emmintrin.h>
# include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ippi.h>
#endif /* WITH_IPP */
#define ALPHA(_k_) (((_k_) & 0xFF000000U) >> 24)
#define RED(_k_) (((_k_) & 0x00FF0000U) >> 16)
#define GRN(_k_) (((_k_) & 0x0000FF00U) >> 8)
#define BLU(_k_) (((_k_) & 0x000000FFU))
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
UINT32 *dptr = (UINT32 *) pDst;
int linebytes = width * sizeof(UINT32);
int src1Jump = (src1Step - linebytes) / sizeof(UINT32);
int src2Jump = (src2Step - linebytes) / sizeof(UINT32);
int dstJump = (dstStep - linebytes) / sizeof(UINT32);
int y;
for (y=0; y<height; y++)
{
int x;
for (x=0; x<width; x++)
{
const UINT32 src1 = *sptr1++;
const UINT32 src2 = *sptr2++;
UINT32 alpha = ALPHA(src1) + 1;
if (alpha == 256)
{
/* If alpha is 255+1, just copy src1. */
*dptr++ = src1;
}
else if (alpha <= 1)
{
/* If alpha is 0+1, just copy src2. */
*dptr++ = src2;
}
else
{
/* A perfectly accurate blend would do (a*src + (255-a)*dst)/255
* rather than adding one to alpha and dividing by 256, but this
* is much faster and only differs by one 16% of the time.
* I'm not sure who first designed the double-ops trick
* (Red Blue and Alpha Green).
*/
UINT32 rb, ag;
UINT32 s2rb = src2 & 0x00FF00FFU;
UINT32 s2ag = (src2 >> 8) & 0x00FF00FFU;
UINT32 s1rb = src1 & 0x00FF00FFU;
UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU;
UINT32 drb = s1rb - s2rb;
UINT32 dag = s1ag - s2ag;
drb *= alpha;
dag *= alpha;
rb = ((drb >> 8) + s2rb) & 0x00FF00FFU;
ag = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U;
*dptr++ = rb | ag;
}
}
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
UINT32 *dptr;
int linebytes, src1Jump, src2Jump, dstJump, y;
__m128i xmm0, xmm1;
if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
if (width < 4) /* pointless if too small */
{
return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, width, height);
}
dptr = (UINT32 *) pDst;
linebytes = width * sizeof(UINT32);
src1Jump = (src1Step - linebytes) / sizeof(UINT32);
src2Jump = (src2Step - linebytes) / sizeof(UINT32);
dstJump = (dstStep - linebytes) / sizeof(UINT32);
xmm0 = _mm_set1_epi32(0);
xmm1 = _mm_set1_epi16(1);
for (y=0; y<height; ++y)
{
int pixels = width;
int count;
/* Get to the 16-byte boundary now. */
int leadIn = 0;
switch ((ULONG_PTR) dptr & 0x0f)
{
case 0:
leadIn = 0;
break;
case 4:
leadIn = 3;
break;
case 8:
leadIn = 2;
break;
case 12:
leadIn = 1;
break;
default:
/* We'll never hit a 16-byte boundary, so do the whole
* thing the slow way.
*/
leadIn = width;
break;
}
if (leadIn)
{
general_alphaComp_argb((const BYTE *) sptr1,
src1Step, (const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, leadIn, 1);
sptr1 += leadIn;
sptr2 += leadIn;
dptr += leadIn;
pixels -= leadIn;
}
/* Use SSE registers to do 4 pixels at a time. */
count = pixels >> 2;
pixels -= count << 2;
while (count--)
{
__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
/* subtract */
xmm6 = _mm_subs_epi16(xmm4, xmm5);
/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
/* Add one to alphas */
xmm4 = _mm_adds_epi16(xmm4, xmm1);
/* Multiply and take low word */
xmm4 = _mm_mullo_epi16(xmm4, xmm6);
/* Shift 8 right */
xmm4 = _mm_srai_epi16(xmm4, 8);
/* Add xmm5 */
xmm4 = _mm_adds_epi16(xmm4, xmm5);
/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
/* subtract */
xmm7 = _mm_subs_epi16(xmm5, xmm6);
/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
/* Add one to alphas */
xmm5 = _mm_adds_epi16(xmm5, xmm1);
/* Multiply and take low word */
xmm5 = _mm_mullo_epi16(xmm5, xmm7);
/* Shift 8 right */
xmm5 = _mm_srai_epi16(xmm5, 8);
/* Add xmm6 */
xmm5 = _mm_adds_epi16(xmm5, xmm6);
/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
/* Must mask off remainders or pack gets confused */
xmm3 = _mm_set1_epi16(0x00ffU);
xmm4 = _mm_and_si128(xmm4, xmm3);
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
_mm_store_si128((__m128i *) dptr, xmm5); dptr += 4;
}
/* Finish off the remainder. */
if (pixels)
{
general_alphaComp_argb((const BYTE *) sptr1, src1Step,
(const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, pixels, 1);
sptr1 += pixels;
sptr2 += pixels;
dptr += pixels;
}
/* Jump to next row. */
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ipp_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
IppiSize sz;
sz.width = width;
sz.height = height;
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, sz, ippAlphaOver);
}
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_alphaComp(
const primitives_hints_t *hints,
primitives_t *prims)
{
prims->alphaComp_argb = general_alphaComp_argb;
#ifdef WITH_IPP
prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->alphaComp_argb = sse2_alphaComp_argb;
}
#endif
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_alphaComp(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,94 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Logical operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
# include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
/* ----------------------------------------------------------------------------
* 32-bit AND with a constant.
*/
PRIM_STATIC pstatus_t general_andC_32u(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
INT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
while (len--) *pDst++ = *pSrc++ & val;
return PRIMITIVES_SUCCESS;
}
/* ----------------------------------------------------------------------------
* 32-bit OR with a constant.
*/
PRIM_STATIC pstatus_t general_orC_32u(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
INT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
while (len--) *pDst++ = *pSrc++ | val;
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
_mm_and_si128, *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
_mm_or_si128, *dptr++ = *sptr++ | val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_andor(
const primitives_hints_t *hints,
primitives_t *prims)
{
/* Start with the default. */
prims->andC_32u = general_andC_32u;
prims->orC_32u = general_orC_32u;
#if defined(WITH_IPP)
prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
prims->orC_32u = (__orC_32u_t) ippsOrC_32u;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->andC_32u = sse3_andC_32u;
prims->orC_32u = sse3_orC_32u;
}
#endif
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_andor(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,742 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Color conversion operations.
* vi:ts=4 sw=4:
*
* Copyright 2011 Stephen Erisman
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
* Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#elif WITH_NEON
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */
#include "prim_internal.h"
#include "prim_templates.h"
#ifndef MINMAX
#define MINMAX(_v_, _l_, _h_) \
((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
#endif /* !MINMAX */
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi) /* region of interest */
{
/**
* The decoded YCbCr coeffectients are represented as 11.5 fixed-point
* numbers:
*
* 1 sign bit + 10 integer bits + 5 fractional bits
*
* However only 7 integer bits will be actually used since the value range
* is [-128.0, 127.0]. In other words, the decoded coefficients are scaled
* by << 5 when interpreted as INT16.
* It was scaled in the quantization phase, so we must scale it back here.
*/
const INT16 *yptr = pSrc[0];
const INT16 *cbptr = pSrc[1];
const INT16 *crptr = pSrc[2];
INT16 *rptr = pDst[0];
INT16 *gptr = pDst[1];
INT16 *bptr = pDst[2];
int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
int y;
for (y=0; y<roi->height; y++)
{
int x;
for (x=0; x<roi->width; ++x)
{
/* INT32 is used intentionally because we calculate
* with shifted factors!
*/
INT32 y = (INT32) (*yptr++);
INT32 cb = (INT32) (*cbptr++);
INT32 cr = (INT32) (*crptr++);
INT32 r,g,b;
/*
* This is the slow floating point version kept here for reference.
* y = y + 4096; // 128<<5=4096 so that we can scale the sum by>>5
* r = y + cr*1.403f;
* g = y - cb*0.344f - cr*0.714f;
* b = y + cb*1.770f;
* y_r_buf[i] = MINMAX(r>>5, 0, 255);
* cb_g_buf[i] = MINMAX(g>>5, 0, 255);
* cr_b_buf[i] = MINMAX(b>>5, 0, 255);
*/
/*
* We scale the factors by << 16 into 32-bit integers in order to
* avoid slower floating point multiplications. Since the final
* result needs to be scaled by >> 5 we will extract only the
* upper 11 bits (>> 21) from the final sum.
* Hence we also have to scale the other terms of the sum by << 16.
* R: 1.403 << 16 = 91947
* G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
* B: 1.770 << 16 = 115998
*/
y = (y+4096)<<16;
r = y + cr*91947;
g = y - cb*22544 - cr*46792;
b = y + cb*115998;
*rptr++ = MINMAX(r>>21, 0, 255);
*gptr++ = MINMAX(g>>21, 0, 255);
*bptr++ = MINMAX(b>>21, 0, 255);
}
yptr += srcbump;
cbptr += srcbump;
crptr += srcbump;
rptr += dstbump;
gptr += dstbump;
bptr += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi) /* region of interest */
{
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
* numbers:
*
* 1 sign bit + 10 integer bits + 5 fractional bits
*
* However only 7 integer bits will be actually used since the value
* range is [-128.0, 127.0]. In other words, the encoded coefficients
* is scaled by << 5 when interpreted as INT16.
* It will be scaled down to original during the quantization phase.
*/
const INT16 *rptr = pSrc[0];
const INT16 *gptr = pSrc[1];
const INT16 *bptr = pSrc[2];
INT16 *yptr = pDst[0];
INT16 *cbptr = pDst[1];
INT16 *crptr = pDst[2];
int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
int y;
for (y=0; y<roi->height; y++)
{
int x;
for (x=0; x<roi->width; ++x)
{
/* INT32 is used intentionally because we calculate with
* shifted factors!
*/
INT32 r = (INT32) (*rptr++);
INT32 g = (INT32) (*gptr++);
INT32 b = (INT32) (*bptr++);
/* We scale the factors by << 15 into 32-bit integers in order
* to avoid slower floating point multiplications. Since the
* terms need to be scaled by << 5 we simply scale the final
* sum by >> 10
*
* Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235,
* 0.114000 << 15 = 3735
* Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868,
* 0.500590 << 15 = 16403
* Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714,
* 0.081282 << 15 = 2663
*/
INT32 y = (r * 9798 + g * 19235 + b * 3735) >> 10;
INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
*yptr++ = (INT16) MINMAX(y - 4096, -4096, 4095);
*cbptr++ = (INT16) MINMAX(cb, -4096, 4095);
*crptr++ = (INT16) MINMAX(cr, -4096, 4095);
}
yptr += srcbump;
cbptr += srcbump;
crptr += srcbump;
rptr += dstbump;
gptr += dstbump;
bptr += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
int srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
int dstStep, /* bytes between rows in dest data */
const prim_size_t *roi) /* region of interest */
{
const INT16 *r = pSrc[0];
const INT16 *g = pSrc[1];
const INT16 *b = pSrc[2];
BYTE *dst = pDst;
int x,y;
int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
int dstbump = (dstStep - (roi->width * sizeof(UINT32)));
for (y=0; y<roi->height; ++y)
{
for (x=0; x<roi->width; ++x)
{
*dst++ = (BYTE) (*b++);
*dst++ = (BYTE) (*g++);
*dst++ = (BYTE) (*r++);
*dst++ = ((BYTE) (0xFFU));
}
dst += dstbump;
r += srcbump;
g += srcbump;
b += srcbump;
}
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
#ifdef __GNUC__
# define GNU_INLINE \
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
# define GNU_INLINE
#endif
#define CACHE_LINE_BYTES 64
#define _mm_between_epi16(_val, _min, _max) \
do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
static inline void GNU_INLINE _mm_prefetch_buffer(
char * buffer,
int num_bytes)
{
__m128i * buf = (__m128i*) buffer;
unsigned int i;
for (i = 0; i < (num_bytes / sizeof(__m128i));
i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
}
}
#endif /* DO_PREFETCH */
/*---------------------------------------------------------------------------*/
PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
int srcbump, dstbump, yp, imax;
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|| (roi->width & 0x07)
|| (srcStep & 127)
|| (dstStep & 127))
{
/* We can't maintain 16-byte alignment. */
return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
}
zero = _mm_setzero_si128();
max = _mm_set1_epi16(255);
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
r_buf = (__m128i*) (pDst[0]);
g_buf = (__m128i*) (pDst[1]);
b_buf = (__m128i*) (pDst[2]);
r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
c4096 = _mm_set1_epi16(4096);
srcbump = srcStep / sizeof(__m128i);
dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH
/* Prefetch Y's, Cb's, and Cr's. */
for (yp=0; yp<roi->height; yp++)
{
int i;
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
}
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/* In order to use SSE2 signed 16-bit integer multiplication
* we need to convert the floating point factors to signed int
* without losing information.
* The result of this multiplication is 32 bit and we have two
* SSE instructions that return either the hi or lo word.
* Thus we will multiply the factors by the highest possible 2^n,
* take the upper 16 bits of the signed 32-bit result
* (_mm_mulhi_epi16) and correct this result by multiplying
* it by 2^(16-n).
*
* For the given factors in the conversion matrix the best
* possible n is 14.
*
* Example for calculating r:
* r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
* r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
* r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y, cb, cr, r, g, b;
y = _mm_load_si128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
cb = _mm_load_si128(cb_buf + i);
/* cr = cr_b_buf[i]; */
cr = _mm_load_si128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
_mm_between_epi16(r, zero, max);
_mm_store_si128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
_mm_between_epi16(g, zero, max);
_mm_store_si128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
_mm_between_epi16(b, zero, max);
_mm_store_si128(b_buf + i, b);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
* numbers. See the general code above.
*/
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
int srcbump, dstbump, yp, imax;
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|| (roi->width & 0x07)
|| (srcStep & 127)
|| (dstStep & 127))
{
/* We can't maintain 16-byte alignment. */
return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
}
min = _mm_set1_epi16(-128 << 5);
max = _mm_set1_epi16(127 << 5);
r_buf = (__m128i*) (pSrc[0]);
g_buf = (__m128i*) (pSrc[1]);
b_buf = (__m128i*) (pSrc[2]);
y_buf = (__m128i*) (pDst[0]);
cb_buf = (__m128i*) (pDst[1]);
cr_buf = (__m128i*) (pDst[2]);
y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
srcbump = srcStep / sizeof(__m128i);
dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH
/* Prefetch RGB's. */
for (yp=0; yp<roi->height; yp++)
{
int i;
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
}
r_buf += srcbump;
g_buf += srcbump;
b_buf += srcbump;
}
r_buf = (__m128i*) (pSrc[0]);
g_buf = (__m128i*) (pSrc[1]);
b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/* In order to use SSE2 signed 16-bit integer multiplication we
* need to convert the floating point factors to signed int
* without loosing information. The result of this multiplication
* is 32 bit and using SSE2 we get either the product's hi or lo
* word. Thus we will multiply the factors by the highest
* possible 2^n and take the upper 16 bits of the signed 32-bit
* result (_mm_mulhi_epi16). Since the final result needs to
* be scaled by << 5 and also in in order to keep the precision
* within the upper 16 bits we will also have to scale the RGB
* values used in the multiplication by << 5+(16-n).
*/
__m128i r, g, b, y, cb, cr;
r = _mm_load_si128(y_buf+i);
g = _mm_load_si128(g_buf+i);
b = _mm_load_si128(b_buf+i);
/* r<<6; g<<6; b<<6 */
r = _mm_slli_epi16(r, 6);
g = _mm_slli_epi16(g, 6);
b = _mm_slli_epi16(b, 6);
/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
y = _mm_mulhi_epi16(r, y_r);
y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
y = _mm_add_epi16(y, min);
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
_mm_between_epi16(y, min, max);
_mm_store_si128(y_buf+i, y);
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
cb = _mm_mulhi_epi16(r, cb_r);
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
_mm_between_epi16(cb, min, max);
_mm_store_si128(cb_buf+i, cb);
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
cr = _mm_mulhi_epi16(r, cr_r);
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
_mm_between_epi16(cr, min, max);
_mm_store_si128(cr_buf+i, cr);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
#define LOAD128(_src_) \
_mm_load_si128((__m128i *) _src_)
#define STORE128(_dst_, _src_) \
_mm_store_si128((__m128i *) _dst_, _src_)
#define PUNPCKLBW(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
#define PUNPCKHBW(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
#define PUNPCKLWD(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
#define PUNPCKHWD(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
#define PACKUSWB(_dst_, _src_) \
_dst_ = _mm_packus_epi16(_dst_, _src_)
#define PREFETCH(_ptr_) \
_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
#define XMM_ALL_ONES \
_mm_set1_epi32(0xFFFFFFFFU)
PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */
const prim_size_t *roi) /* region of interest */
{
const UINT16 *r = (const UINT16 *) (pSrc[0]);
const UINT16 *g = (const UINT16 *) (pSrc[1]);
const UINT16 *b = (const UINT16 *) (pSrc[2]);
BYTE *out;
int srcbump, dstbump, y;
/* Ensure 16-byte alignment on all pointers,
* that width is a multiple of 8,
* and that the next row will also remain aligned.
* Since this is usually used for 64x64 aligned arrays,
* these checks should presumably pass.
*/
if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
|| (((ULONG_PTR) pDst & 0x0f) != 0)
|| (roi->width & 0x0f)
|| (srcStep & 0x0f)
|| (dstStep & 0x0f))
{
return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
}
out = (BYTE *) pDst;
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
for (y=0; y<roi->height; ++y)
{
int width = roi->width;
do {
__m128i R0, R1, R2, R3, R4;
/* The comments below pretend these are 8-byte registers
* rather than 16-byte, for readability.
*/
R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */
R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */
PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */
R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */
R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */
PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */
PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */
R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */
R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */
PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */
R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */
PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */
R0 = R4; /* R0 = R4 */
PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */
PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */
R2 = R3; /* R2 = R3 */
PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */
PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */
STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */
STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */
STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */
STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */
} while (width -= 16);
/* Jump to next row. */
r += srcbump;
g += srcbump;
b += srcbump;
out += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
/* TODO: If necessary, check alignments and call the general version. */
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t y_add = vdupq_n_s16(128);
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
int16x8_t* r_buf = (int16x8_t*) pDst[0];
int16x8_t* g_buf = (int16x8_t*) pDst[1];
int16x8_t* b_buf = (int16x8_t*) pDst[2];
int srcbump = srcStep / sizeof(int16x8_t);
int dstbump = dstStep / sizeof(int16x8_t);
int yp;
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
int16x8_t y = vld1q_s16((INT16*) (y_buf+i));
y = vaddq_s16(y, y_add);
int16x8_t cr = vld1q_s16((INT16*) (cr_buf+i));
/* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)),
* 0, 255);
*/
int16x8_t r = vaddq_s16(y, cr);
r = vaddq_s16(r, vshrq_n_s16(cr, 2));
r = vaddq_s16(r, vshrq_n_s16(cr, 3));
r = vaddq_s16(r, vshrq_n_s16(cr, 5));
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*) (r_buf+i), r);
/* cb = cb_g_buf[i]; */
int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i));
/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1)
* - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
*/
int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
g = vsubq_s16(g, vshrq_n_s16(cb, 4));
g = vsubq_s16(g, vshrq_n_s16(cb, 5));
g = vsubq_s16(g, vshrq_n_s16(cr, 1));
g = vsubq_s16(g, vshrq_n_s16(cr, 3));
g = vsubq_s16(g, vshrq_n_s16(cr, 4));
g = vsubq_s16(g, vshrq_n_s16(cr, 5));
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*) (g_buf+i), g);
/* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)),
* 0, 255);
*/
int16x8_t b = vaddq_s16(y, cb);
b = vaddq_s16(b, vshrq_n_s16(cb, 1));
b = vaddq_s16(b, vshrq_n_s16(cb, 2));
b = vaddq_s16(b, vshrq_n_s16(cb, 6));
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*) (b_buf+i), b);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
}
#endif /* WITH_NEON */
/* I don't see a direct IPP version of this, since the input is INT16
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
* But that would likely be slower.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_colors(
const primitives_hints_t *hints,
primitives_t *prims)
{
prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
#if defined(WITH_SSE2)
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
{
prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
}
#elif defined(WITH_NEON)
if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
{
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
}
#endif /* WITH_SSE2 */
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_colors(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,177 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_IPP
# include <ipps.h>
# include <ippi.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
/* ------------------------------------------------------------------------- */
/*static inline BOOL memory_regions_overlap_1d(*/
static BOOL memory_regions_overlap_1d(
const BYTE *p1,
const BYTE *p2,
size_t bytes)
{
const ULONG_PTR p1m = (const ULONG_PTR) p1;
const ULONG_PTR p2m = (const ULONG_PTR) p2;
if (p1m <= p2m)
{
if (p1m + bytes > p2m) return TRUE;
}
else
{
if (p2m + bytes > p1m) return TRUE;
}
/* else */
return FALSE;
}
/* ------------------------------------------------------------------------- */
/*static inline BOOL memory_regions_overlap_2d( */
static BOOL memory_regions_overlap_2d(
const BYTE *p1, int p1Step, int p1Size,
const BYTE *p2, int p2Step, int p2Size,
int width, int height)
{
ULONG_PTR p1m = (ULONG_PTR) p1;
ULONG_PTR p2m = (ULONG_PTR) p2;
if (p1m <= p2m)
{
ULONG_PTR p1mEnd = p1m + (height-1)*p1Step + width*p1Size;
if (p1mEnd > p2m) return TRUE;
}
else
{
ULONG_PTR p2mEnd = p2m + (height-1)*p2Step + width*p2Size;
if (p2mEnd > p1m) return TRUE;
}
/* else */
return FALSE;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_copy_8u(
const BYTE *pSrc,
BYTE *pDst,
INT32 len)
{
if (memory_regions_overlap_1d(pSrc, pDst, (size_t) len))
{
memmove((void *) pDst, (const void *) pSrc, (size_t) len);
}
else
{
memcpy((void *) pDst, (const void *) pSrc, (size_t) len);
}
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Copy a block of pixels from one buffer to another.
* The addresses are assumed to have been already offset to the upper-left
* corners of the source and destination region of interest.
*/
PRIM_STATIC pstatus_t general_copy_8u_AC4r(
const BYTE *pSrc, INT32 srcStep,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
primitives_t *prims = primitives_get();
const BYTE *src = (const BYTE *) pSrc;
BYTE *dst = (BYTE *) pDst;
int rowbytes = width * sizeof(UINT32);
if ((width == 0) || (height == 0)) return PRIMITIVES_SUCCESS;
if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32),
pDst, dstStep, sizeof(UINT32), width, height))
{
do {
prims->copy(src, dst, rowbytes);
src += srcStep;
dst += dstStep;
} while (--height);
}
else
{
/* TODO: do it in one operation when the rowdata is adjacent. */
do {
/* If we find a replacement for memcpy that is consistently
* faster, this could be replaced with that.
*/
memcpy(dst, src, rowbytes);
src += srcStep;
dst += dstStep;
} while (--height);
}
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
/* This is just ippiCopy_8u_AC4R without the IppiSize structure parameter. */
static pstatus_t ippiCopy_8u_AC4r(
const BYTE *pSrc, INT32 srcStep,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
IppiSize roi;
roi.width = width;
roi.height = height;
return (pstatus_t) ippiCopy_8u_AC4R(pSrc, srcStep, pDst, dstStep, roi);
}
#endif /* WITH_IPP */
/* ------------------------------------------------------------------------- */
void primitives_init_copy(
const primitives_hints_t *hints,
primitives_t *prims)
{
/* Start with the default. */
prims->copy_8u = general_copy_8u;
prims->copy_8u_AC4r = general_copy_8u_AC4r;
/* Pick tuned versions if possible. */
#ifdef WITH_IPP
prims->copy_8u = (__copy_8u_t) ippsCopy_8u;
prims->copy_8u_AC4r = (__copy_8u_AC4r_t) ippiCopy_8u_AC4r;
#endif
/* Performance with an SSE2 version with no prefetch seemed to be
* all over the map vs. memcpy.
* Sometimes it was significantly faster, sometimes dreadfully slower,
* and it seemed to vary a lot depending on block size and processor.
* Hence, no SSE version is used here unless once can be written that
* is consistently faster than memcpy.
*/
/* This is just an alias with void* parameters */
prims->copy = (__copy_t) (prims->copy_8u);
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_copy(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,105 @@
/* prim_internal.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_INTERNAL_H_INCLUDED__
#define __PRIM_INTERNAL_H_INCLUDED__
#ifndef CMAKE_BUILD_TYPE
#define CMAKE_BUILD_TYPE Release
#endif
#include <freerdp/primitives.h>
/* Normally the internal entrypoints should be static, but a benchmark
* program may want to access them directly and turn this off.
*/
#ifndef PRIM_STATIC
# define PRIM_STATIC static
#else
# undef PRIM_STATIC
# define PRIM_STATIC
#endif /* !PRIM_STATIC */
/* Use lddqu for unaligned; load for 16-byte aligned. */
#define LOAD_SI128(_ptr_) \
(((ULONG_PTR) (_ptr_) & 0x0f) \
? _mm_lddqu_si128((__m128i *) (_ptr_)) \
: _mm_load_si128((__m128i *) (_ptr_)))
/* This structure can (eventually) be used to provide hints to the
* initialization routines, e.g. whether SSE2 or NEON or IPP instructions
* or calls are available.
*/
typedef struct
{
UINT32 x86_flags;
UINT32 arm_flags;
} primitives_hints_t;
/* Function prototypes for all the init/deinit routines. */
extern void primitives_init_copy(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_copy(
primitives_t *prims);
extern void primitives_init_set(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_set(
primitives_t *prims);
extern void primitives_init_add(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_add(
primitives_t *prims);
extern void primitives_init_andor(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_andor(
primitives_t *prims);
extern void primitives_init_shift(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_shift(
primitives_t *prims);
extern void primitives_init_sign(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_sign(
primitives_t *prims);
extern void primitives_init_alphaComp(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_alphaComp(
primitives_t *prims);
extern void primitives_init_colors(
const primitives_hints_t *hints,
primitives_t *prims);
extern void primitives_deinit_colors(
primitives_t *prims);
#endif /* !__PRIM_INTERNAL_H_INCLUDED__ */

View File

@ -0,0 +1,309 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#include <config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
/* ========================================================================= */
PRIM_STATIC pstatus_t general_set_8u(
BYTE val,
BYTE *pDst,
INT32 len)
{
memset((void *) pDst, (int) val, (size_t) len);
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_zero(
void *pDst,
size_t len)
{
memset(pDst, 0, len);
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_set_8u(
BYTE val,
BYTE *pDst,
INT32 len)
{
BYTE byte, *dptr;
__m128i xmm0;
size_t count;
if (len < 16) return general_set_8u(val, pDst, len);
byte = val;
dptr = (BYTE *) pDst;
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
*dptr++ = byte;
if (--len == 0) return PRIMITIVES_SUCCESS;
}
xmm0 = _mm_set1_epi8(byte);
/* Cover 256-byte chunks via SSE register stores. */
count = len >> 8;
len -= count << 8;
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
}
/* Cover 16-byte chunks via SSE register stores. */
count = len >> 4;
len -= count << 4;
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
}
/* Do leftover bytes. */
while (len--) *dptr++ = byte;
return PRIMITIVES_SUCCESS;
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
/* ========================================================================= */
PRIM_STATIC pstatus_t general_set_32s(
INT32 val,
INT32 *pDst,
INT32 len)
{
INT32 *dptr = (INT32 *) pDst;
size_t span, remaining;
primitives_t *prims;
if (len < 256)
{
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* else quadratic growth memcpy algorithm */
span = 1;
*dptr = val;
remaining = len - 1;
prims = primitives_get();
while (remaining)
{
size_t thiswidth = span;
if (thiswidth > remaining) thiswidth = remaining;
prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2);
remaining -= thiswidth;
span <<= 1;
}
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
{
UINT32 *dptr = (UINT32 *) pDst;
size_t span, remaining;
primitives_t *prims;
if (len < 256)
{
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* else quadratic growth memcpy algorithm */
span = 1;
*dptr = val;
remaining = len - 1;
prims = primitives_get();
while (remaining)
{
size_t thiswidth = span;
if (thiswidth > remaining) thiswidth = remaining;
prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2);
remaining -= thiswidth;
span <<= 1;
}
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
{
UINT32 *dptr = (UINT32 *) pDst;
__m128i xmm0;
size_t count;
/* If really short, just do it here. */
if (len < 32)
{
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* Assure we can reach 16-byte alignment. */
if (((ULONG_PTR) dptr & 0x03) != 0)
{
return general_set_32u(val, pDst, len);
}
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
*dptr++ = val;
if (--len == 0) return PRIMITIVES_SUCCESS;
}
xmm0 = _mm_set1_epi32(val);
/* Cover 256-byte chunks via SSE register stores. */
count = len >> 6;
len -= count << 6;
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
}
/* Cover 16-byte chunks via SSE register stores. */
count = len >> 2;
len -= count << 2;
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
}
/* Do leftover bytes. */
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t sse2_set_32s(
INT32 val,
INT32 *pDst,
INT32 len)
{
UINT32 uval = *((UINT32 *) &val);
return sse2_set_32u(uval, (UINT32 *) pDst, len);
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ipp_wrapper_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
{
/* A little type conversion, then use the signed version. */
INT32 sval = *((INT32 *) &val);
return ippsSet_32s(sval, (INT32 *) pDst, len);
}
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_set(
const primitives_hints_t *hints,
primitives_t *prims)
{
/* Start with the default. */
prims->set_8u = general_set_8u;
prims->set_32s = general_set_32s;
prims->set_32u = general_set_32u;
prims->zero = general_zero;
/* Pick tuned versions if possible. */
#ifdef WITH_IPP
prims->set_8u = (__set_8u_t) ippsSet_8u;
prims->set_32s = (__set_32s_t) ippsSet_32s;
prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
prims->zero = (__zero_t) ippsZero_8u;
#elif defined(WITH_SSE2)
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
{
prims->set_8u = sse2_set_8u;
prims->set_32s = sse2_set_32s;
prims->set_32u = sse2_set_32u;
}
#endif
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_set(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,165 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
# include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_lShiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
INT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
while (len--) *pDst++ = *pSrc++ << val;
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_rShiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
INT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
while (len--) *pDst++ = *pSrc++ >> val;
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_lShiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
INT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
while (len--) *pDst++ = *pSrc++ << val;
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_rShiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
INT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
while (len--) *pDst++ = *pSrc++ >> val;
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_shiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
INT32 len)
{
primitives_t *prims;
if (val == 0) return PRIMITIVES_SUCCESS;
prims = primitives_get();
if (val < 0) return prims->rShiftC_16s(pSrc, -val, pDst, len);
else return prims->lShiftC_16s(pSrc, val, pDst, len);
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_shiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
INT32 len)
{
primitives_t *prims;
if (val == 0) return PRIMITIVES_SUCCESS;
prims = primitives_get();
if (val < 0) return prims->rShiftC_16u(pSrc, -val, pDst, len);
else return prims->lShiftC_16u(pSrc, val, pDst, len);
}
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_shift(
const primitives_hints_t *hints,
primitives_t *prims)
{
/* Start with the default. */
prims->lShiftC_16s = general_lShiftC_16s;
prims->rShiftC_16s = general_rShiftC_16s;
prims->lShiftC_16u = general_lShiftC_16u;
prims->rShiftC_16u = general_rShiftC_16u;
#if defined(WITH_IPP)
prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->lShiftC_16s = sse2_lShiftC_16s;
prims->rShiftC_16s = sse2_rShiftC_16s;
prims->lShiftC_16u = sse2_lShiftC_16u;
prims->rShiftC_16u = sse2_rShiftC_16u;
}
#endif
/* Wrappers */
prims->shiftC_16s = general_shiftC_16s;
prims->shiftC_16u = general_shiftC_16u;
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_shift(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,170 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Sign operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
# include <tmmintrin.h>
#endif /* WITH_SSE2 */
#include "prim_internal.h"
/* ----------------------------------------------------------------------------
* Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
*/
PRIM_STATIC pstatus_t general_sign_16s(
const INT16 *pSrc,
INT16 *pDst,
INT32 len)
{
while (len--)
{
INT16 src = *pSrc++;
*pDst++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
}
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ssse3_sign_16s(
const INT16 *pSrc,
INT16 *pDst,
INT32 len)
{
const INT16 *sptr = (const INT16 *) pSrc;
INT16 *dptr = (INT16 *) pDst;
size_t count;
if (len < 16)
{
return general_sign_16s(pSrc, pDst, len);
}
/* Check for 16-byte alignment (eventually). */
if ((ULONG_PTR) pDst & 0x01)
{
return general_sign_16s(pSrc, pDst, len);
}
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
INT16 src = *sptr++;
*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
if (--len == 0) return PRIMITIVES_SUCCESS;
}
/* Do 32-short chunks using 8 XMM registers. */
count = len >> 5; /* / 32 */
len -= count << 5; /* * 32 */
if ((ULONG_PTR) sptr & 0x0f)
{
/* Unaligned */
while (count--)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_set1_epi16(0x0001U);
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
}
}
else
{
/* Aligned */
while (count--)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_set1_epi16(0x0001U);
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
}
}
/* Do 8-short chunks using two XMM registers. */
count = len >> 3;
len -= count << 3;
while (count--)
{
__m128i xmm0 = _mm_set1_epi16(0x0001U);
__m128i xmm1 = LOAD_SI128(sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm1);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
}
/* Do leftovers. */
while (len--)
{
INT16 src = *sptr++;
*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
void primitives_init_sign(
const primitives_hints_t *hints,
primitives_t *prims)
{
/* Start with the default. */
prims->sign_16s = general_sign_16s;
/* Pick tuned versions if possible. */
/* I didn't spot an IPP version of this. */
#if defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->sign_16s = ssse3_sign_16s;
}
#endif
}
/* ------------------------------------------------------------------------- */
void primitives_deinit_sign(
primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,416 @@
/* prim_templates.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_TEMPLATES_H_INCLUDED__
#define __PRIM_TEMPLATES_H_INCLUDED__
/* These are prototypes for SSE (potentially NEON) routines that do a
* simple SSE operation over an array of data. Since so much of this
* code is shared except for the operation itself, these prototypes are
* used rather than duplicating code. The naming convention depends on
* the parameters: S=Source param; C=Constant; D=Destination.
* All the macros have parameters for a fallback procedure if the data
* is too small and an operation "the slow way" for use at 16-byte edges.
*/
/* SSE3 note: If someone needs to support an SSE2 version of these without
* SSE3 support, an alternative version could be added that merely checks
* that 16-byte alignment on both destination and source(s) can be
* achieved, rather than use LDDQU for unaligned reads.
*/
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
* It easily can't do that if the value is stored in a variable.
* So don't save it as an intermediate value.
*/
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
*/
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \
size_t count; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc, val, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 8 128-bit SSE registers. */ \
count = len >> (8-shifts); \
len -= count << (8-shifts); \
if ((ULONG_PTR) sptr & 0x0f) \
{ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
}
/* ----------------------------------------------------------------------------
* SCD = Source, Constant, Destination
* PRE = preload xmm0 with the constant.
*/
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \
size_t count; \
__m128i xmm0; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc, val, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7-shifts); \
len -= count << (7-shifts); \
xmm0 = _mm_set1_epi32(val); \
if ((ULONG_PTR) sptr & 0x0f) \
{ \
while (count--) \
{ \
__m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
while (count--) \
{ \
__m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
}
/* ----------------------------------------------------------------------------
* SSD = Source1, Source2, Destination
*/
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
const _type_ *sptr1 = pSrc1; \
const _type_ *sptr2 = pSrc2; \
_type_ *dptr = pDst; \
size_t count; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc1, pSrc2, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc1, pSrc2, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7-shifts); \
len -= count << (7-shifts); \
if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
{ \
/* Unaligned loads */ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
/* Aligned loads */ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm1 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm0, xmm1; \
xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm1); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
}
#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */

View File

@ -0,0 +1,297 @@
/* primitives.c
* This code queries processor features and calls the init/deinit routines.
* vi:ts=4 sw=4
*
* Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <string.h>
#include <stdlib.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#ifdef ANDROID
# include "cpu-features.h"
#endif
/* Singleton pointer used throughout the program when requested. */
static primitives_t *pPrimitives = NULL;
#define D_BIT_MMX (1<<23)
#define D_BIT_SSE (1<<25)
#define D_BIT_SSE2 (1<<26)
#define D_BIT_3DN (1<<30)
#define C_BIT_SSE3 (1<<0)
#define C_BIT_3DNP (1<<8)
#define C_BIT_SSSE3 (1<<9)
#define C_BIT_SSE41 (1<<19)
#define C_BIT_SSE42 (1<<20)
#define C_BIT_XGETBV (1<<27)
#define C_BIT_AVX (1<<28)
#define C_BITS_AVX (C_BIT_XGETBV|C_BIT_AVX)
#define E_BIT_XMM (1<<1)
#define E_BIT_YMM (1<<2)
#define E_BITS_AVX (E_BIT_XMM|E_BIT_YMM)
#define C_BIT_FMA (1<<11)
#define C_BIT_AVX_AES (1<<24)
/* If x86 */
#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) \
|| defined(__amd64__) || defined(_M_AMD64) || defined(_M_X64) \
|| defined(i386) || defined(__i386) || defined(__i386__) \
|| defined(_M_IX86) || defined(_X86_)
#ifndef i386
#define i386
#endif
/* If GCC */
# ifdef __GNUC__
# define xgetbv(_func_, _lo_, _hi_) \
__asm__ __volatile__ ("xgetbv" : "=a" (_lo_), "=d" (_hi_) : "c" (_func_))
static void cpuid(
unsigned info,
unsigned *eax,
unsigned *ebx,
unsigned *ecx,
unsigned *edx)
{
*eax = *ebx = *ecx = *edx = 0;
__asm volatile
(
/* The EBX (or RBX register on x86_64) is used for the PIC base address
* and must not be corrupted by our inline assembly.
*/
# if defined(__i386__)
"mov %%ebx, %%esi;"
"cpuid;"
"xchg %%ebx, %%esi;"
#else
"mov %%rbx, %%rsi;"
"cpuid;"
"xchg %%rbx, %%rsi;"
#endif
: "=a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx)
: "0" (info)
);
}
static void set_hints(
primitives_hints_t *hints)
{
unsigned a, b, c, d;
cpuid(1, &a, &b, &c, &d);
if (d & D_BIT_MMX) hints->x86_flags |= PRIM_X86_MMX_AVAILABLE;
if (d & D_BIT_SSE) hints->x86_flags |= PRIM_X86_SSE_AVAILABLE;
if (d & D_BIT_SSE2) hints->x86_flags |= PRIM_X86_SSE2_AVAILABLE;
if (d & D_BIT_3DN) hints->x86_flags |= PRIM_X86_3DNOW_AVAILABLE;
if (c & C_BIT_3DNP) hints->x86_flags |= PRIM_X86_3DNOW_PREFETCH_AVAILABLE;
if (c & C_BIT_SSE3) hints->x86_flags |= PRIM_X86_SSE3_AVAILABLE;
if (c & C_BIT_SSSE3) hints->x86_flags |= PRIM_X86_SSSE3_AVAILABLE;
if (c & C_BIT_SSE41) hints->x86_flags |= PRIM_X86_SSE41_AVAILABLE;
if (c & C_BIT_SSE42) hints->x86_flags |= PRIM_X86_SSE42_AVAILABLE;
if ((c & C_BITS_AVX) == C_BITS_AVX)
{
int e, f;
xgetbv(0, e, f);
if ((e & E_BITS_AVX) == E_BITS_AVX)
{
hints->x86_flags |= PRIM_X86_AVX_AVAILABLE;
if (c & C_BIT_FMA) hints->x86_flags |= PRIM_X86_FMA_AVAILABLE;
if (c & C_BIT_AVX_AES) hints->x86_flags |= PRIM_X86_AVX_AES_AVAILABLE;
}
}
/* TODO: AVX2: set eax=7, ecx=0, cpuid, check ebx-bit5 */
}
# else
static void set_hints(
primitives_hints_t *hints)
{
/* x86 non-GCC: TODO */
}
# endif /* __GNUC__ */
/* ------------------------------------------------------------------------- */
#elif defined(__arm__) || defined(__ARM_ARCH_7A__) \
|| defined(__ARM_EABI__) || defined(__ARMEL__) || defined(ANDROID)
#ifndef __arm__
#define __arm__
#endif
static UINT32 androidNeon(void)
{
#if ANDROID
if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) return 0;
UINT64 features = android_getCpuFeatures();
if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
{
if (features & ANDROID_CPU_ARM_FEATURE_NEON)
{
return PRIM_ARM_NEON_AVAILABLE;
}
}
/* else */
#endif
return 0;
}
static void set_hints(
primitives_hints_t *hints)
{
/* ARM: TODO */
hints->arm_flags |= androidNeon();
}
#else
static void set_hints(
primitives_hints_t *hints)
{
}
#endif /* x86 else ARM else */
/* ------------------------------------------------------------------------- */
void primitives_init(void)
{
primitives_hints_t *hints;
if (pPrimitives == NULL)
{
pPrimitives = calloc(1, sizeof(primitives_t));
if (pPrimitives == NULL) return;
}
hints = calloc(1, sizeof(primitives_hints_t));
set_hints(hints);
pPrimitives->hints = (void *) hints;
/* Now call each section's initialization routine. */
primitives_init_add(hints, pPrimitives);
primitives_init_andor(hints, pPrimitives);
primitives_init_alphaComp(hints, pPrimitives);
primitives_init_copy(hints, pPrimitives);
primitives_init_set(hints, pPrimitives);
primitives_init_shift(hints, pPrimitives);
primitives_init_sign(hints, pPrimitives);
primitives_init_colors(hints, pPrimitives);
}
/* ------------------------------------------------------------------------- */
primitives_t *primitives_get(void)
{
if (pPrimitives == NULL) primitives_init();
return pPrimitives;
}
/* ------------------------------------------------------------------------- */
UINT32 primitives_get_flags(
const primitives_t *prims)
{
primitives_hints_t *hints = (primitives_hints_t *) (prims->hints);
#ifdef i386
return hints->x86_flags;
#elif defined(__arm__)
return hints->arm_flags;
#else
return 0;
#endif
}
/* ------------------------------------------------------------------------- */
void primitives_flags_str(
const primitives_t *prims,
char *str,
size_t len)
{
typedef struct
{
UINT32 flag;
const char *str;
} flagpair_t;
static const flagpair_t x86_flags[] =
{
{ PRIM_X86_MMX_AVAILABLE, "MMX" },
{ PRIM_X86_3DNOW_AVAILABLE, "3DNow" },
{ PRIM_X86_3DNOW_PREFETCH_AVAILABLE, "3DNow-PF" },
{ PRIM_X86_SSE_AVAILABLE, "SSE" },
{ PRIM_X86_SSE2_AVAILABLE, "SSE2" },
{ PRIM_X86_SSE3_AVAILABLE, "SSE3" },
{ PRIM_X86_SSSE3_AVAILABLE, "SSSE3" },
{ PRIM_X86_SSE41_AVAILABLE, "SSE4.1" },
{ PRIM_X86_SSE42_AVAILABLE, "SSE4.2" },
{ PRIM_X86_AVX_AVAILABLE, "AVX" },
{ PRIM_X86_FMA_AVAILABLE, "FMA" },
{ PRIM_X86_AVX_AES_AVAILABLE, "AVX-AES" },
{ PRIM_X86_AVX2_AVAILABLE, "AVX2" },
};
static const flagpair_t arm_flags[] =
{
{ PRIM_ARM_VFP1_AVAILABLE, "VFP1" },
{ PRIM_ARM_VFP2_AVAILABLE, "VFP2" },
{ PRIM_ARM_VFP3_AVAILABLE, "VFP3" },
{ PRIM_ARM_VFP4_AVAILABLE, "VFP4" },
{ PRIM_ARM_FPA_AVAILABLE, "FPA" },
{ PRIM_ARM_FPE_AVAILABLE, "FPE" },
{ PRIM_ARM_IWMMXT_AVAILABLE, "IWMMXT" },
{ PRIM_ARM_NEON_AVAILABLE, "NEON" },
};
int i;
primitives_hints_t *hints;
*str = '\0';
--len; /* for the '/0' */
hints = (primitives_hints_t *) (prims->hints);
for (i=0; i<sizeof(x86_flags)/sizeof(flagpair_t); ++i)
{
if (hints->x86_flags & x86_flags[i].flag)
{
int slen = strlen(x86_flags[i].str) + 1;
if (len < slen) break;
if (*str != '\0') strcat(str, " ");
strcat(str, x86_flags[i].str);
len -= slen;
}
}
for (i=0; i<sizeof(arm_flags)/sizeof(flagpair_t); ++i)
{
if (hints->arm_flags & arm_flags[i].flag)
{
int slen = strlen(arm_flags[i].str) + 1;
if (len < slen) break;
if (*str != '\0') strcat(str, " ");
strcat(str, arm_flags[i].str);
len -= slen;
}
}
}
/* ------------------------------------------------------------------------- */
void primitives_deinit(void)
{
if (pPrimitives == NULL) return;
/* Call each section's de-initialization routine. */
primitives_deinit_add(pPrimitives);
primitives_deinit_andor(pPrimitives);
primitives_deinit_alphaComp(pPrimitives);
primitives_deinit_copy(pPrimitives);
primitives_deinit_set(pPrimitives);
primitives_deinit_shift(pPrimitives);
primitives_deinit_sign(pPrimitives);
primitives_deinit_colors(pPrimitives);
if (pPrimitives->hints != NULL) free((void *) (pPrimitives->hints));
free((void *) pPrimitives);
pPrimitives = NULL;
}

View File

@ -0,0 +1,139 @@
# FreeRDP: A Remote Desktop Protocol Client
# primitives test makefile builder
# vi:ts=4 sw=4:
#
# (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing permissions
# and limitations under the License.
#
# TODO: Integrate this into the testing framework, in some form.
# Right now this produces a standalone test that covers both functionality
# and performance of the primitives library entrypoints.
cmake_minimum_required(VERSION 2.8)
set(MODULE_NAME "prim_test")
set(MODULE_PREFIX "PRIMITIVES_LIBRARY_TEST")
set(PRIMITIVE_TEST_CFILES
prim_test.c
test_add.c
test_alphaComp.c
test_andor.c
test_colors.c
test_copy.c
test_set.c
test_shift.c
test_sign.c
../prim_add.c
../prim_andor.c
../prim_alphaComp.c
../prim_colors.c
../prim_copy.c
../prim_set.c
../prim_shift.c
../prim_sign.c
../primitives.c
)
set(PRIMITIVE_TEST_HEADERS
measure.h
prim_test.h
../prim_internal.h
)
set(PRIMITIVE_TEST_SRCS
${PRIMITIVE_TEST_CFILES}
${PRIMITIVE_TEST_HEADERS}
)
include_directories(. ../../.. ../../../include ../../../winpr/include)
add_definitions(-DPRIM_STATIC=auto -DALL_PRIMITIVES_VERSIONS -DHAVE_CONFIG_H)
# If these haven't been set by the caller, set them now to defaults.
if(NOT DEFINED WITH_IPP)
set(WITH_IPP FALSE)
endif()
if(NOT DEFINED WITH_SSE2)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*")
set(WITH_SSE2 FALSE)
else()
set(WITH_SSE2 TRUE)
endif()
endif()
if(NOT DEFINED WITH_NEON)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*")
set(WITH_NEON TRUE)
else()
set(WITH_NEON FALSE)
endif()
endif()
if(WITH_SSE2)
if(CMAKE_COMPILER_IS_GNUCC)
set(OPTFLAGS "${OPTFLAGS} -msse2 -mssse3 -O2 -Wdeclaration-after-statement")
endif()
if(MSVC)
set(OPTFLAGS "${OPTFLAGS} /arch:SSE2")
endif()
elseif(WITH_NEON)
if(CMAKE_COMPILER_IS_GNUCC)
set(OPTIMZATION "${OPTFLAGS} -mfpu=neon -mfloat-abi=softfp -O2")
endif()
# TODO: Add MSVC equivalent
endif()
add_executable(prim_test ${PRIMITIVE_TEST_SRCS})
if(WITH_IPP)
if(NOT DEFINED IPP_FOUND)
include(../../../cmake/FindIPP.cmake)
endif()
add_definitions(-DWITH_IPP)
# IPP PATH debugging messages
message(IPP_FOUND=${IPP_FOUND})
message(IPP_VERSION_STR=${IPP_VERSION_STR})
message(IPP_VERSION_MAJOR=${IPP_VERSION_MAJOR})
message(IPP_VERSION_MINOR=${IPP_VERSION_MINOR})
message(IPP_VERSION_BUILD=${IPP_VERSION_BUILD})
message(IPP_ROOT_DIR=${IPP_ROOT_DIR})
message(IPP_INCLUDE_DIRS=${IPP_INCLUDE_DIRS})
message(IPP_LIBRARY_DIRS=${IPP_LIBRARY_DIRS})
message(IPP_LIBRARIES=${IPP_LIBRARIES})
message(IPP_COMPILER_LIBRARY_DIRS=${IPP_COMPILER_LIBRARY_DIRS})
message(IPP_COMPILER_LIBRARIES=${IPP_COMPILER_LIBRARIES})
message(IPP_LIBRARY_LIST=${IPP_LIBRARY_LIST})
message(IPP_LIB_PREFIX=${IPP_LIB_PREFIX})
message(IPP_LIB_SUFFIX=${IPP_LIB_SUFFIX})
message(IPP_PREFIX=${IPP_PREFIX})
message(IPP_SUFFIX=${IPP_SUFFIX})
message(IPPCORE=${IPPCORE})
message(IPPS=${IPPS})
message(IPPI=${IPPI})
message(IPPCC=${IPPCC})
message(IPPCV=${IPPCV})
message(IPPVM=${IPPVM})
if(CMAKE_COMPILER_IS_GNUCC)
foreach(INCLDIR ${IPP_INCLUDE_DIRS})
set(OPTFLAGS "${OPTFLAGS} -I${INCLDIR}")
endforeach(INCLDIR)
endif()
target_link_libraries(prim_test ${IPP_LIBRARY_LIST})
endif()
set_property(SOURCE ${PRIMITIVE_TEST_CFILES} PROPERTY COMPILE_FLAGS ${OPTFLAGS})
target_link_libraries(prim_test rt)
if(NOT TESTING_OUTPUT_DIRECTORY)
set(TESTING_OUTPUT_DIRECTORY .)
endif()
add_test(prim_test ${TESTING_OUTPUT_DIRECTORY}/prim_test functionality)

View File

@ -0,0 +1,121 @@
/* measure.h
* Macros to help with performance measurement.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
* MEASURE_LOOP_START("measurement", 2000)
* code to be measured
* MEASURE_LOOP_STOP
* buffer flush and such
* MEASURE_SHOW_RESULTS
*
* Define GOOGLE_PROFILER if you want gperftools included.
*/
#ifdef _GNUC_
# pragma once
#endif
#ifndef __MEASURE_H_INCLUDED__
#define __MEASURE_H_INCLUDED__
#include <time.h>
#include <sys/param.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef GOOGLE_PROFILER
#include <gperftools/profiler.h>
#define PROFILER_START(_prefix_) \
do { \
char _path[PATH_MAX]; \
sprintf(_path, "./%s.prof", (_prefix_)); \
ProfilerStart(_path); \
} while (0);
# define PROFILER_STOP \
do { \
ProfilerStop(); \
} while (0);
#else
#define PROFILER_START(_prefix_)
#define PROFILER_STOP
#endif // GOOGLE_PROFILER
extern float _delta_time(const struct timespec *t0, const struct timespec *t1);
extern void _floatprint(float t, char *output);
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif // !CLOCK_MONOTONIC_RAW
#define MEASURE_LOOP_START(_prefix_, _count_) \
{ struct timespec _start, _stop; \
char *_prefix; \
int _count = (_count_); \
int _loop; \
float _delta; \
char _str1[32], _str2[32]; \
_prefix = strdup(_prefix_); \
_str1[0] = '\0'; _str2[0] = '\0'; \
clock_gettime(CLOCK_MONOTONIC_RAW, &_start); \
PROFILER_START(_prefix); \
_loop = (_count); \
do {
#define MEASURE_LOOP_STOP \
} while (--_loop);
#define MEASURE_GET_RESULTS(_result_) \
PROFILER_STOP; \
clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
_delta = _delta_time(&_start, &_stop); \
(_result_) = (float) _count / _delta; \
free(_prefix); \
}
#define MEASURE_SHOW_RESULTS(_result_) \
PROFILER_STOP; \
clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
_delta = _delta_time(&_start, &_stop); \
(_result_) = (float) _count / _delta; \
_floatprint((float) _count / _delta, _str1); \
printf("%s: %9d iterations in %5.1f seconds = %s/s \n", \
_prefix, _count, _delta, _str1); \
free(_prefix); \
}
#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) \
PROFILER_STOP; \
clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
_delta = _delta_time(&_start, &_stop); \
_floatprint((float) _count / _delta, _str1); \
_floatprint((float) _count / _delta * (_scale_), _str2); \
printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n", \
_prefix, _count, _delta, _str1, _str2, _label_); \
free(_prefix); \
}
#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_) \
{ float _r; \
MEASURE_LOOP_START(_label_, _init_iter_); \
_call_; \
MEASURE_LOOP_STOP; \
MEASURE_GET_RESULTS(_r); \
MEASURE_LOOP_START(_label_, _r * _test_time_); \
_call_; \
MEASURE_LOOP_STOP; \
MEASURE_SHOW_RESULTS(_result_); \
}
#endif // __MEASURE_H_INCLUDED__

View File

@ -0,0 +1,414 @@
/* prim_test.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
int Quiet = 0;
/* ------------------------------------------------------------------------- */
static void get_random_data_lrand(
void *buffer,
size_t size)
{
static int seeded = 0;
long int *ptr = (long int *) buffer;
unsigned char *cptr;
if (!seeded)
{
seeded = 1;
srand48(time(NULL));
}
/* This isn't the perfect random number generator, but that's okay. */
while (size >= sizeof(long int))
{
*ptr++ = lrand48();
size -= sizeof(long int);
}
cptr = (unsigned char *) ptr;
while (size > 0)
{
*cptr++ = lrand48() & 0xff;
--size;
}
}
/* ------------------------------------------------------------------------- */
void get_random_data(
void *buffer,
size_t size)
{
#ifdef linux
size_t offset = 0;
int fd = open("/dev/urandom", O_RDONLY);
if (fd < 0)
{
get_random_data_lrand(buffer, size);
return;
}
while (size > 0)
{
ssize_t count = read(fd, buffer+offset, size);
size -= count;
offset += count;
}
close(fd);
#else
get_random_data_lrand(buffer, size);
#endif
}
/* ------------------------------------------------------------------------- */
float _delta_time(
const struct timespec *t0,
const struct timespec *t1)
{
INT64 secs = (INT64) (t1->tv_sec) - (INT64) (t0->tv_sec);
long nsecs = t1->tv_nsec - t0->tv_nsec;
double retval;
if (nsecs < 0)
{
--secs;
nsecs += 1000000000;
}
retval = (double) secs + (double) nsecs / (double) 1000000000.0;
return (retval < 0.0) ? 0.0 : (float) retval;
}
/* ------------------------------------------------------------------------- */
void _floatprint(
float t,
char *output)
{
/* I don't want to link against -lm, so avoid log,exp,... */
float f = 10.0;
int i;
while (t > f) f *= 10.0;
f /= 1000.0;
i = ((int) (t/f+0.5)) * (int) f;
if (t < 0.0) sprintf(output, "%f", t);
else if (i == 0) sprintf(output, "%d", (int) (t+0.5));
else if (t < 1e+3) sprintf(output, "%3d", i);
else if (t < 1e+6) sprintf(output, "%3d,%03d",
i/1000, i % 1000);
else if (t < 1e+9) sprintf(output, "%3d,%03d,000",
i/1000000, (i % 1000000) / 1000);
else if (t < 1e+12) sprintf(output, "%3d,%03d,000,000",
i/1000000000, (i % 1000000000) / 1000000);
else sprintf(output, "%f", t);
}
/* ------------------------------------------------------------------------- */
/* Specific areas to test: */
#define TEST_COPY8 (1<<0)
#define TEST_SET8 (1<<1)
#define TEST_SET32S (1<<2)
#define TEST_SET32U (1<<3)
#define TEST_SIGN16S (1<<4)
#define TEST_ADD16S (1<<5)
#define TEST_LSHIFT16S (1<<6)
#define TEST_LSHIFT16U (1<<7)
#define TEST_RSHIFT16S (1<<8)
#define TEST_RSHIFT16U (1<<9)
#define TEST_RGB (1<<10)
#define TEST_ALPHA (1<<11)
#define TEST_AND (1<<12)
#define TEST_OR (1<<13)
/* Specific types of testing: */
#define TEST_FUNCTIONALITY (1<<0)
#define TEST_PERFORMANCE (1<<1)
/* ------------------------------------------------------------------------- */
int main(
int argc,
char **argv)
{
typedef struct
{
const char *testStr;
UINT32 bits;
} test_t;
static const test_t testList[] =
{
{ "all", 0xFFFFFFFFU },
{ "copy", TEST_COPY8 },
{ "copy8", TEST_COPY8 },
{ "set", TEST_SET8|TEST_SET32S|TEST_SET32U },
{ "set8", TEST_SET8 },
{ "set32", TEST_SET32S|TEST_SET32U },
{ "set32s", TEST_SET32S },
{ "set32u", TEST_SET32U },
{ "sign", TEST_SIGN16S },
{ "sign16s", TEST_SIGN16S },
{ "add", TEST_ADD16S },
{ "add16s", TEST_ADD16S },
{ "lshift", TEST_LSHIFT16S|TEST_LSHIFT16U },
{ "rshift", TEST_RSHIFT16S|TEST_RSHIFT16U },
{ "shift", TEST_LSHIFT16S|TEST_LSHIFT16U|TEST_RSHIFT16S|TEST_RSHIFT16U },
{ "lshift16s", TEST_LSHIFT16S },
{ "lshift16u", TEST_LSHIFT16U },
{ "rshift16s", TEST_RSHIFT16S },
{ "rshift16u", TEST_RSHIFT16U },
{ "rgb", TEST_RGB },
{ "color", TEST_RGB },
{ "colors", TEST_RGB },
{ "alpha", TEST_ALPHA },
{ "and", TEST_AND },
{ "or", TEST_OR },
};
#define NUMTESTS (sizeof(testList)/sizeof(test_t))
static const test_t testTypeList[] =
{
{ "functionality", TEST_FUNCTIONALITY },
{ "performance", TEST_PERFORMANCE },
};
#define NUMTESTTYPES (sizeof(testTypeList)/sizeof(test_t))
char hints[256];
UINT32 testSet = 0;
UINT32 testTypes = 0;
int i;
int results = SUCCESS;
/* Parse command line for the test set. */
for (i=1; i<argc; ++i)
{
BOOL found = 0;
int j;
for (j=0; j<NUMTESTS; ++j)
{
if (strcasecmp(argv[i], testList[j].testStr) == 0)
{
testSet |= testList[j].bits;
found = 1;
break;
}
}
for (j=0; j<NUMTESTTYPES; ++j)
{
if (strcasecmp(argv[i], testTypeList[j].testStr) == 0)
{
testTypes |= testTypeList[j].bits;
found = 1;
break;
}
}
if (!found)
{
if (strstr(argv[i], "help") != NULL)
{
printf("Available tests:\n");
for (j=0; j<NUMTESTS; ++j)
{
printf(" %s\n", testList[j].testStr);
}
for (j=0; j<NUMTESTTYPES; ++j)
{
printf(" %s\n", testTypeList[j].testStr);
}
}
else fprintf(stderr, "Unknown parameter '%s'!\n", argv[i]);
}
}
if (testSet == 0) testSet = 0xffffffff;
if (testTypes == 0) testTypes = 0xffffffff;
primitives_init();
primitives_flags_str(primitives_get(), hints, sizeof(hints));
printf("Hints: %s\n", hints);
/* COPY */
if (testSet & TEST_COPY8)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_copy8u_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_copy8u_speed();
}
}
/* SET */
if (testSet & TEST_SET8)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_set8u_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_set8u_speed();
}
}
if (testSet & TEST_SET32S)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_set32s_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_set32s_speed();
}
}
if (testSet & TEST_SET32U)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_set32u_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_set32u_speed();
}
}
/* SIGN */
if (testSet & TEST_SIGN16S)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_sign16s_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_sign16s_speed();
}
}
/* ADD */
if (testSet & TEST_ADD16S)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_add16s_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_add16s_speed();
}
}
/* SHIFTS */
if (testSet & TEST_LSHIFT16S)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_lShift_16s_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_lShift_16s_speed();
}
}
if (testSet & TEST_LSHIFT16U)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_lShift_16u_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_lShift_16u_speed();
}
}
if (testSet & TEST_RSHIFT16S)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_rShift_16s_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_rShift_16s_speed();
}
}
if (testSet & TEST_RSHIFT16U)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_rShift_16u_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_rShift_16u_speed();
}
}
/* COLORS */
if (testSet & TEST_RGB)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_RGBToRGB_16s8u_P3AC4R_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_RGBToRGB_16s8u_P3AC4R_speed();
}
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_yCbCrToRGB_16s16s_P3P3_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_yCbCrToRGB_16s16s_P3P3_speed();
}
}
/* ALPHA COMPOSITION */
if (testSet & TEST_ALPHA)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_alphaComp_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_alphaComp_speed();
}
}
/* AND & OR */
if (testSet & TEST_AND)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_and_32u_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_and_32u_speed();
}
}
if (testSet & TEST_OR)
{
if (testTypes & TEST_FUNCTIONALITY)
{
results |= test_or_32u_func();
}
if (testTypes & TEST_PERFORMANCE)
{
results |= test_or_32u_speed();
}
}
primitives_deinit();
return results;
}

View File

@ -0,0 +1,247 @@
/* primtest.h
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIMTEST_H_INCLUDED__
#define __PRIMTEST_H_INCLUDED__
#include <config.h>
#include <stdint.h>
#include <winpr/wtypes.h>
#include <measure.h>
#include <string.h>
#include <stdio.h>
#include <freerdp/primitives.h>
#ifdef WITH_IPP
#include <ipps.h>
#include <ippi.h>
#endif
#define BLOCK_ALIGNMENT 16
#ifdef __GNUC__
#define ALIGN(x) x __attribute((aligned(BLOCK_ALIGNMENT)))
#define POSSIBLY_UNUSED(x) x __attribute((unused))
#else
/* TODO: Someone needs to finish this for non-GNU C */
#define ALIGN(x) x
#define POSSIBLY_UNUSED(x) x
#endif
#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_))
#define MAX_TEST_SIZE 4096
extern int test_sizes[];
#define NUM_TEST_SIZES 10
extern void get_random_data(void *buffer, size_t size);
#ifndef SUCCESS
#define SUCCESS 0
#endif
#ifndef FAILURE
#define FAILURE 1
#endif
extern int test_copy8u_func(void);
extern int test_copy8u_speed(void);
extern int test_set8u_func(void);
extern int test_set32s_func(void);
extern int test_set8u_speed(void);
extern int test_set32s_speed(void);
extern int test_set32u_speed(void);
extern int test_sign16s_func(void);
extern int test_sign16s_speed(void);
extern int test_add16s_func(void);
extern int test_add16s_speed(void);
extern int test_lShift_16s_func(void);
extern int test_lShift_16u_func(void);
extern int test_rShift_16s_func(void);
extern int test_rShift_16u_func(void);
extern int test_lShift_16s_speed(void);
extern int test_lShift_16u_speed(void);
extern int test_rShift_16s_speed(void);
extern int test_rShift_16u_speed(void);
extern int test_RGBToRGB_16s8u_P3AC4R_func(void);
extern int test_RGBToRGB_16s8u_P3AC4R_speed(void);
extern int test_yCbCrToRGB_16s16s_P3P3_func(void);
extern int test_yCbCrToRGB_16s16s_P3P3_speed(void);
extern int test_alphaComp_func(void);
extern int test_alphaComp_speed(void);
extern int test_and_32u_func(void);
extern int test_and_32u_speed(void);
extern int test_or_32u_func(void);
extern int test_or_32u_speed(void);
/* Since so much of this code is repeated, define a macro to build
* functions to do speed tests.
*/
#ifdef armel
#define SIMD_TYPE "Neon"
#else
#define SIMD_TYPE "SSE"
#endif
#define DO_NORMAL_MEASUREMENTS(_funcNormal_, _prework_) \
do { \
for (s=0; s<num_sizes; ++s) \
{ \
int iter; \
char label[256]; \
int size = size_array[s]; \
_prework_; \
iter = iterations/size; \
sprintf(label, "%s-%-4d", oplabel, size); \
MEASURE_TIMED(label, iter, test_time, resultNormal[s], \
_funcNormal_); \
} \
} while (0)
#if defined(i386) && defined(WITH_SSE2)
#define DO_SSE_MEASUREMENTS(_funcSSE_, _prework_) \
do { \
for (s=0; s<num_sizes; ++s) \
{ \
int iter; \
char label[256]; \
int size = size_array[s]; \
_prework_; \
iter = iterations/size; \
sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \
MEASURE_TIMED(label, iter, test_time, resultSSENeon[s], \
_funcSSE_); \
} \
} while (0)
#else
#define DO_SSE_MEASUREMENTS(_funcSSE_, _prework_)
#endif
#if defined(armel) && defined(INCLUDE_NEON_MEASUREMENTS)
#define DO_NEON_MEASUREMENTS(_funcNeon_, _prework_) \
do { \
for (s=0; s<num_sizes; ++s) \
{ \
int iter; \
char label[256]; \
int size = size_array[s]; \
_prework_; \
iter = iterations/size; \
sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \
MEASURE_TIMED(label, iter, test_time, resultSSENeon[s], \
_funcNeon_); \
} \
} while (0)
#else
#define DO_NEON_MEASUREMENTS(_funcNeon_, _prework_)
#endif
#if defined(i386) && defined(WITH_IPP)
#define DO_IPP_MEASUREMENTS(_funcIPP_, _prework_) \
do { \
for (s=0; s<num_sizes; ++s) \
{ \
int iter; \
char label[256]; \
int size = size_array[s]; \
_prework_; \
iter = iterations/size; \
sprintf(label, "IPP-%s-%-4d", oplabel, size); \
MEASURE_TIMED(label, iter, test_time, resultIPP[s], \
_funcIPP_); \
} \
} while (0)
#else
#define DO_IPP_MEASUREMENTS(_funcIPP_, _prework_)
#endif
/* ------------------------------------------------------------------------- */
#define STD_SPEED_TEST( \
_name_, _srctype_, _dsttype_, _prework_, \
_doNormal_, _funcNormal_, \
_doSSE_, _funcSSE_, _flagsSSE_, \
_doNeon_, _funcNeon_, _flagsNeon_, \
_doIPP_, _funcIPP_) \
static void _name_( \
const char *oplabel, const char *type, \
const _srctype_ *src1, const _srctype_ *src2, _srctype_ constant, \
_dsttype_ *dst, \
const int *size_array, int num_sizes, \
int iterations, float test_time) \
{ \
int s; \
float *resultNormal, *resultSSENeon, *resultIPP; \
UINT32 pflags = primitives_get_flags(primitives_get()); \
resultNormal = (float *) calloc(num_sizes, sizeof(float)); \
resultSSENeon = (float *) calloc(num_sizes, sizeof(float)); \
resultIPP = (float *) calloc(num_sizes, sizeof(float)); \
printf("******************** %s %s ******************\n", \
oplabel, type); \
if (_doNormal_) { DO_NORMAL_MEASUREMENTS(_funcNormal_, _prework_); } \
if (_doSSE_) { \
if ((pflags & (_flagsSSE_)) == (_flagsSSE_)) \
{ \
DO_SSE_MEASUREMENTS(_funcSSE_, _prework_); \
} \
} \
if (_doNeon_) { \
if ((pflags & (_flagsNeon_)) == (_flagsNeon_)) \
{ \
DO_NEON_MEASUREMENTS(_funcNeon_, _prework_); \
} \
} \
if (_doIPP_) { DO_IPP_MEASUREMENTS(_funcIPP_, _prework_); } \
printf("----------------------- SUMMARY ----------------------------\n"); \
printf("%8s: %15s %15s %5s %15s %5s\n", \
"size", "general", SIMD_TYPE, "%", "IPP", "%"); \
for (s=0; s<num_sizes; ++s) \
{ \
char sN[32], sSN[32], sSNp[8], sIPP[32], sIPPp[8]; \
strcpy(sN, "N/A"); strcpy(sSN, "N/A"); strcpy(sSNp, "N/A"); \
strcpy(sIPP, "N/A"); strcpy(sIPPp, "N/A"); \
if (resultNormal[s] > 0.0) _floatprint(resultNormal[s], sN); \
if (resultSSENeon[s] > 0.0) \
{ \
_floatprint(resultSSENeon[s], sSN); \
if (resultNormal[s] > 0.0) \
{ \
sprintf(sSNp, "%d%%", \
(int) (resultSSENeon[s] / resultNormal[s] * 100.0 + 0.5)); \
} \
} \
if (resultIPP[s] > 0.0) \
{ \
_floatprint(resultIPP[s], sIPP); \
if (resultNormal[s] > 0.0) \
{ \
sprintf(sIPPp, "%d%%", \
(int) (resultIPP[s] / resultNormal[s] * 100.0 + 0.5)); \
} \
} \
printf("%8d: %15s %15s %5s %15s %5s\n", \
size_array[s], sN, sSN, sSNp, sIPP, sIPPp); \
} \
free(resultNormal); free(resultSSENeon); free(resultIPP); \
}
#endif // !__PRIMTEST_H_INCLUDED__

View File

@ -0,0 +1,105 @@
/* test_add.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
static const int ADD16S_PRETEST_ITERATIONS = 300000*64;
static const int TEST_TIME = 2.0; // seconds
extern pstatus_t general_add_16s(
const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len);
extern pstatus_t sse3_add_16s(
const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len);
/* ========================================================================= */
int test_add16s_func(void)
{
INT16 ALIGN(src1[FUNC_TEST_SIZE+3]), ALIGN(src2[FUNC_TEST_SIZE+3]),
ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]);
int failed = 0;
int i;
char testStr[256];
UINT32 pflags = primitives_get_flags(primitives_get());
testStr[0] = '\0';
get_random_data(src1, sizeof(src1));
get_random_data(src2, sizeof(src2));
memset(d1, 0, sizeof(d1));
memset(d2, 0, sizeof(d2));
general_add_16s(src1+1, src2+1, d1+1, FUNC_TEST_SIZE);
#ifdef i386
if (pflags & PRIM_X86_SSE3_AVAILABLE)
{
strcat(testStr, " SSE3");
/* Aligned */
sse3_add_16s(src1+1, src2+1, d2+1, FUNC_TEST_SIZE);
for (i=1; i<FUNC_TEST_SIZE; ++i)
{
if (d1[i] != d2[i])
{
printf("ADD16S-SSE-aligned FAIL[%d] %d+%d=%d, got %d\n",
i, src1[i], src2[i], d1[i], d2[i]);
++failed;
}
}
/* Unaligned */
sse3_add_16s(src1+1, src2+1, d2+2, FUNC_TEST_SIZE);
for (i=1; i<FUNC_TEST_SIZE; ++i)
{
if (d1[i] != d2[i+1])
{
printf("ADD16S-SSE-unaligned FAIL[%d] %d+%d=%d, got %d\n",
i, src1[i], src2[i], d1[i], d2[i+1]);
++failed;
}
}
}
#endif /* i386 */
#ifdef WITH_IPP
strcat(testStr, " IPP");
ippsAdd_16s(src1+1, src2+1, d2+1, FUNC_TEST_SIZE);
for (i=1; i<FUNC_TEST_SIZE; ++i)
{
if (d1[i] != d2[i])
{
printf("ADD16S-IPP FAIL[%d] %d+%d=%d, got %d\n",
i, src1[i], src2[i], d1[i], d2[i]);
++failed;
}
}
#endif /* WITH_IPP */
if (!failed) printf("All add16s tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(add16s_speed_test, INT16, INT16, dst=dst,
TRUE, general_add_16s(src1, src2, dst, size),
TRUE, sse3_add_16s(src1, src2, dst, size), PRIM_X86_SSE3_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsAdd_16s(src1, src2, dst, size));
int test_add16s_speed(void)
{
INT16 ALIGN(src1[MAX_TEST_SIZE+3]), ALIGN(src2[MAX_TEST_SIZE+3]),
ALIGN(dst[MAX_TEST_SIZE+3]);
get_random_data(src1, sizeof(src1));
get_random_data(src2, sizeof(src2));
add16s_speed_test("add16s", "aligned", src1, src2, 0, dst,
test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME);
add16s_speed_test("add16s", "unaligned", src1+1, src2+2, 0, dst,
test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}

View File

@ -0,0 +1,225 @@
/* test_alphaComp.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
static const int ALPHA_PRETEST_ITERATIONS = 5000000;
static const float TEST_TIME = 5.0;
static const int block_size[] = { 4, 64, 256 };
#define NUM_BLOCK_SIZES (sizeof(block_size)/sizeof(int))
#define MAX_BLOCK_SIZE 256
#define SIZE_SQUARED (MAX_BLOCK_SIZE*MAX_BLOCK_SIZE)
extern pstatus_t general_alphaComp_argb(
const BYTE *pSrc1, int src1Step,
const BYTE *pSrc2, int src2Step,
BYTE *pDst, int dstStep,
int width, int height);
extern pstatus_t sse2_alphaComp_argb(
const BYTE *pSrc1, int src1Step,
const BYTE *pSrc2, int src2Step,
BYTE *pDst, int dstStep,
int width, int height);
extern pstatus_t ipp_alphaComp_argb(
const BYTE *pSrc1, int src1Step,
const BYTE *pSrc2, int src2Step,
BYTE *pDst, int dstStep,
int width, int height);
/* ========================================================================= */
#define ALF(_c_) (((_c_) & 0xFF000000U) >> 24)
#define RED(_c_) (((_c_) & 0x00FF0000U) >> 16)
#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
#define BLU(_c_) ((_c_) & 0x000000FFU)
#define TOLERANCE 1
#define PIXEL(_addr_, _bytes_, _x_, _y_) \
((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_)))
#define SRC1_WIDTH 6
#define SRC1_HEIGHT 6
#define SRC2_WIDTH 7
#define SRC2_HEIGHT 7
#define DST_WIDTH 9
#define DST_HEIGHT 9
#define TEST_WIDTH 4
#define TEST_HEIGHT 5
/* ------------------------------------------------------------------------- */
static UINT32 alpha_add(
UINT32 c1,
UINT32 c2)
{
UINT32 a1 = ALF(c1);
UINT32 r1 = RED(c1);
UINT32 g1 = GRN(c1);
UINT32 b1 = BLU(c1);
UINT32 a2 = ALF(c2);
UINT32 r2 = RED(c2);
UINT32 g2 = GRN(c2);
UINT32 b2 = BLU(c2);
UINT32 a3 = ((a1 * a1 + (255-a1) * a2) / 255) & 0xff;
UINT32 r3 = ((a1 * r1 + (255-a1) * r2) / 255) & 0xff;
UINT32 g3 = ((a1 * g1 + (255-a1) * g2) / 255) & 0xff;
UINT32 b3 = ((a1 * b1 + (255-a1) * b2) / 255) & 0xff;
return (a3 << 24) | (r3 << 16) | (g3 << 8) | b3;
}
/* ------------------------------------------------------------------------- */
static UINT32 colordist(
UINT32 c1,
UINT32 c2)
{
int d, maxd = 0;
d = ABS(ALF(c1) - ALF(c2));
if (d > maxd) maxd = d;
d = ABS(RED(c1) - RED(c2));
if (d > maxd) maxd = d;
d = ABS(GRN(c1) - GRN(c2));
if (d > maxd) maxd = d;
d = ABS(BLU(c1) - BLU(c2));
if (d > maxd) maxd = d;
return maxd;
}
/* ------------------------------------------------------------------------- */
int test_alphaComp_func(void)
{
UINT32 ALIGN(src1[SRC1_WIDTH*SRC1_HEIGHT]);
UINT32 ALIGN(src2[SRC2_WIDTH*SRC2_HEIGHT]);
UINT32 ALIGN(dst1[DST_WIDTH*DST_HEIGHT]);
UINT32 ALIGN(dst2a[DST_WIDTH*DST_HEIGHT]);
UINT32 ALIGN(dst2u[DST_WIDTH*DST_HEIGHT+1]);
UINT32 ALIGN(dst3[DST_WIDTH*DST_HEIGHT]);
int error = 0;
UINT32 pflags = primitives_get_flags(primitives_get());
char testStr[256];
UINT32 *ptr;
int i, x, y;
testStr[0] = '\0';
get_random_data(src1, sizeof(src1));
/* Special-case the first two values */
src1[0] &= 0x00FFFFFFU;
src1[1] |= 0xFF000000U;
get_random_data(src2, sizeof(src2));
/* Set the second operand to fully-opaque. */
ptr = src2;
for (i=0; i<sizeof(src2)/4; ++i) *ptr++ |= 0xFF000000U;
memset(dst1, 0, sizeof(dst1));
memset(dst2a, 0, sizeof(dst2a));
memset(dst2u, 0, sizeof(dst2u));
memset(dst3, 0, sizeof(dst3));
general_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
(const BYTE *) src2, 4*SRC2_WIDTH,
(BYTE *) dst1, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
#ifdef i386
if (pflags & PRIM_X86_SSE2_AVAILABLE)
{
strcat(testStr, " SSE2");
sse2_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
(const BYTE *) src2, 4*SRC2_WIDTH,
(BYTE *) dst2a, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
sse2_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
(const BYTE *) src2, 4*SRC2_WIDTH,
(BYTE *) (dst2u+1), 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
}
#endif /* i386 */
#ifdef WITH_IPP
strcat(testStr, " IPP");
ipp_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH,
(const BYTE *) src2, 4*SRC2_WIDTH,
(BYTE *) dst3, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
#endif
for (y=0; y<TEST_HEIGHT; ++y)
{
for (x=0; x<TEST_WIDTH; ++x)
{
UINT32 s1 = *PIXEL(src1, 4*SRC1_WIDTH, x, y);
UINT32 s2 = *PIXEL(src2, 4*SRC2_WIDTH, x, y);
UINT32 c0 = alpha_add(s1, s2);
UINT32 c1 = *PIXEL(dst1, 4*DST_WIDTH, x, y);
if (colordist(c0, c1) > TOLERANCE)
{
printf("alphaComp-general: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
x, y, s1, s2, c0, c1);
error = 1;
}
#ifdef i386
if (pflags & PRIM_X86_SSE2_AVAILABLE)
{
UINT32 c2 = *PIXEL(dst2a, 4*DST_WIDTH, x, y);
if (colordist(c0, c2) > TOLERANCE)
{
printf("alphaComp-SSE-aligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
x, y, s1, s2, c0, c2);
error = 1;
}
c2 = *PIXEL(dst2u+1, 4*DST_WIDTH, x, y);
if (colordist(c0, c2) > TOLERANCE)
{
printf("alphaComp-SSE-unaligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
x, y, s1, s2, c0, c2);
error = 1;
}
}
#endif /* i386 */
#ifdef WITH_IPP
UINT32 c3 = *PIXEL(dst3, 4*DST_WIDTH, x, y);
if (colordist(c0, c3) > TOLERANCE)
{
printf("alphaComp-IPP: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n",
x, y, s1, s2, c0, c3);
error = 1;
}
#endif
}
}
if (!error) printf("All alphaComp tests passed (%s).\n", testStr);
return (error > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(alphaComp_speed, BYTE, BYTE, int bytes = size*4,
TRUE, general_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
size, size),
TRUE, sse2_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
size, size), PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ipp_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
size, size));
int test_alphaComp_speed(void)
{
INT32 ALIGN(src1[MAX_BLOCK_SIZE*(MAX_BLOCK_SIZE+1)]),
ALIGN(src2[SIZE_SQUARED]),
ALIGN(dst[SIZE_SQUARED]);
get_random_data(src1, sizeof(src1));
get_random_data(src2, sizeof(src2));
alphaComp_speed("alphaComp", "aligned",
(BYTE *) src1, (BYTE *) src2, 0, (BYTE *) dst,
block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME);
alphaComp_speed("alphaComp", "unaligned",
(BYTE *) src1+1, (BYTE *) src2, 0, (BYTE *) dst,
block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}

View File

@ -0,0 +1,178 @@
/* test_andor.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
static const int ANDOR_PRETEST_ITERATIONS = 100000;
static const int TEST_TIME = 2.0; // seconds
extern pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val,
UINT32 *pDst, int len);
extern pstatus_t sse3_andC_32u(const UINT32 *pSrc, UINT32 val,
UINT32 *pDst, int len);
extern pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val,
UINT32 *pDst, int len);
extern pstatus_t sse3_orC_32u(const UINT32 *pSrc, UINT32 val,
UINT32 *pDst, int len);
#define VALUE (0xA5A5A5A5U)
/* ========================================================================= */
int test_and_32u_func(void)
{
UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]);
int failed = 0;
int i;
UINT32 pflags = primitives_get_flags(primitives_get());
char testStr[256];
testStr[0] = '\0';
get_random_data(src, sizeof(src));
general_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
strcat(testStr, " general");
for (i=1; i<=FUNC_TEST_SIZE; ++i)
{
if (dst[i] != (src[i] & VALUE))
{
printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i]);
++failed;
}
}
#ifdef i386
if (pflags & PRIM_X86_SSE3_AVAILABLE)
{
strcat(testStr, " SSE3");
/* Aligned */
memset(dst, 0, sizeof(dst));
sse3_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
for (i=1; i<=FUNC_TEST_SIZE; ++i)
{
if (dst[i] != (src[i] & VALUE))
{
printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i]);
++failed;
}
}
/* Unaligned */
memset(dst, 0, sizeof(dst));
sse3_andC_32u(src+1, VALUE, dst+2, FUNC_TEST_SIZE);
for (i=1; i<=FUNC_TEST_SIZE; ++i)
{
if (dst[i+1] != (src[i] & VALUE))
{
printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i+1]);
++failed;
}
}
}
#endif /* i386 */
if (!failed) printf("All and_32u tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(andC_32u_speed_test, UINT32, UINT32, dst=dst,
TRUE, general_andC_32u(src1, constant, dst, size),
TRUE, sse3_andC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsAndC_32u(src1, constant, dst, size))
int test_and_32u_speed(void)
{
UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
get_random_data(src, sizeof(src));
andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst,
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
andC_32u_speed_test("and32u", "unaligned", src+1, NULL, VALUE, dst,
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}
/* ========================================================================= */
int test_or_32u_func(void)
{
UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]);
int failed = 0;
int i;
UINT32 pflags = primitives_get_flags(primitives_get());
char testStr[256];
testStr[0] = '\0';
get_random_data(src, sizeof(src));
general_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
strcat(testStr, " general");
for (i=1; i<=FUNC_TEST_SIZE; ++i)
{
if (dst[i] != (src[i] | VALUE))
{
printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] | VALUE, dst[i]);
++failed;
}
}
#ifdef i386
if (pflags & PRIM_X86_SSE3_AVAILABLE)
{
strcat(testStr, " SSE3");
/* Aligned */
memset(dst, 0, sizeof(dst));
sse3_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
for (i=1; i<FUNC_TEST_SIZE; ++i)
{
if (dst[i] != (src[i] | VALUE))
{
printf("OR-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] | VALUE, dst[i]);
++failed;
}
}
/* Unaligned */
memset(dst, 0, sizeof(dst));
sse3_orC_32u(src+1, VALUE, dst+2, FUNC_TEST_SIZE);
for (i=1; i<FUNC_TEST_SIZE; ++i)
{
if (dst[i+1] != (src[i] | VALUE))
{
printf("OR-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] | VALUE, dst[i+1]);
++failed;
}
}
}
#endif /* i386 */
if (!failed) printf("All or_32u tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(orC_32u_speed_test, UINT32, UINT32, dst=dst,
TRUE, general_orC_32u(src1, constant, dst, size),
TRUE, sse3_orC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsOrC_32u(src1, constant, dst, size))
int test_or_32u_speed(void)
{
UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
get_random_data(src, sizeof(src));
orC_32u_speed_test("or32u", "aligned", src, NULL, VALUE, dst,
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
orC_32u_speed_test("or32u", "unaligned", src+1, NULL, VALUE, dst,
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}

View File

@ -0,0 +1,226 @@
/* test_colors.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
static const int RGB_TRIAL_ITERATIONS = 1000;
static const int YCBCR_TRIAL_ITERATIONS = 1000;
static const float TEST_TIME = 4.0;
extern pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3],
int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
extern pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3],
int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
extern pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3],
int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi);
extern pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3],
int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi);
/* ------------------------------------------------------------------------- */
int test_RGBToRGB_16s8u_P3AC4R_func(void)
{
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
UINT32 ALIGN(out1[4096]), ALIGN(out2[4096]);
int i;
int failed = 0;
UINT32 pflags = primitives_get_flags(primitives_get());
char testStr[256];
INT16 *ptrs[3];
prim_size_t roi = { 64, 64 };
testStr[0] = '\0';
get_random_data(r, sizeof(r));
get_random_data(g, sizeof(g));
get_random_data(b, sizeof(b));
/* clear upper bytes */
for (i=0; i<4096; ++i)
{
r[i] &= 0x00FFU;
g[i] &= 0x00FFU;
b[i] &= 0x00FFU;
}
ptrs[0] = r;
ptrs[1] = g;
ptrs[2] = b;
general_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2,
(BYTE *) out1, 64*4, &roi);
#ifdef i386
if (pflags & PRIM_X86_SSE2_AVAILABLE)
{
strcat(testStr, " SSE2");
sse2_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2,
(BYTE *) out2, 64*4, &roi);
for (i=0; i<4096; ++i)
{
if (out1[i] != out2[i])
{
printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
i, out1[i], i, out2[i]);
failed = 1;
}
}
}
#endif /* i386 */
if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
static const prim_size_t roi64x64 = { 64, 64 };
STD_SPEED_TEST(
rgb_to_argb_speed, INT16*, UINT32, dst=dst,
TRUE, general_RGBToRGB_16s8u_P3AC4R(
(const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64),
TRUE, sse2_RGBToRGB_16s8u_P3AC4R(
(const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64),
PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
FALSE, dst=dst);
int test_RGBToRGB_16s8u_P3AC4R_speed(void)
{
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
UINT32 ALIGN(dst[4096]);
int i;
INT16 *ptrs[3];
int size_array[] = { 64 };
get_random_data(r, sizeof(r));
get_random_data(g, sizeof(g));
get_random_data(b, sizeof(b));
/* clear upper bytes */
for (i=0; i<4096; ++i)
{
r[i] &= 0x00FFU;
g[i] &= 0x00FFU;
b[i] &= 0x00FFU;
}
ptrs[0] = r;
ptrs[1] = g;
ptrs[2] = b;
rgb_to_argb_speed("RGBToARGB", "aligned",
(const INT16 **) ptrs, NULL, 0, dst,
size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
return SUCCESS;
}
/* ========================================================================= */
int test_yCbCrToRGB_16s16s_P3P3_func(void)
{
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
int i;
int failed = 0;
UINT32 pflags = primitives_get_flags(primitives_get());
char testStr[256];
const INT16 *in[3];
INT16 *out1[3];
INT16 *out2[3];
prim_size_t roi = { 64, 64 };
testStr[0] = '\0';
get_random_data(y, sizeof(y));
get_random_data(cb, sizeof(cb));
get_random_data(cr, sizeof(cr));
/* Normalize to 11.5 fixed radix */
for (i=0; i<4096; ++i)
{
y[i] &= 0x1FE0U;
cb[i] &= 0x1FE0U;
cr[i] &= 0x1FE0U;
}
memset(r1, 0, sizeof(r1));
memset(g1, 0, sizeof(g1));
memset(b1, 0, sizeof(b1));
memset(r2, 0, sizeof(r2));
memset(g2, 0, sizeof(g2));
memset(b2, 0, sizeof(b2));
in[0] = y;
in[1] = cb;
in[2] = cr;
out1[0] = r1;
out1[1] = g1;
out1[2] = b1;
out2[0] = r2;
out2[1] = g2;
out2[2] = b2;
general_yCbCrToRGB_16s16s_P3P3(in, 64*2, out1, 64*2, &roi);
#ifdef i386
if (pflags & PRIM_X86_SSE2_AVAILABLE)
{
strcat(testStr, " SSE2");
sse2_yCbCrToRGB_16s16s_P3P3(in, 64*2, out2, 64*2, &roi);
for (i=0; i<4096; ++i)
{
if ((ABS(r1[i]-r2[i]) > 1)
|| (ABS(g1[i]-g2[i]) > 1)
|| (ABS(b1[i]-b2[i]) > 1)) {
printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
r1[i],g1[i],b1[i], r2[i],g2[i],b2[i]);
failed = 1;
}
}
}
#endif /* i386 */
if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(
ycbcr_to_rgb_speed, INT16*, INT16*, dst=dst,
TRUE, general_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64),
TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64),
PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
FALSE, dst=dst);
int test_yCbCrToRGB_16s16s_P3P3_speed(void)
{
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
int i;
const INT16 *input[3];
INT16 *output[3];
int size_array[] = { 64 };
get_random_data(y, sizeof(y));
get_random_data(cb, sizeof(cb));
get_random_data(cr, sizeof(cr));
/* Normalize to 11.5 fixed radix */
for (i=0; i<4096; ++i)
{
y[i] &= 0x1FE0U;
cb[i] &= 0x1FE0U;
cr[i] &= 0x1FE0U;
}
input[0] = y;
input[1] = cb;
input[2] = cr;
output[0] = r;
output[1] = g;
output[2] = b;
ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output,
size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME);
return SUCCESS;
}

View File

@ -0,0 +1,83 @@
/* test_copy.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
static const int MEMCPY_PRETEST_ITERATIONS = 1000000;
static const int TEST_TIME = 1.0; // seconds
#define COPY_TESTSIZE (256*2+16*2+15+15)
#if 0
extern pstatus_t sse3_copy_8u(const BYTE *pSrc, BYTE *pDst, int len);
#endif
/* ------------------------------------------------------------------------- */
int test_copy8u_func(void)
{
primitives_t *prims = primitives_get();
BYTE ALIGN(data[COPY_TESTSIZE+15]);
int i, soff;
int failed = 0;
char testStr[256];
BYTE ALIGN(dest[COPY_TESTSIZE+15]);
testStr[0] = '\0';
get_random_data(data, sizeof(data));
strcat(testStr, " ptr");
for (soff=0; soff<16; ++soff)
{
int doff;
for (doff=0; doff<16; ++doff)
{
int length;
for (length=1; length<=COPY_TESTSIZE-doff; ++length)
{
memset(dest, 0, sizeof(dest));
prims->copy_8u(data+soff, dest+doff, length);
for (i=0; i<length; ++i)
{
if (dest[i+doff] != data[i+soff])
{
printf("COPY8U FAIL: off=%d len=%d, dest[%d]=0x%02x"
"data[%d]=0x%02x\n",
doff, length, i+doff, dest[i+doff],
i+soff, data[i+soff]);
failed = 1;
}
}
}
}
}
if (!failed) printf("All copy8 tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst=dst,
TRUE, memcpy(dst, src1, size),
FALSE, NULL, 0,
FALSE, NULL, 0,
TRUE, ippsCopy_8u(src1, dst, size));
int test_copy8u_speed(void)
{
BYTE ALIGN(src[MAX_TEST_SIZE+4]);
BYTE ALIGN(intervening[MAX_TEST_SIZE*7]);
BYTE ALIGN(dst[MAX_TEST_SIZE+4]);
copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
copy8u_speed_test("copy8u", "unaligned", src+1, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}

View File

@ -0,0 +1,294 @@
/* test_set.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
static const int MEMSET8_PRETEST_ITERATIONS = 100000000;
static const int MEMSET32_PRETEST_ITERATIONS = 40000000;
static const float TEST_TIME = 1.0;
extern pstatus_t general_set_8u(BYTE val, BYTE *pDst, int len);
extern pstatus_t sse2_set_8u(BYTE val, BYTE *pDst, int len);
extern pstatus_t general_set_32s(INT32 val, INT32 *pDst, int len);
extern pstatus_t sse2_set_32s(INT32 val, INT32 *pDst, int len);
extern pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, int len);
extern pstatus_t sse2_set_32u(UINT32 val, UINT32 *pDst, int len);
extern pstatus_t ipp_wrapper_set_32u(UINT32 val, UINT32 *pDst, int len);
static const int set_sizes[] = { 1, 4, 16, 32, 64, 256, 1024, 4096 };
#define NUM_SET_SIZES (sizeof(set_sizes)/sizeof(int))
/* ------------------------------------------------------------------------- */
int test_set8u_func(void)
{
BYTE ALIGN(dest[48]);
int failed = 0;
int off;
char testStr[256];
UINT32 pflags = primitives_get_flags(primitives_get());
testStr[0] = '\0';
#ifdef i386
/* Test SSE under various alignments */
if (pflags & PRIM_X86_SSE2_AVAILABLE)
{
strcat(testStr, " SSE2");
for (off=0; off<16; ++off)
{
int len;
for (len=1; len<48-off; ++len)
{
int i;
memset(dest, 0, sizeof(dest));
sse2_set_8u(0xa5, dest+off, len);
for (i=0; i<len; ++i)
{
if (dest[off+i] != 0xa5)
{
printf("SET8U-SSE FAILED: off=%d len=%d dest[%d]=0x%02x\n",
off, len, i+off, dest[i+off]);
failed=1;
}
}
}
}
}
#endif /* i386 */
#ifdef WITH_IPP
/* Test IPP under various alignments */
strcat(testStr, " IPP");
for (off=0; off<16; ++off)
{
int len;
for (len=1; len<48-off; ++len)
{
int i;
memset(dest, 0, sizeof(dest));
ippsSet_8u(0xa5, dest+off, len);
for (i=0; i<len; ++i)
{
if (dest[off+i] != 0xa5)
{
printf("SET8U-IPP FAILED: off=%d len=%d dest[%d]=0x%02x\n",
off, len, i+off, dest[i+off]);
failed=1;
}
}
}
}
#endif /* WITH_IPP */
if (!failed) printf("All set8u tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(set8u_speed_test, BYTE, BYTE, dst=dst,
TRUE, memset(dst, constant, size),
FALSE, NULL, 0,
FALSE, NULL, 0,
TRUE, ippsSet_8u(constant, dst, size));
int test_set8u_speed(void)
{
BYTE ALIGN(dst[MAX_TEST_SIZE]);
set8u_speed_test("set8u", "aligned", NULL, NULL, 0xA5, dst,
set_sizes, NUM_SET_SIZES, MEMSET8_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}
/* ------------------------------------------------------------------------- */
int test_set32s_func(void)
{
primitives_t *prims = primitives_get();
INT32 ALIGN(dest[512]);
int failed = 0;
int off;
char testStr[256];
UINT32 pflags = primitives_get_flags(primitives_get());
testStr[0] = '\0';
#ifdef i386
/* Test SSE under various alignments */
if (pflags & PRIM_X86_SSE2_AVAILABLE)
{
strcat(testStr, " SSE2");
for (off=0; off<16; ++off) {
int len;
for (len=1; len<512-off; ++len)
{
int i;
memset(dest, 0, sizeof(dest));
sse2_set_32s(0xdeadbeef, dest+off, len);
for (i=0; i<len; ++i)
{
if (dest[off+i] != 0xdeadbeef)
{
printf("set32s-SSE FAIL: off=%d len=%d dest[%d]=0x%08x\n",
off, len, i+off, dest[i+off]);
failed=1;
}
}
}
}
}
#endif /* i386 */
#ifdef WITH_IPP
strcat(testStr, " IPP");
for (off=0; off<16; ++off) {
int len;
for (len=1; len<512-off; ++len)
{
int i;
memset(dest, 0, sizeof(dest));
ippsSet_32s(0xdeadbeef, dest+off, len);
for (i=0; i<len; ++i)
{
if (dest[off+i] != 0xdeadbeef)
{
printf("set32s-IPP FAIL: off=%d len=%d dest[%d]=0x%08x\n",
off, len, i+off, dest[i+off]);
failed=1;
}
}
}
}
#endif /* WITH_IPP */
if (!failed) printf("All set32s tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
int test_set32u_func(void)
{
primitives_t *prims = primitives_get();
UINT32 ALIGN(dest[512]);
int failed = 0;
int off;
char testStr[256];
UINT32 pflags = primitives_get_flags(primitives_get());
testStr[0] = '\0';
#ifdef i386
/* Test SSE under various alignments */
if (pflags & PRIM_X86_SSE2_AVAILABLE)
{
strcat(testStr, " SSE2");
for (off=0; off<16; ++off) {
int len;
for (len=1; len<512-off; ++len)
{
int i;
memset(dest, 0, sizeof(dest));
sse2_set_32u(0xdeadbeefU, dest+off, len);
for (i=0; i<len; ++i)
{
if (dest[off+i] != 0xdeadbeefU)
{
printf("set32u-SSE FAIL: off=%d len=%d dest[%d]=0x%08x\n",
off, len, i+off, dest[i+off]);
failed=1;
}
}
}
}
}
#endif /* i386 */
#ifdef WITH_IPP
strcat(testStr, " IPP");
for (off=0; off<16; ++off) {
int len;
for (len=1; len<512-off; ++len)
{
memset(dest, 0, sizeof(dest));
ipp_wrapper_set_32u(0xdeadbeefU, dest+off, len);
int i;
for (i=0; i<len; ++i)
{
if (dest[off+i] != 0xdeadbeefU)
{
printf("set32u-IPP FAIL: off=%d len=%d dest[%d]=0x%08x\n",
off, len, i+off, dest[i+off]);
failed=1;
}
}
}
}
#endif /* WITH_IPP */
if (!failed) printf("All set32u tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
static inline void memset32u_naive(
UINT32 val,
UINT32 *dst,
size_t count)
{
while (count--) *dst++ = val;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(set32u_speed_test, UINT32, UINT32, dst=dst,
TRUE, memset32u_naive(constant, dst, size),
TRUE, sse2_set_32u(constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ipp_wrapper_set_32u(constant, dst, size));
int test_set32u_speed(void)
{
UINT32 ALIGN(dst[MAX_TEST_SIZE+1]);
set32u_speed_test("set32u", "aligned", NULL, NULL, 0xdeadbeef, dst,
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
#if 0
/* Not really necessary; should be almost as fast. */
set32u_speed_test("set32u", "unaligned", NULL, NULL, dst+1,
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
#endif
return SUCCESS;
}
/* ------------------------------------------------------------------------- */
static inline void memset32s_naive(
INT32 val,
INT32 *dst,
size_t count)
{
while (count--) *dst++ = val;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(set32s_speed_test, INT32, INT32, dst=dst,
TRUE, memset32s_naive(constant, dst, size),
TRUE, sse2_set_32s(constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsSet_32s(constant, dst, size));
int test_set32s_speed(void)
{
INT32 ALIGN(dst[MAX_TEST_SIZE+1]);
set32s_speed_test("set32s", "aligned", NULL, NULL, 0xdeadbeef, dst,
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
#if 0
/* Not really necessary; should be almost as fast. */
set32s_speed_test("set32s", "unaligned", NULL, NULL, dst+1,
set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
#endif
return SUCCESS;
}

View File

@ -0,0 +1,173 @@
/* test_shift.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
static const int SHIFT_PRETEST_ITERATIONS = 50000;
static const float TEST_TIME = 1.0;
extern pstatus_t general_lShiftC_16s(
const INT16 *pSrc, int val, INT16 *pDst, int len);
extern pstatus_t general_rShiftC_16s(
const INT16 *pSrc, int val, INT16 *pDst, int len);
extern pstatus_t general_shiftC_16s(
const INT16 *pSrc, int val, INT16 *pDst, int len);
extern pstatus_t general_lShiftC_16u(
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
extern pstatus_t general_rShiftC_16u(
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
extern pstatus_t general_shiftC_16u(
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
extern pstatus_t sse2_lShiftC_16s(
const INT16 *pSrc, int val, INT16 *pDst, int len);
extern pstatus_t sse2_rShiftC_16s(
const INT16 *pSrc, int val, INT16 *pDst, int len);
extern pstatus_t sse2_shiftC_16s(
const INT16 *pSrc, int val, INT16 *pDst, int len);
extern pstatus_t sse2_lShiftC_16u(
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
extern pstatus_t sse2_rShiftC_16u(
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
extern pstatus_t sse2_shiftC_16u(
const UINT16 *pSrc, int val, UINT16 *pDst, int len);
#ifdef i386
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
int _name_(void) \
{ \
_type_ ALIGN(src[FUNC_TEST_SIZE+3]), \
ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \
int failed = 0; \
int i; \
UINT32 pflags = primitives_get_flags(primitives_get()); \
char testStr[256]; \
testStr[0] = '\0'; \
get_random_data(src, sizeof(src)); \
_f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \
if (pflags & PRIM_X86_SSE3_AVAILABLE) \
{ \
strcat(testStr, " SSE3"); \
/* Aligned */ \
_f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
{ \
if (d1[i] != d2[i]) \
{ \
printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
_str_, i, src[i], d1[i], d2[i]); \
++failed; \
} \
} \
/* Unaligned */ \
_f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
{ \
if (d1[i] != d2[i+1]) \
{ \
printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
_str_, i, src[i], d1[i], d2[i+1]); \
++failed; \
} \
} \
} \
if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \
return (failed > 0) ? FAILURE : SUCCESS; \
}
#else
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
int _name_(void) \
{ \
return SUCCESS; \
}
#endif /* i386 */
SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s,
sse2_lShiftC_16s)
SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u,
sse2_lShiftC_16u)
SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s,
sse2_rShiftC_16s)
SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u,
sse2_rShiftC_16u)
/* ========================================================================= */
STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst=dst,
TRUE, general_lShiftC_16s(src1, constant, dst, size),
TRUE, sse2_lShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsLShiftC_16s(src1, constant, dst, size));
STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst=dst,
TRUE, general_lShiftC_16u(src1, constant, dst, size),
TRUE, sse2_lShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsLShiftC_16u(src1, constant, dst, size));
STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst=dst,
TRUE, general_rShiftC_16s(src1, constant, dst, size),
TRUE, sse2_rShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsRShiftC_16s(src1, constant, dst, size));
STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst=dst,
TRUE, general_rShiftC_16u(src1, constant, dst, size),
TRUE, sse2_rShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
FALSE, dst=dst, 0,
TRUE, ippsRShiftC_16u(src1, constant, dst, size));
/* ------------------------------------------------------------------------- */
int test_lShift_16s_speed(void)
{
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
get_random_data(src, sizeof(src));
speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_lShift_16s("lShift_16s", "unaligned", src+1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}
/* ------------------------------------------------------------------------- */
int test_lShift_16u_speed(void)
{
UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
get_random_data(src, sizeof(src));
speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_lShift_16u("lShift_16u", "unaligned", src+1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}
/* ------------------------------------------------------------------------- */
int test_rShift_16s_speed(void)
{
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
get_random_data(src, sizeof(src));
speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_rShift_16s("rShift_16s", "unaligned", src+1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}
/* ------------------------------------------------------------------------- */
int test_rShift_16u_speed(void)
{
UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
get_random_data(src, sizeof(src));
speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_rShift_16u("rShift_16u", "unaligned", src+1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}

View File

@ -0,0 +1,91 @@
/* test_sign.c
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include "prim_test.h"
static const int SIGN_PRETEST_ITERATIONS = 100000;
static const float TEST_TIME = 1.0;
extern pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, int len);
extern pstatus_t ssse3_sign_16s(const INT16 *pSrc, INT16 *pDst, int len);
/* ------------------------------------------------------------------------- */
int test_sign16s_func(void)
{
INT16 ALIGN(src[65535]), ALIGN(d1[65535]), ALIGN(d2[65535]);
int failed = 0;
int i;
UINT32 pflags = primitives_get_flags(primitives_get());
char testStr[256];
/* Test when we can reach 16-byte alignment */
testStr[0] = '\0';
get_random_data(src, sizeof(src));
general_sign_16s(src+1, d1+1, 65535);
#ifdef i386
if (pflags & PRIM_X86_SSSE3_AVAILABLE)
{
strcat(testStr, " SSSE3");
ssse3_sign_16s(src+1, d2+1, 65535);
for (i=1; i<65535; ++i)
{
if (d1[i] != d2[i])
{
printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n",
i, src[i], d1[i], d2[i]);
++failed;
}
}
}
#endif /* i386 */
/* Test when we cannot reach 16-byte alignment */
get_random_data(src, sizeof(src));
general_sign_16s(src+1, d1+2, 65535);
#ifdef i386
if (pflags & PRIM_X86_SSSE3_AVAILABLE)
{
ssse3_sign_16s(src+1, d2+2, 65535);
for (i=2; i<65535; ++i)
{
if (d1[i] != d2[i])
{
printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n",
i, src[i-1], d1[i], d2[i]);
++failed;
}
}
}
#endif /* i386 */
if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst=dst,
TRUE, general_sign_16s(src1, dst, size),
TRUE, ssse3_sign_16s(src1, dst, size), PRIM_X86_SSSE3_AVAILABLE,
FALSE, dst=dst, 0,
FALSE, dst=dst);
int test_sign16s_speed(void)
{
INT16 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
get_random_data(src, sizeof(src));
sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
sign16s_speed_test("sign16s", "unaligned", src+1, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
}

View File

@ -64,7 +64,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS}
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-utils freerdp-codec)
MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}

View File

@ -33,7 +33,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-utils freerdp-codec)
MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}

View File

@ -59,7 +59,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-utils freerdp-codec)
MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})

View File

@ -90,7 +90,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} ${X11_LIBRARIES})
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}
MODULE freerdp
MODULES freerdp-core freerdp-common freerdp-codec freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale)
MODULES freerdp-core freerdp-common freerdp-codec freerdp-primitives freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale)
set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
MONOLITHIC ${MONOLITHIC_BUILD}