diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt index 01dce882b..e8d1a7bfb 100644 --- a/libfreerdp/primitives/CMakeLists.txt +++ b/libfreerdp/primitives/CMakeLists.txt @@ -28,6 +28,15 @@ set(${MODULE_PREFIX}_SRCS primitives.c prim_internal.h) +set(${MODULE_PREFIX}_OPT_SRCS + prim_add_opt.c + prim_andor_opt.c + prim_alphaComp_opt.c + prim_colors_opt.c + prim_set_opt.c + prim_shift_opt.c + prim_sign_opt.c) + add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}) ### IPP Variable debugging @@ -63,7 +72,9 @@ if(ANDROID) ${ANDROID_CPU_FEATURES_PATH}/cpu-features.h) endif() -set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION}) +set_property(SOURCE ${${MODULE_PREFIX}_OPT_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION}) + +set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_OPT_SRCS}) add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT" MONOLITHIC ${MONOLITHIC_BUILD} diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c index 17e504201..258bcc6ea 100644 --- a/libfreerdp/primitives/prim_add.c +++ b/libfreerdp/primitives/prim_add.c @@ -18,27 +18,16 @@ #include "config.h" #endif -#include - #include #include -#ifdef WITH_SSE2 -#include -#include -#endif /* WITH_SSE2 */ - -#ifdef WITH_IPP -#include -#endif /* WITH_IPP */ - #include "prim_internal.h" -#include "prim_templates.h" +#include "prim_add.h" /* ---------------------------------------------------------------------------- * 16-bit signed add with saturation (under and over). */ -PRIM_STATIC pstatus_t general_add_16s( +pstatus_t general_add_16s( const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, @@ -55,29 +44,14 @@ PRIM_STATIC pstatus_t general_add_16s( return PRIMITIVES_SUCCESS; } -#ifdef WITH_SSE2 -# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) -/* ------------------------------------------------------------------------- */ -SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s, - _mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1)) -# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ -#endif - /* ------------------------------------------------------------------------- */ void primitives_init_add( const primitives_hints_t *hints, primitives_t *prims) { prims->add_16s = general_add_16s; -#ifdef WITH_IPP - prims->add_16s = (__add_16s_t) ippsAdd_16s; -#elif defined(WITH_SSE2) - if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) - && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */ - { - prims->add_16s = sse3_add_16s; - } -#endif + + primitives_init_add_opt(hints, prims); } /* ------------------------------------------------------------------------- */ diff --git a/libfreerdp/primitives/prim_add.h b/libfreerdp/primitives/prim_add.h new file mode 100644 index 000000000..4ad460279 --- /dev/null +++ b/libfreerdp/primitives/prim_add.h @@ -0,0 +1,30 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Add operations. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_ADD_H_INCLUDED__ +#define __PRIM_ADD_H_INCLUDED__ + +pstatus_t general_add_16s(const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, INT32 len); + +void primitives_init_add_opt(const primitives_hints_t *hints, primitives_t *prims); + +#endif /* !__PRIM_ADD_H_INCLUDED__ */ + diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/prim_add_opt.c new file mode 100644 index 000000000..2de0b8fc6 --- /dev/null +++ b/libfreerdp/primitives/prim_add_opt.c @@ -0,0 +1,62 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized add operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#ifdef WITH_SSE2 +#include +#include +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_templates.h" +#include "prim_add.h" + + +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s, + _mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1)) +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_add_opt( + const primitives_hints_t *hints, + primitives_t *prims) +{ +#ifdef WITH_IPP + prims->add_16s = (__add_16s_t) ippsAdd_16s; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */ + { + prims->add_16s = sse3_add_16s; + } +#endif +} + + diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c index 7f1a19093..24f916770 100644 --- a/libfreerdp/primitives/prim_alphaComp.c +++ b/libfreerdp/primitives/prim_alphaComp.c @@ -24,21 +24,11 @@ #include "config.h" #endif -#include - #include #include #include "prim_internal.h" - -#ifdef WITH_SSE2 -#include -#include -#endif /* WITH_SSE2 */ - -#ifdef WITH_IPP -#include -#endif /* WITH_IPP */ +#include "prim_alphaComp.h" #define ALPHA(_k_) (((_k_) & 0xFF000000U) >> 24) #define RED(_k_) (((_k_) & 0x00FF0000U) >> 16) @@ -46,7 +36,7 @@ #define BLU(_k_) (((_k_) & 0x000000FFU)) /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_alphaComp_argb( +pstatus_t general_alphaComp_argb( const BYTE *pSrc1, INT32 src1Step, const BYTE *pSrc2, INT32 src2Step, BYTE *pDst, INT32 dstStep, @@ -111,188 +101,12 @@ PRIM_STATIC pstatus_t general_alphaComp_argb( return PRIMITIVES_SUCCESS; } -/* ------------------------------------------------------------------------- */ -#ifdef WITH_SSE2 -#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) - -PRIM_STATIC pstatus_t sse2_alphaComp_argb( - const BYTE *pSrc1, INT32 src1Step, - const BYTE *pSrc2, INT32 src2Step, - BYTE *pDst, INT32 dstStep, - INT32 width, INT32 height) -{ - const UINT32 *sptr1 = (const UINT32 *) pSrc1; - const UINT32 *sptr2 = (const UINT32 *) pSrc2; - UINT32 *dptr; - int linebytes, src1Jump, src2Jump, dstJump, y; - __m128i xmm0, xmm1; - - if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS; - - if (width < 4) /* pointless if too small */ - { - return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, - pDst, dstStep, width, height); - } - dptr = (UINT32 *) pDst; - linebytes = width * sizeof(UINT32); - src1Jump = (src1Step - linebytes) / sizeof(UINT32); - src2Jump = (src2Step - linebytes) / sizeof(UINT32); - dstJump = (dstStep - linebytes) / sizeof(UINT32); - - xmm0 = _mm_set1_epi32(0); - xmm1 = _mm_set1_epi16(1); - - for (y=0; y> 2; - pixels -= count << 2; - while (count--) - { - __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ - xmm2 = LOAD_SI128(sptr1); sptr1 += 4; - /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ - xmm3 = LOAD_SI128(sptr2); sptr2 += 4; - /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ - xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); - /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ - xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); - /* subtract */ - xmm6 = _mm_subs_epi16(xmm4, xmm5); - /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ - xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); - /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ - xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); - /* Add one to alphas */ - xmm4 = _mm_adds_epi16(xmm4, xmm1); - /* Multiply and take low word */ - xmm4 = _mm_mullo_epi16(xmm4, xmm6); - /* Shift 8 right */ - xmm4 = _mm_srai_epi16(xmm4, 8); - /* Add xmm5 */ - xmm4 = _mm_adds_epi16(xmm4, xmm5); - /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ - - /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ - xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); - /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ - xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); - /* subtract */ - xmm7 = _mm_subs_epi16(xmm5, xmm6); - /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ - xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); - /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ - xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); - /* Add one to alphas */ - xmm5 = _mm_adds_epi16(xmm5, xmm1); - /* Multiply and take low word */ - xmm5 = _mm_mullo_epi16(xmm5, xmm7); - /* Shift 8 right */ - xmm5 = _mm_srai_epi16(xmm5, 8); - /* Add xmm6 */ - xmm5 = _mm_adds_epi16(xmm5, xmm6); - /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */ - - /* Must mask off remainders or pack gets confused */ - xmm3 = _mm_set1_epi16(0x00ffU); - xmm4 = _mm_and_si128(xmm4, xmm3); - xmm5 = _mm_and_si128(xmm5, xmm3); - - /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ - xmm5 = _mm_packus_epi16(xmm5, xmm4); - _mm_store_si128((__m128i *) dptr, xmm5); dptr += 4; - } - - /* Finish off the remainder. */ - if (pixels) - { - general_alphaComp_argb((const BYTE *) sptr1, src1Step, - (const BYTE *) sptr2, src2Step, - (BYTE *) dptr, dstStep, pixels, 1); - sptr1 += pixels; - sptr2 += pixels; - dptr += pixels; - } - - /* Jump to next row. */ - sptr1 += src1Jump; - sptr2 += src2Jump; - dptr += dstJump; - } - - return PRIMITIVES_SUCCESS; -} -#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ -#endif - -#ifdef WITH_IPP -/* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t ipp_alphaComp_argb( - const BYTE *pSrc1, INT32 src1Step, - const BYTE *pSrc2, INT32 src2Step, - BYTE *pDst, INT32 dstStep, - INT32 width, INT32 height) -{ - IppiSize sz; - sz.width = width; - sz.height = height; - return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, - pDst, dstStep, sz, ippAlphaOver); -} -#endif - /* ------------------------------------------------------------------------- */ void primitives_init_alphaComp(const primitives_hints_t* hints, primitives_t* prims) { prims->alphaComp_argb = general_alphaComp_argb; -#ifdef WITH_IPP - prims->alphaComp_argb = ipp_alphaComp_argb; -#elif defined(WITH_SSE2) - if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) - && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */ - { - prims->alphaComp_argb = sse2_alphaComp_argb; - } -#endif + + primitives_init_alphaComp_opt(hints, prims); } /* ------------------------------------------------------------------------- */ @@ -300,3 +114,4 @@ void primitives_deinit_alphaComp(primitives_t *prims) { /* Nothing to do. */ } + diff --git a/libfreerdp/primitives/prim_alphaComp.h b/libfreerdp/primitives/prim_alphaComp.h new file mode 100644 index 000000000..50591162d --- /dev/null +++ b/libfreerdp/primitives/prim_alphaComp.h @@ -0,0 +1,30 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Alpha blending routines. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_ALPHACOMP_H_INCLUDED__ +#define __PRIM_ALPHACOMP_H_INCLUDED__ + +pstatus_t general_alphaComp_argb(const BYTE *pSrc1, INT32 src1Step, const BYTE *pSrc2, INT32 src2Step, BYTE *pDst, INT32 dstStep, INT32 width, INT32 height); + +void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims); + +#endif /* !__PRIM_ALPHACOMP_H_INCLUDED__ */ + diff --git a/libfreerdp/primitives/prim_alphaComp_opt.c b/libfreerdp/primitives/prim_alphaComp_opt.c new file mode 100644 index 000000000..5550fcbc1 --- /dev/null +++ b/libfreerdp/primitives/prim_alphaComp_opt.c @@ -0,0 +1,225 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized alpha blending routines. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + * Note: this code assumes the second operand is fully opaque, + * e.g. + * newval = alpha1*val1 + (1-alpha1)*val2 + * rather than + * newval = alpha1*val1 + (1-alpha1)*alpha2*val2 + * The IPP gives other options. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#ifdef WITH_SSE2 +#include +#include +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_alphaComp.h" + + +/* ------------------------------------------------------------------------- */ +#ifdef WITH_SSE2 +#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) + +pstatus_t sse2_alphaComp_argb( + const BYTE *pSrc1, INT32 src1Step, + const BYTE *pSrc2, INT32 src2Step, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + const UINT32 *sptr1 = (const UINT32 *) pSrc1; + const UINT32 *sptr2 = (const UINT32 *) pSrc2; + UINT32 *dptr; + int linebytes, src1Jump, src2Jump, dstJump, y; + __m128i xmm0, xmm1; + + if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS; + + if (width < 4) /* pointless if too small */ + { + return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, + pDst, dstStep, width, height); + } + dptr = (UINT32 *) pDst; + linebytes = width * sizeof(UINT32); + src1Jump = (src1Step - linebytes) / sizeof(UINT32); + src2Jump = (src2Step - linebytes) / sizeof(UINT32); + dstJump = (dstStep - linebytes) / sizeof(UINT32); + + xmm0 = _mm_set1_epi32(0); + xmm1 = _mm_set1_epi16(1); + + for (y=0; y> 2; + pixels -= count << 2; + while (count--) + { + __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ + xmm2 = LOAD_SI128(sptr1); sptr1 += 4; + /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ + xmm3 = LOAD_SI128(sptr2); sptr2 += 4; + /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ + xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); + /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ + xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); + /* subtract */ + xmm6 = _mm_subs_epi16(xmm4, xmm5); + /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ + xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); + /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ + xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); + /* Add one to alphas */ + xmm4 = _mm_adds_epi16(xmm4, xmm1); + /* Multiply and take low word */ + xmm4 = _mm_mullo_epi16(xmm4, xmm6); + /* Shift 8 right */ + xmm4 = _mm_srai_epi16(xmm4, 8); + /* Add xmm5 */ + xmm4 = _mm_adds_epi16(xmm4, xmm5); + /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ + + /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ + xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); + /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ + xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); + /* subtract */ + xmm7 = _mm_subs_epi16(xmm5, xmm6); + /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ + xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); + /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ + xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); + /* Add one to alphas */ + xmm5 = _mm_adds_epi16(xmm5, xmm1); + /* Multiply and take low word */ + xmm5 = _mm_mullo_epi16(xmm5, xmm7); + /* Shift 8 right */ + xmm5 = _mm_srai_epi16(xmm5, 8); + /* Add xmm6 */ + xmm5 = _mm_adds_epi16(xmm5, xmm6); + /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */ + + /* Must mask off remainders or pack gets confused */ + xmm3 = _mm_set1_epi16(0x00ffU); + xmm4 = _mm_and_si128(xmm4, xmm3); + xmm5 = _mm_and_si128(xmm5, xmm3); + + /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ + xmm5 = _mm_packus_epi16(xmm5, xmm4); + _mm_store_si128((__m128i *) dptr, xmm5); dptr += 4; + } + + /* Finish off the remainder. */ + if (pixels) + { + general_alphaComp_argb((const BYTE *) sptr1, src1Step, + (const BYTE *) sptr2, src2Step, + (BYTE *) dptr, dstStep, pixels, 1); + sptr1 += pixels; + sptr2 += pixels; + dptr += pixels; + } + + /* Jump to next row. */ + sptr1 += src1Jump; + sptr2 += src2Jump; + dptr += dstJump; + } + + return PRIMITIVES_SUCCESS; +} +#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +pstatus_t ipp_alphaComp_argb( + const BYTE *pSrc1, INT32 src1Step, + const BYTE *pSrc2, INT32 src2Step, + BYTE *pDst, INT32 dstStep, + INT32 width, INT32 height) +{ + IppiSize sz; + sz.width = width; + sz.height = height; + return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, + pDst, dstStep, sz, ippAlphaOver); +} +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims) +{ +#ifdef WITH_IPP + prims->alphaComp_argb = ipp_alphaComp_argb; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */ + { + prims->alphaComp_argb = sse2_alphaComp_argb; + } +#endif +} + diff --git a/libfreerdp/primitives/prim_andor.c b/libfreerdp/primitives/prim_andor.c index 358d6c823..0b8092ff2 100644 --- a/libfreerdp/primitives/prim_andor.c +++ b/libfreerdp/primitives/prim_andor.c @@ -17,27 +17,16 @@ #include "config.h" #endif -#include - #include #include -#ifdef WITH_SSE2 -#include -#include -#endif /* WITH_SSE2 */ - -#ifdef WITH_IPP -#include -#endif /* WITH_IPP */ - #include "prim_internal.h" -#include "prim_templates.h" +#include "prim_andor.h" /* ---------------------------------------------------------------------------- * 32-bit AND with a constant. */ -PRIM_STATIC pstatus_t general_andC_32u( +pstatus_t general_andC_32u( const UINT32 *pSrc, UINT32 val, UINT32 *pDst, @@ -55,7 +44,7 @@ PRIM_STATIC pstatus_t general_andC_32u( /* ---------------------------------------------------------------------------- * 32-bit OR with a constant. */ -PRIM_STATIC pstatus_t general_orC_32u( +pstatus_t general_orC_32u( const UINT32 *pSrc, UINT32 val, UINT32 *pDst, @@ -70,16 +59,6 @@ PRIM_STATIC pstatus_t general_orC_32u( return PRIMITIVES_SUCCESS; } -#ifdef WITH_SSE2 -# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) -/* ------------------------------------------------------------------------- */ -SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u, - _mm_and_si128, *dptr++ = *sptr++ & val) -SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u, - _mm_or_si128, *dptr++ = *sptr++ | val) -# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ -#endif - /* ------------------------------------------------------------------------- */ void primitives_init_andor( const primitives_hints_t *hints, @@ -89,17 +68,7 @@ void primitives_init_andor( prims->andC_32u = general_andC_32u; prims->orC_32u = general_orC_32u; -#if defined(WITH_IPP) - prims->andC_32u = (__andC_32u_t) ippsAndC_32u; - prims->orC_32u = (__orC_32u_t) ippsOrC_32u; -#elif defined(WITH_SSE2) - if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) - && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) - { - prims->andC_32u = sse3_andC_32u; - prims->orC_32u = sse3_orC_32u; - } -#endif + primitives_init_andor_opt(hints, prims); } /* ------------------------------------------------------------------------- */ @@ -108,3 +77,4 @@ void primitives_deinit_andor( { /* Nothing to do. */ } + diff --git a/libfreerdp/primitives/prim_andor.h b/libfreerdp/primitives/prim_andor.h new file mode 100644 index 000000000..6a2e7ac46 --- /dev/null +++ b/libfreerdp/primitives/prim_andor.h @@ -0,0 +1,31 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Logical operations. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_ANDOR_H_INCLUDED__ +#define __PRIM_ANDOR_H_INCLUDED__ + +pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len); +pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len); + +void primitives_init_andor_opt(const primitives_hints_t *hints, primitives_t *prims); + +#endif /* !__PRIM_ANDOR_H_INCLUDED__ */ + diff --git a/libfreerdp/primitives/prim_andor_opt.c b/libfreerdp/primitives/prim_andor_opt.c new file mode 100644 index 000000000..1a7ebd2a0 --- /dev/null +++ b/libfreerdp/primitives/prim_andor_opt.c @@ -0,0 +1,61 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized Logical operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#ifdef WITH_SSE2 +#include +#include +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_templates.h" +#include "prim_andor.h" + +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u, + _mm_and_si128, *dptr++ = *sptr++ & val) +SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u, + _mm_or_si128, *dptr++ = *sptr++ | val) +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_andor_opt(const primitives_hints_t *hints, primitives_t *prims) +{ +#if defined(WITH_IPP) + prims->andC_32u = (__andC_32u_t) ippsAndC_32u; + prims->orC_32u = (__orC_32u_t) ippsOrC_32u; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) + { + prims->andC_32u = sse3_andC_32u; + prims->orC_32u = sse3_orC_32u; + } +#endif +} + diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c index 865f4767c..179e569b7 100644 --- a/libfreerdp/primitives/prim_colors.c +++ b/libfreerdp/primitives/prim_colors.c @@ -21,16 +21,11 @@ #include "config.h" #endif -#include #include #include -#ifdef WITH_SSE2 -#include -#elif defined(WITH_NEON) -#include -#endif /* WITH_SSE2 else WITH_NEON */ + #include "prim_internal.h" -#include "prim_templates.h" +#include "prim_colors.h" #ifndef MINMAX #define MINMAX(_v_, _l_, _h_) \ @@ -38,7 +33,7 @@ #endif /* !MINMAX */ /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3( +pstatus_t general_yCbCrToRGB_16s16s_P3P3( const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi) /* region of interest */ @@ -119,7 +114,7 @@ PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3( +pstatus_t general_RGBToYCbCr_16s16s_P3P3( const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi) /* region of interest */ @@ -187,7 +182,7 @@ PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R( +pstatus_t general_RGBToRGB_16s8u_P3AC4R( const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */ int srcStep, /* bytes between rows in source data */ BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */ @@ -219,514 +214,6 @@ PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R( return PRIMITIVES_SUCCESS; } - -#ifdef WITH_SSE2 - -#ifdef __GNUC__ -# define GNU_INLINE \ - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -#else -# define GNU_INLINE -#endif - -#define CACHE_LINE_BYTES 64 - -#define _mm_between_epi16(_val, _min, _max) \ - do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0) - -#ifdef DO_PREFETCH -/*---------------------------------------------------------------------------*/ -static inline void GNU_INLINE _mm_prefetch_buffer( - char * buffer, - int num_bytes) -{ - __m128i * buf = (__m128i*) buffer; - unsigned int i; - for (i = 0; i < (num_bytes / sizeof(__m128i)); - i+=(CACHE_LINE_BYTES / sizeof(__m128i))) - { - _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA); - } -} -#endif /* DO_PREFETCH */ - -/*---------------------------------------------------------------------------*/ -PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3( - const INT16 *pSrc[3], - int srcStep, - INT16 *pDst[3], - int dstStep, - const prim_size_t *roi) /* region of interest */ -{ - __m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096; - __m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf; - int srcbump, dstbump, yp, imax; - - if (((ULONG_PTR) (pSrc[0]) & 0x0f) - || ((ULONG_PTR) (pSrc[1]) & 0x0f) - || ((ULONG_PTR) (pSrc[2]) & 0x0f) - || ((ULONG_PTR) (pDst[0]) & 0x0f) - || ((ULONG_PTR) (pDst[1]) & 0x0f) - || ((ULONG_PTR) (pDst[2]) & 0x0f) - || (roi->width & 0x07) - || (srcStep & 127) - || (dstStep & 127)) - { - /* We can't maintain 16-byte alignment. */ - return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, - pDst, dstStep, roi); - } - - zero = _mm_setzero_si128(); - max = _mm_set1_epi16(255); - - y_buf = (__m128i*) (pSrc[0]); - cb_buf = (__m128i*) (pSrc[1]); - cr_buf = (__m128i*) (pSrc[2]); - r_buf = (__m128i*) (pDst[0]); - g_buf = (__m128i*) (pDst[1]); - b_buf = (__m128i*) (pDst[2]); - - r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ - g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ - g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ - b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ - c4096 = _mm_set1_epi16(4096); - srcbump = srcStep / sizeof(__m128i); - dstbump = dstStep / sizeof(__m128i); - -#ifdef DO_PREFETCH - /* Prefetch Y's, Cb's, and Cr's. */ - for (yp=0; ypheight; yp++) - { - int i; - for (i=0; iwidth * sizeof(INT16) / sizeof(__m128i); - i += (CACHE_LINE_BYTES / sizeof(__m128i))) - { - _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA); - } - y_buf += srcbump; - cb_buf += srcbump; - cr_buf += srcbump; - } - y_buf = (__m128i*) (pSrc[0]); - cb_buf = (__m128i*) (pSrc[1]); - cr_buf = (__m128i*) (pSrc[2]); -#endif /* DO_PREFETCH */ - - imax = roi->width * sizeof(INT16) / sizeof(__m128i); - for (yp=0; ypheight; ++yp) - { - int i; - for (i=0; i>5) + 128 + (cr*1.403)>>5 // our base formula - * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above - * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification - * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 - */ - - /* y = (y_r_buf[i] + 4096) >> 2 */ - __m128i y, cb, cr, r, g, b; - y = _mm_load_si128(y_buf + i); - y = _mm_add_epi16(y, c4096); - y = _mm_srai_epi16(y, 2); - /* cb = cb_g_buf[i]; */ - cb = _mm_load_si128(cb_buf + i); - /* cr = cr_b_buf[i]; */ - cr = _mm_load_si128(cr_buf + i); - - /* (y + HIWORD(cr*22986)) >> 3 */ - r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); - r = _mm_srai_epi16(r, 3); - - /* r_buf[i] = MINMAX(r, 0, 255); */ - _mm_between_epi16(r, zero, max); - _mm_store_si128(r_buf + i, r); - - /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ - g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); - g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); - g = _mm_srai_epi16(g, 3); - - /* g_buf[i] = MINMAX(g, 0, 255); */ - _mm_between_epi16(g, zero, max); - _mm_store_si128(g_buf + i, g); - - /* (y + HIWORD(cb*28999)) >> 3 */ - b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); - b = _mm_srai_epi16(b, 3); - /* b_buf[i] = MINMAX(b, 0, 255); */ - _mm_between_epi16(b, zero, max); - _mm_store_si128(b_buf + i, b); - } - y_buf += srcbump; - cb_buf += srcbump; - cr_buf += srcbump; - r_buf += dstbump; - g_buf += dstbump; - b_buf += dstbump; - } - - return PRIMITIVES_SUCCESS; -} - -/*---------------------------------------------------------------------------*/ -/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point - * numbers. See the general code above. - */ -PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3( - const INT16 *pSrc[3], - int srcStep, - INT16 *pDst[3], - int dstStep, - const prim_size_t *roi) /* region of interest */ -{ - __m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b; - __m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf; - int srcbump, dstbump, yp, imax; - - if (((ULONG_PTR) (pSrc[0]) & 0x0f) - || ((ULONG_PTR) (pSrc[1]) & 0x0f) - || ((ULONG_PTR) (pSrc[2]) & 0x0f) - || ((ULONG_PTR) (pDst[0]) & 0x0f) - || ((ULONG_PTR) (pDst[1]) & 0x0f) - || ((ULONG_PTR) (pDst[2]) & 0x0f) - || (roi->width & 0x07) - || (srcStep & 127) - || (dstStep & 127)) - { - /* We can't maintain 16-byte alignment. */ - return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, - pDst, dstStep, roi); - } - - min = _mm_set1_epi16(-128 << 5); - max = _mm_set1_epi16(127 << 5); - - r_buf = (__m128i*) (pSrc[0]); - g_buf = (__m128i*) (pSrc[1]); - b_buf = (__m128i*) (pSrc[2]); - y_buf = (__m128i*) (pDst[0]); - cb_buf = (__m128i*) (pDst[1]); - cr_buf = (__m128i*) (pDst[2]); - - y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */ - y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */ - y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */ - cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */ - cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */ - cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */ - cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */ - cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */ - cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */ - - srcbump = srcStep / sizeof(__m128i); - dstbump = dstStep / sizeof(__m128i); - -#ifdef DO_PREFETCH - /* Prefetch RGB's. */ - for (yp=0; ypheight; yp++) - { - int i; - for (i=0; iwidth * sizeof(INT16) / sizeof(__m128i); - i += (CACHE_LINE_BYTES / sizeof(__m128i))) - { - _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA); - _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA); - } - r_buf += srcbump; - g_buf += srcbump; - b_buf += srcbump; - } - r_buf = (__m128i*) (pSrc[0]); - g_buf = (__m128i*) (pSrc[1]); - b_buf = (__m128i*) (pSrc[2]); -#endif /* DO_PREFETCH */ - - imax = roi->width * sizeof(INT16) / sizeof(__m128i); - for (yp=0; ypheight; ++yp) - { - int i; - for (i=0; iwidth & 0x0f) - || (srcStep & 0x0f) - || (dstStep & 0x0f)) - { - return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi); - } - - out = (BYTE *) pDst; - srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); - dstbump = (dstStep - (roi->width * sizeof(UINT32))); - - for (y=0; yheight; ++y) - { - int width = roi->width; - do { - __m128i R0, R1, R2, R3, R4; - /* The comments below pretend these are 8-byte registers - * rather than 16-byte, for readability. - */ - R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */ - R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */ - PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */ - R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */ - R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */ - PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */ - R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ - PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */ - PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */ - R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */ - R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */ - PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */ - R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */ - R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ - PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */ - PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */ - R0 = R4; /* R0 = R4 */ - PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */ - PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */ - R2 = R3; /* R2 = R3 */ - PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */ - PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */ - STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */ - STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */ - STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */ - STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */ - } while (width -= 16); - /* Jump to next row. */ - r += srcbump; - g += srcbump; - b += srcbump; - out += dstbump; - } - return PRIMITIVES_SUCCESS; -} -#endif /* WITH_SSE2 */ - -/*---------------------------------------------------------------------------*/ -#ifdef WITH_NEON -PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3( - const INT16 *pSrc[3], - int srcStep, - INT16 *pDst[3], - int dstStep, - const prim_size_t *roi) /* region of interest */ -{ - /* TODO: If necessary, check alignments and call the general version. */ - - int16x8_t zero = vdupq_n_s16(0); - int16x8_t max = vdupq_n_s16(255); - - int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14 - int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14 - int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14 - int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14 - int16x8_t c4096 = vdupq_n_s16(4096); - - int16x8_t* y_buf = (int16x8_t*) pSrc[0]; - int16x8_t* cb_buf = (int16x8_t*) pSrc[1]; - int16x8_t* cr_buf = (int16x8_t*) pSrc[2]; - int16x8_t* r_buf = (int16x8_t*) pDst[0]; - int16x8_t* g_buf = (int16x8_t*) pDst[1]; - int16x8_t* b_buf = (int16x8_t*) pDst[2]; - - int srcbump = srcStep / sizeof(int16x8_t); - int dstbump = dstStep / sizeof(int16x8_t); - int yp; - - int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t); - for (yp=0; ypheight; ++yp) - { - int i; - for (i=0; i>5) + 128 + (cr*1.403)>>5 // our base formula - r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above - r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification - r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 - */ - - /* y = (y_buf[i] + 4096) >> 2 */ - int16x8_t y = vld1q_s16((INT16*) &y_buf[i]); - y = vaddq_s16(y, c4096); - y = vshrq_n_s16(y, 2); - /* cb = cb_buf[i]; */ - int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]); - /* cr = cr_buf[i]; */ - int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]); - - /* (y + HIWORD(cr*22986)) >> 3 */ - int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1)); - r = vshrq_n_s16(r, 3); - /* r_buf[i] = MINMAX(r, 0, 255); */ - r = vminq_s16(vmaxq_s16(r, zero), max); - vst1q_s16((INT16*)&r_buf[i], r); - - /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ - int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1)); - g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1)); - g = vshrq_n_s16(g, 3); - /* g_buf[i] = MINMAX(g, 0, 255); */ - g = vminq_s16(vmaxq_s16(g, zero), max); - vst1q_s16((INT16*)&g_buf[i], g); - - /* (y + HIWORD(cb*28999)) >> 3 */ - int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1)); - b = vshrq_n_s16(b, 3); - /* b_buf[i] = MINMAX(b, 0, 255); */ - b = vminq_s16(vmaxq_s16(b, zero), max); - vst1q_s16((INT16*)&b_buf[i], b); - } - - y_buf += srcbump; - cb_buf += srcbump; - cr_buf += srcbump; - r_buf += dstbump; - g_buf += dstbump; - b_buf += dstbump; - } - return PRIMITIVES_SUCCESS; -} -#endif /* WITH_NEON */ - - -/* I don't see a direct IPP version of this, since the input is INT16 - * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_. - * But that would likely be slower. - */ - /* ------------------------------------------------------------------------- */ void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims) { @@ -734,19 +221,7 @@ void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3; prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3; -#if defined(WITH_SSE2) - if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) - { - prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R; - prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3; - prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3; - } -#elif defined(WITH_NEON) - if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE) - { - prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3; - } -#endif /* WITH_SSE2 */ + primitives_init_colors_opt(hints, prims); } /* ------------------------------------------------------------------------- */ @@ -754,3 +229,4 @@ void primitives_deinit_colors(primitives_t* prims) { /* Nothing to do. */ } + diff --git a/libfreerdp/primitives/prim_colors.h b/libfreerdp/primitives/prim_colors.h new file mode 100644 index 000000000..70f478547 --- /dev/null +++ b/libfreerdp/primitives/prim_colors.h @@ -0,0 +1,32 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Color conversion operations. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_COLORS_H_INCLUDED__ +#define __PRIM_COLORS_H_INCLUDED__ + +pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi); +pstatus_t general_RGBToYCbCr_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi); +pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi); + +void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims); + +#endif /* !__PRIM_COLORS_H_INCLUDED__ */ + diff --git a/libfreerdp/primitives/prim_colors_opt.c b/libfreerdp/primitives/prim_colors_opt.c new file mode 100644 index 000000000..cfc87414e --- /dev/null +++ b/libfreerdp/primitives/prim_colors_opt.c @@ -0,0 +1,561 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized Color conversion operations. + * vi:ts=4 sw=4: + * + * Copyright 2011 Stephen Erisman + * Copyright 2011 Norbert Federa + * Copyright 2011 Martin Fleisz + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#ifdef WITH_SSE2 +#include +#elif defined(WITH_NEON) +#include +#endif /* WITH_SSE2 else WITH_NEON */ + +#include "prim_internal.h" +#include "prim_templates.h" +#include "prim_colors.h" + +#ifdef WITH_SSE2 + +#ifdef __GNUC__ +# define GNU_INLINE \ + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +#else +# define GNU_INLINE +#endif + +#define CACHE_LINE_BYTES 64 + +#define _mm_between_epi16(_val, _min, _max) \ + do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0) + +#ifdef DO_PREFETCH +/*---------------------------------------------------------------------------*/ +static inline void GNU_INLINE _mm_prefetch_buffer( + char * buffer, + int num_bytes) +{ + __m128i * buf = (__m128i*) buffer; + unsigned int i; + for (i = 0; i < (num_bytes / sizeof(__m128i)); + i+=(CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA); + } +} +#endif /* DO_PREFETCH */ + +/*---------------------------------------------------------------------------*/ +pstatus_t sse2_yCbCrToRGB_16s16s_P3P3( + const INT16 *pSrc[3], + int srcStep, + INT16 *pDst[3], + int dstStep, + const prim_size_t *roi) /* region of interest */ +{ + __m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096; + __m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf; + int srcbump, dstbump, yp, imax; + + if (((ULONG_PTR) (pSrc[0]) & 0x0f) + || ((ULONG_PTR) (pSrc[1]) & 0x0f) + || ((ULONG_PTR) (pSrc[2]) & 0x0f) + || ((ULONG_PTR) (pDst[0]) & 0x0f) + || ((ULONG_PTR) (pDst[1]) & 0x0f) + || ((ULONG_PTR) (pDst[2]) & 0x0f) + || (roi->width & 0x07) + || (srcStep & 127) + || (dstStep & 127)) + { + /* We can't maintain 16-byte alignment. */ + return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, + pDst, dstStep, roi); + } + + zero = _mm_setzero_si128(); + max = _mm_set1_epi16(255); + + y_buf = (__m128i*) (pSrc[0]); + cb_buf = (__m128i*) (pSrc[1]); + cr_buf = (__m128i*) (pSrc[2]); + r_buf = (__m128i*) (pDst[0]); + g_buf = (__m128i*) (pDst[1]); + b_buf = (__m128i*) (pDst[2]); + + r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */ + g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */ + g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */ + b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */ + c4096 = _mm_set1_epi16(4096); + srcbump = srcStep / sizeof(__m128i); + dstbump = dstStep / sizeof(__m128i); + +#ifdef DO_PREFETCH + /* Prefetch Y's, Cb's, and Cr's. */ + for (yp=0; ypheight; yp++) + { + int i; + for (i=0; iwidth * sizeof(INT16) / sizeof(__m128i); + i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA); + } + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + } + y_buf = (__m128i*) (pSrc[0]); + cb_buf = (__m128i*) (pSrc[1]); + cr_buf = (__m128i*) (pSrc[2]); +#endif /* DO_PREFETCH */ + + imax = roi->width * sizeof(INT16) / sizeof(__m128i); + for (yp=0; ypheight; ++yp) + { + int i; + for (i=0; i>5) + 128 + (cr*1.403)>>5 // our base formula + * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + + /* y = (y_r_buf[i] + 4096) >> 2 */ + __m128i y, cb, cr, r, g, b; + y = _mm_load_si128(y_buf + i); + y = _mm_add_epi16(y, c4096); + y = _mm_srai_epi16(y, 2); + /* cb = cb_g_buf[i]; */ + cb = _mm_load_si128(cb_buf + i); + /* cr = cr_b_buf[i]; */ + cr = _mm_load_si128(cr_buf + i); + + /* (y + HIWORD(cr*22986)) >> 3 */ + r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); + r = _mm_srai_epi16(r, 3); + + /* r_buf[i] = MINMAX(r, 0, 255); */ + _mm_between_epi16(r, zero, max); + _mm_store_si128(r_buf + i, r); + + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); + g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); + g = _mm_srai_epi16(g, 3); + + /* g_buf[i] = MINMAX(g, 0, 255); */ + _mm_between_epi16(g, zero, max); + _mm_store_si128(g_buf + i, g); + + /* (y + HIWORD(cb*28999)) >> 3 */ + b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); + b = _mm_srai_epi16(b, 3); + /* b_buf[i] = MINMAX(b, 0, 255); */ + _mm_between_epi16(b, zero, max); + _mm_store_si128(b_buf + i, b); + } + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } + + return PRIMITIVES_SUCCESS; +} + +/*---------------------------------------------------------------------------*/ +/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point + * numbers. See the general code above. + */ +pstatus_t sse2_RGBToYCbCr_16s16s_P3P3( + const INT16 *pSrc[3], + int srcStep, + INT16 *pDst[3], + int dstStep, + const prim_size_t *roi) /* region of interest */ +{ + __m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b; + __m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf; + int srcbump, dstbump, yp, imax; + + if (((ULONG_PTR) (pSrc[0]) & 0x0f) + || ((ULONG_PTR) (pSrc[1]) & 0x0f) + || ((ULONG_PTR) (pSrc[2]) & 0x0f) + || ((ULONG_PTR) (pDst[0]) & 0x0f) + || ((ULONG_PTR) (pDst[1]) & 0x0f) + || ((ULONG_PTR) (pDst[2]) & 0x0f) + || (roi->width & 0x07) + || (srcStep & 127) + || (dstStep & 127)) + { + /* We can't maintain 16-byte alignment. */ + return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, + pDst, dstStep, roi); + } + + min = _mm_set1_epi16(-128 << 5); + max = _mm_set1_epi16(127 << 5); + + r_buf = (__m128i*) (pSrc[0]); + g_buf = (__m128i*) (pSrc[1]); + b_buf = (__m128i*) (pSrc[2]); + y_buf = (__m128i*) (pDst[0]); + cb_buf = (__m128i*) (pDst[1]); + cr_buf = (__m128i*) (pDst[2]); + + y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */ + y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */ + y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */ + cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */ + cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */ + cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */ + cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */ + cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */ + cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */ + + srcbump = srcStep / sizeof(__m128i); + dstbump = dstStep / sizeof(__m128i); + +#ifdef DO_PREFETCH + /* Prefetch RGB's. */ + for (yp=0; ypheight; yp++) + { + int i; + for (i=0; iwidth * sizeof(INT16) / sizeof(__m128i); + i += (CACHE_LINE_BYTES / sizeof(__m128i))) + { + _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA); + _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA); + } + r_buf += srcbump; + g_buf += srcbump; + b_buf += srcbump; + } + r_buf = (__m128i*) (pSrc[0]); + g_buf = (__m128i*) (pSrc[1]); + b_buf = (__m128i*) (pSrc[2]); +#endif /* DO_PREFETCH */ + + imax = roi->width * sizeof(INT16) / sizeof(__m128i); + for (yp=0; ypheight; ++yp) + { + int i; + for (i=0; iwidth & 0x0f) + || (srcStep & 0x0f) + || (dstStep & 0x0f)) + { + return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi); + } + + out = (BYTE *) pDst; + srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); + dstbump = (dstStep - (roi->width * sizeof(UINT32))); + + for (y=0; yheight; ++y) + { + int width = roi->width; + do { + __m128i R0, R1, R2, R3, R4; + /* The comments below pretend these are 8-byte registers + * rather than 16-byte, for readability. + */ + R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */ + R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */ + PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */ + R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */ + R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */ + PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */ + R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ + PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */ + PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */ + R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */ + R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */ + PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */ + R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */ + R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ + PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */ + PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */ + R0 = R4; /* R0 = R4 */ + PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */ + PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */ + R2 = R3; /* R2 = R3 */ + PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */ + PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */ + STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */ + STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */ + STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */ + STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */ + } while (width -= 16); + /* Jump to next row. */ + r += srcbump; + g += srcbump; + b += srcbump; + out += dstbump; + } + return PRIMITIVES_SUCCESS; +} +#endif /* WITH_SSE2 */ + +/*---------------------------------------------------------------------------*/ +#ifdef WITH_NEON +pstatus_t neon_yCbCrToRGB_16s16s_P3P3( + const INT16 *pSrc[3], + int srcStep, + INT16 *pDst[3], + int dstStep, + const prim_size_t *roi) /* region of interest */ +{ + /* TODO: If necessary, check alignments and call the general version. */ + + int16x8_t zero = vdupq_n_s16(0); + int16x8_t max = vdupq_n_s16(255); + + int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14 + int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14 + int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14 + int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14 + int16x8_t c4096 = vdupq_n_s16(4096); + + int16x8_t* y_buf = (int16x8_t*) pSrc[0]; + int16x8_t* cb_buf = (int16x8_t*) pSrc[1]; + int16x8_t* cr_buf = (int16x8_t*) pSrc[2]; + int16x8_t* r_buf = (int16x8_t*) pDst[0]; + int16x8_t* g_buf = (int16x8_t*) pDst[1]; + int16x8_t* b_buf = (int16x8_t*) pDst[2]; + + int srcbump = srcStep / sizeof(int16x8_t); + int dstbump = dstStep / sizeof(int16x8_t); + int yp; + + int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t); + for (yp=0; ypheight; ++yp) + { + int i; + for (i=0; i>5) + 128 + (cr*1.403)>>5 // our base formula + r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + + /* y = (y_buf[i] + 4096) >> 2 */ + int16x8_t y = vld1q_s16((INT16*) &y_buf[i]); + y = vaddq_s16(y, c4096); + y = vshrq_n_s16(y, 2); + /* cb = cb_buf[i]; */ + int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]); + /* cr = cr_buf[i]; */ + int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]); + + /* (y + HIWORD(cr*22986)) >> 3 */ + int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1)); + r = vshrq_n_s16(r, 3); + /* r_buf[i] = MINMAX(r, 0, 255); */ + r = vminq_s16(vmaxq_s16(r, zero), max); + vst1q_s16((INT16*)&r_buf[i], r); + + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1)); + g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1)); + g = vshrq_n_s16(g, 3); + /* g_buf[i] = MINMAX(g, 0, 255); */ + g = vminq_s16(vmaxq_s16(g, zero), max); + vst1q_s16((INT16*)&g_buf[i], g); + + /* (y + HIWORD(cb*28999)) >> 3 */ + int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1)); + b = vshrq_n_s16(b, 3); + /* b_buf[i] = MINMAX(b, 0, 255); */ + b = vminq_s16(vmaxq_s16(b, zero), max); + vst1q_s16((INT16*)&b_buf[i], b); + } + + y_buf += srcbump; + cb_buf += srcbump; + cr_buf += srcbump; + r_buf += dstbump; + g_buf += dstbump; + b_buf += dstbump; + } + return PRIMITIVES_SUCCESS; +} +#endif /* WITH_NEON */ + + +/* I don't see a direct IPP version of this, since the input is INT16 + * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_. + * But that would likely be slower. + */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims) +{ +#if defined(WITH_SSE2) + if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + { + prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R; + prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3; + prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3; + } +#elif defined(WITH_NEON) + if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE) + { + prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3; + } +#endif /* WITH_SSE2 */ +} + diff --git a/libfreerdp/primitives/prim_copy.c b/libfreerdp/primitives/prim_copy.c index 71303b1a6..4198f2d8f 100644 --- a/libfreerdp/primitives/prim_copy.c +++ b/libfreerdp/primitives/prim_copy.c @@ -72,7 +72,7 @@ static BOOL memory_regions_overlap_2d( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_copy_8u( +pstatus_t general_copy_8u( const BYTE *pSrc, BYTE *pDst, INT32 len) @@ -94,7 +94,7 @@ PRIM_STATIC pstatus_t general_copy_8u( * The addresses are assumed to have been already offset to the upper-left * corners of the source and destination region of interest. */ -PRIM_STATIC pstatus_t general_copy_8u_AC4r( +pstatus_t general_copy_8u_AC4r( const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, INT32 width, INT32 height) diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h index 21df8cae0..001ab8562 100644 --- a/libfreerdp/primitives/prim_internal.h +++ b/libfreerdp/primitives/prim_internal.h @@ -27,15 +27,6 @@ #include -/* Normally the internal entrypoints should be static, but a benchmark - * program may want to access them directly and turn this off. - */ -#ifndef PRIM_STATIC -# define PRIM_STATIC static -#else -# undef PRIM_STATIC -# define PRIM_STATIC -#endif /* !PRIM_STATIC */ /* Use lddqu for unaligned; load for 16-byte aligned. */ #define LOAD_SI128(_ptr_) \ diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c index 5b40ce00c..9176c8722 100644 --- a/libfreerdp/primitives/prim_set.c +++ b/libfreerdp/primitives/prim_set.c @@ -19,18 +19,15 @@ #endif #include + #include #include -#ifdef WITH_SSE2 -# include -#endif /* WITH_SSE2 */ -#ifdef WITH_IPP -# include -#endif /* WITH_IPP */ + #include "prim_internal.h" +#include "prim_set.h" /* ========================================================================= */ -PRIM_STATIC pstatus_t general_set_8u( +pstatus_t general_set_8u( BYTE val, BYTE *pDst, INT32 len) @@ -40,7 +37,7 @@ PRIM_STATIC pstatus_t general_set_8u( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_zero( +pstatus_t general_zero( void *pDst, size_t len) { @@ -48,75 +45,8 @@ PRIM_STATIC pstatus_t general_zero( return PRIMITIVES_SUCCESS; } -/* ------------------------------------------------------------------------- */ -#ifdef WITH_SSE2 -# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) -PRIM_STATIC pstatus_t sse2_set_8u( - BYTE val, - BYTE *pDst, - INT32 len) -{ - BYTE byte, *dptr; - __m128i xmm0; - size_t count; - - if (len < 16) return general_set_8u(val, pDst, len); - - byte = val; - dptr = (BYTE *) pDst; - - /* Seek 16-byte alignment. */ - while ((ULONG_PTR) dptr & 0x0f) - { - *dptr++ = byte; - if (--len == 0) return PRIMITIVES_SUCCESS; - } - - xmm0 = _mm_set1_epi8(byte); - - /* Cover 256-byte chunks via SSE register stores. */ - count = len >> 8; - len -= count << 8; - /* Do 256-byte chunks using one XMM register. */ - while (count--) - { - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - } - - /* Cover 16-byte chunks via SSE register stores. */ - count = len >> 4; - len -= count << 4; - /* Do 16-byte chunks using one XMM register. */ - while (count--) - { - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; - } - - /* Do leftover bytes. */ - while (len--) *dptr++ = byte; - - return PRIMITIVES_SUCCESS; -} -# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ -#endif /* WITH_SSE2 */ - /* ========================================================================= */ -PRIM_STATIC pstatus_t general_set_32s( +pstatus_t general_set_32s( INT32 val, INT32 *pDst, INT32 len) @@ -148,7 +78,7 @@ PRIM_STATIC pstatus_t general_set_32s( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_set_32u( +pstatus_t general_set_32u( UINT32 val, UINT32 *pDst, INT32 len) @@ -179,104 +109,6 @@ PRIM_STATIC pstatus_t general_set_32u( return PRIMITIVES_SUCCESS; } -/* ------------------------------------------------------------------------- */ -#ifdef WITH_SSE2 -# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) -PRIM_STATIC pstatus_t sse2_set_32u( - UINT32 val, - UINT32 *pDst, - INT32 len) -{ - UINT32 *dptr = (UINT32 *) pDst; - __m128i xmm0; - size_t count; - - /* If really short, just do it here. */ - if (len < 32) - { - while (len--) *dptr++ = val; - return PRIMITIVES_SUCCESS; - } - - /* Assure we can reach 16-byte alignment. */ - if (((ULONG_PTR) dptr & 0x03) != 0) - { - return general_set_32u(val, pDst, len); - } - - /* Seek 16-byte alignment. */ - while ((ULONG_PTR) dptr & 0x0f) - { - *dptr++ = val; - if (--len == 0) return PRIMITIVES_SUCCESS; - } - - xmm0 = _mm_set1_epi32(val); - - /* Cover 256-byte chunks via SSE register stores. */ - count = len >> 6; - len -= count << 6; - /* Do 256-byte chunks using one XMM register. */ - while (count--) - { - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - } - - /* Cover 16-byte chunks via SSE register stores. */ - count = len >> 2; - len -= count << 2; - /* Do 16-byte chunks using one XMM register. */ - while (count--) - { - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; - } - - /* Do leftover bytes. */ - while (len--) *dptr++ = val; - - return PRIMITIVES_SUCCESS; -} - -/* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t sse2_set_32s( - INT32 val, - INT32 *pDst, - INT32 len) -{ - UINT32 uval = *((UINT32 *) &val); - return sse2_set_32u(uval, (UINT32 *) pDst, len); -} -# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ -#endif /* WITH_SSE2 */ - -#ifdef WITH_IPP -/* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t ipp_wrapper_set_32u( - UINT32 val, - UINT32 *pDst, - INT32 len) -{ - /* A little type conversion, then use the signed version. */ - INT32 sval = *((INT32 *) &val); - return ippsSet_32s(sval, (INT32 *) pDst, len); -} -#endif - /* ------------------------------------------------------------------------- */ void primitives_init_set( const primitives_hints_t *hints, @@ -288,20 +120,7 @@ void primitives_init_set( prims->set_32u = general_set_32u; prims->zero = general_zero; - /* Pick tuned versions if possible. */ -#ifdef WITH_IPP - prims->set_8u = (__set_8u_t) ippsSet_8u; - prims->set_32s = (__set_32s_t) ippsSet_32s; - prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u; - prims->zero = (__zero_t) ippsZero_8u; -#elif defined(WITH_SSE2) - if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) - { - prims->set_8u = sse2_set_8u; - prims->set_32s = sse2_set_32s; - prims->set_32u = sse2_set_32u; - } -#endif + primitives_init_set_opt(hints, prims); } /* ------------------------------------------------------------------------- */ @@ -310,3 +129,4 @@ void primitives_deinit_set( { /* Nothing to do. */ } + diff --git a/libfreerdp/primitives/prim_set.h b/libfreerdp/primitives/prim_set.h new file mode 100644 index 000000000..e4504dc2c --- /dev/null +++ b/libfreerdp/primitives/prim_set.h @@ -0,0 +1,34 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Routines to set a chunk of memory to a constant. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_SET_H_INCLUDED__ +#define __PRIM_SET_H_INCLUDED__ + +pstatus_t general_set_8u(BYTE val, BYTE *pDst, INT32 len); +pstatus_t general_zero(void *pDst, size_t len); +pstatus_t general_set_32s(INT32 val, INT32 *pDst, INT32 len); +pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, INT32 len); + + +void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims); + +#endif /* !__PRIM_SET_H_INCLUDED__ */ + diff --git a/libfreerdp/primitives/prim_set_opt.c b/libfreerdp/primitives/prim_set_opt.c new file mode 100644 index 000000000..0523434ff --- /dev/null +++ b/libfreerdp/primitives/prim_set_opt.c @@ -0,0 +1,218 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized routines to set a chunk of memory to a constant. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + * + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include + +#ifdef WITH_SSE2 +# include +#endif /* WITH_SSE2 */ +#ifdef WITH_IPP +# include +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_set.h" + +/* ========================================================================= */ +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +pstatus_t sse2_set_8u( + BYTE val, + BYTE *pDst, + INT32 len) +{ + BYTE byte, *dptr; + __m128i xmm0; + size_t count; + + if (len < 16) return general_set_8u(val, pDst, len); + + byte = val; + dptr = (BYTE *) pDst; + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR) dptr & 0x0f) + { + *dptr++ = byte; + if (--len == 0) return PRIMITIVES_SUCCESS; + } + + xmm0 = _mm_set1_epi8(byte); + + /* Cover 256-byte chunks via SSE register stores. */ + count = len >> 8; + len -= count << 8; + /* Do 256-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + } + + /* Cover 16-byte chunks via SSE register stores. */ + count = len >> 4; + len -= count << 4; + /* Do 16-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 16; + } + + /* Do leftover bytes. */ + while (len--) *dptr++ = byte; + + return PRIMITIVES_SUCCESS; +} +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif /* WITH_SSE2 */ + +/* ------------------------------------------------------------------------- */ +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +pstatus_t sse2_set_32u( + UINT32 val, + UINT32 *pDst, + INT32 len) +{ + UINT32 *dptr = (UINT32 *) pDst; + __m128i xmm0; + size_t count; + + /* If really short, just do it here. */ + if (len < 32) + { + while (len--) *dptr++ = val; + return PRIMITIVES_SUCCESS; + } + + /* Assure we can reach 16-byte alignment. */ + if (((ULONG_PTR) dptr & 0x03) != 0) + { + return general_set_32u(val, pDst, len); + } + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR) dptr & 0x0f) + { + *dptr++ = val; + if (--len == 0) return PRIMITIVES_SUCCESS; + } + + xmm0 = _mm_set1_epi32(val); + + /* Cover 256-byte chunks via SSE register stores. */ + count = len >> 6; + len -= count << 6; + /* Do 256-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + } + + /* Cover 16-byte chunks via SSE register stores. */ + count = len >> 2; + len -= count << 2; + /* Do 16-byte chunks using one XMM register. */ + while (count--) + { + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; + } + + /* Do leftover bytes. */ + while (len--) *dptr++ = val; + + return PRIMITIVES_SUCCESS; +} + +/* ------------------------------------------------------------------------- */ +pstatus_t sse2_set_32s( + INT32 val, + INT32 *pDst, + INT32 len) +{ + UINT32 uval = *((UINT32 *) &val); + return sse2_set_32u(uval, (UINT32 *) pDst, len); +} +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +/* ------------------------------------------------------------------------- */ +pstatus_t ipp_wrapper_set_32u( + UINT32 val, + UINT32 *pDst, + INT32 len) +{ + /* A little type conversion, then use the signed version. */ + INT32 sval = *((INT32 *) &val); + return ippsSet_32s(sval, (INT32 *) pDst, len); +} +#endif + +/* ------------------------------------------------------------------------- */ +void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims) +{ + /* Pick tuned versions if possible. */ +#ifdef WITH_IPP + prims->set_8u = (__set_8u_t) ippsSet_8u; + prims->set_32s = (__set_32s_t) ippsSet_32s; + prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u; + prims->zero = (__zero_t) ippsZero_8u; +#elif defined(WITH_SSE2) + if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + { + prims->set_8u = sse2_set_8u; + prims->set_32s = sse2_set_32s; + prims->set_32u = sse2_set_32u; + } +#endif +} + diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c index 331c7216e..bd26dc0a0 100644 --- a/libfreerdp/primitives/prim_shift.c +++ b/libfreerdp/primitives/prim_shift.c @@ -17,25 +17,15 @@ #include "config.h" #endif -#include - #include #include -#ifdef WITH_SSE2 -#include -#include -#endif /* WITH_SSE2 */ - -#ifdef WITH_IPP -#include -#endif /* WITH_IPP */ - #include "prim_internal.h" -#include "prim_templates.h" +#include "prim_shift.h" + /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_lShiftC_16s( +pstatus_t general_lShiftC_16s( const INT16 *pSrc, INT32 val, INT16 *pDst, @@ -47,7 +37,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16s( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_rShiftC_16s( +pstatus_t general_rShiftC_16s( const INT16 *pSrc, INT32 val, INT16 *pDst, @@ -59,7 +49,7 @@ PRIM_STATIC pstatus_t general_rShiftC_16s( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_lShiftC_16u( +pstatus_t general_lShiftC_16u( const UINT16 *pSrc, INT32 val, UINT16 *pDst, @@ -71,7 +61,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16u( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_rShiftC_16u( +pstatus_t general_rShiftC_16u( const UINT16 *pSrc, INT32 val, UINT16 *pDst, @@ -82,25 +72,8 @@ PRIM_STATIC pstatus_t general_rShiftC_16u( return PRIMITIVES_SUCCESS; } -#ifdef WITH_SSE2 -# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) /* ------------------------------------------------------------------------- */ -SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s, - _mm_slli_epi16, *dptr++ = *sptr++ << val) -/* ------------------------------------------------------------------------- */ -SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s, - _mm_srai_epi16, *dptr++ = *sptr++ >> val) -/* ------------------------------------------------------------------------- */ -SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u, - _mm_slli_epi16, *dptr++ = *sptr++ << val) -/* ------------------------------------------------------------------------- */ -SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u, - _mm_srli_epi16, *dptr++ = *sptr++ >> val) -# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ -#endif - -/* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_shiftC_16s( +pstatus_t general_shiftC_16s( const INT16 *pSrc, INT32 val, INT16 *pDst, @@ -115,7 +88,7 @@ PRIM_STATIC pstatus_t general_shiftC_16s( } /* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t general_shiftC_16u( +pstatus_t general_shiftC_16u( const UINT16 *pSrc, INT32 val, UINT16 *pDst, @@ -129,11 +102,6 @@ PRIM_STATIC pstatus_t general_shiftC_16u( else return prims->lShiftC_16u(pSrc, val, pDst, len); } -/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s - * depending on the sign of val. To avoid using the deprecated inplace - * routines, a wrapper can use the src for the dest. - */ - /* ------------------------------------------------------------------------- */ void primitives_init_shift( const primitives_hints_t *hints, @@ -144,24 +112,12 @@ void primitives_init_shift( prims->rShiftC_16s = general_rShiftC_16s; prims->lShiftC_16u = general_lShiftC_16u; prims->rShiftC_16u = general_rShiftC_16u; -#if defined(WITH_IPP) - prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s; - prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s; - prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u; - prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u; -#elif defined(WITH_SSE2) - if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) - && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) - { - prims->lShiftC_16s = sse2_lShiftC_16s; - prims->rShiftC_16s = sse2_rShiftC_16s; - prims->lShiftC_16u = sse2_lShiftC_16u; - prims->rShiftC_16u = sse2_rShiftC_16u; - } -#endif + /* Wrappers */ prims->shiftC_16s = general_shiftC_16s; prims->shiftC_16u = general_shiftC_16u; + + primitives_init_shift_opt(hints, prims); } /* ------------------------------------------------------------------------- */ diff --git a/libfreerdp/primitives/prim_shift.h b/libfreerdp/primitives/prim_shift.h new file mode 100644 index 000000000..cad054013 --- /dev/null +++ b/libfreerdp/primitives/prim_shift.h @@ -0,0 +1,35 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Shift operations. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_SHIFT_H_INCLUDED__ +#define __PRIM_SHIFT_H_INCLUDED__ + +pstatus_t general_lShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len); +pstatus_t general_rShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len); +pstatus_t general_lShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len); +pstatus_t general_rShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len); +pstatus_t general_shiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len); +pstatus_t general_shiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len); + +void primitives_init_shift_opt(const primitives_hints_t *hints, primitives_t *prims); + +#endif /* !__PRIM_SHIFT_H_INCLUDED__ */ + diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/prim_shift_opt.c new file mode 100644 index 000000000..0e57da269 --- /dev/null +++ b/libfreerdp/primitives/prim_shift_opt.c @@ -0,0 +1,79 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Shift operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#ifdef WITH_SSE2 +#include +#include +#endif /* WITH_SSE2 */ + +#ifdef WITH_IPP +#include +#endif /* WITH_IPP */ + +#include "prim_internal.h" +#include "prim_templates.h" +#include "prim_shift.h" + + +#ifdef WITH_SSE2 +# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s, + _mm_slli_epi16, *dptr++ = *sptr++ << val) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s, + _mm_srai_epi16, *dptr++ = *sptr++ >> val) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u, + _mm_slli_epi16, *dptr++ = *sptr++ << val) +/* ------------------------------------------------------------------------- */ +SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u, + _mm_srli_epi16, *dptr++ = *sptr++ >> val) +# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ +#endif + + +/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s + * depending on the sign of val. To avoid using the deprecated inplace + * routines, a wrapper can use the src for the dest. + */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_shift_opt(const primitives_hints_t *hints, primitives_t *prims) +{ +#if defined(WITH_IPP) + prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s; + prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s; + prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u; + prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u; +#elif defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) + { + prims->lShiftC_16s = sse2_lShiftC_16s; + prims->rShiftC_16s = sse2_rShiftC_16s; + prims->lShiftC_16u = sse2_lShiftC_16u; + prims->rShiftC_16u = sse2_rShiftC_16u; + } +#endif +} + diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c index a3b11ee14..d7d2eb018 100644 --- a/libfreerdp/primitives/prim_sign.c +++ b/libfreerdp/primitives/prim_sign.c @@ -17,22 +17,16 @@ #include "config.h" #endif -#include - #include #include -#ifdef WITH_SSE2 -#include -#include -#endif /* WITH_SSE2 */ - #include "prim_internal.h" +#include "prim_sign.h" /* ---------------------------------------------------------------------------- * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1). */ -PRIM_STATIC pstatus_t general_sign_16s( +pstatus_t general_sign_16s( const INT16 *pSrc, INT16 *pDst, INT32 len) @@ -46,110 +40,6 @@ PRIM_STATIC pstatus_t general_sign_16s( return PRIMITIVES_SUCCESS; } -#ifdef WITH_SSE2 -/* ------------------------------------------------------------------------- */ -PRIM_STATIC pstatus_t ssse3_sign_16s( - const INT16 *pSrc, - INT16 *pDst, - INT32 len) -{ - const INT16 *sptr = (const INT16 *) pSrc; - INT16 *dptr = (INT16 *) pDst; - size_t count; - - if (len < 16) - { - return general_sign_16s(pSrc, pDst, len); - } - - /* Check for 16-byte alignment (eventually). */ - if ((ULONG_PTR) pDst & 0x01) - { - return general_sign_16s(pSrc, pDst, len); - } - - /* Seek 16-byte alignment. */ - while ((ULONG_PTR) dptr & 0x0f) - { - INT16 src = *sptr++; - *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); - if (--len == 0) return PRIMITIVES_SUCCESS; - } - - /* Do 32-short chunks using 8 XMM registers. */ - count = len >> 5; /* / 32 */ - len -= count << 5; /* * 32 */ - if ((ULONG_PTR) sptr & 0x0f) - { - /* Unaligned */ - while (count--) - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - xmm0 = _mm_set1_epi16(0x0001U); - xmm1 = _mm_set1_epi16(0x0001U); - xmm2 = _mm_set1_epi16(0x0001U); - xmm3 = _mm_set1_epi16(0x0001U); - xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; - xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; - xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; - xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; - xmm0 = _mm_sign_epi16(xmm0, xmm4); - xmm1 = _mm_sign_epi16(xmm1, xmm5); - xmm2 = _mm_sign_epi16(xmm2, xmm6); - xmm3 = _mm_sign_epi16(xmm3, xmm7); - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; - _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; - _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; - _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; - } - } - else - { - /* Aligned */ - while (count--) - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - xmm0 = _mm_set1_epi16(0x0001U); - xmm1 = _mm_set1_epi16(0x0001U); - xmm2 = _mm_set1_epi16(0x0001U); - xmm3 = _mm_set1_epi16(0x0001U); - xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8; - xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8; - xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8; - xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8; - xmm0 = _mm_sign_epi16(xmm0, xmm4); - xmm1 = _mm_sign_epi16(xmm1, xmm5); - xmm2 = _mm_sign_epi16(xmm2, xmm6); - xmm3 = _mm_sign_epi16(xmm3, xmm7); - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; - _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; - _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; - _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; - } - } - - /* Do 8-short chunks using two XMM registers. */ - count = len >> 3; - len -= count << 3; - while (count--) - { - __m128i xmm0 = _mm_set1_epi16(0x0001U); - __m128i xmm1 = LOAD_SI128(sptr); sptr += 8; - xmm0 = _mm_sign_epi16(xmm0, xmm1); - _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; - } - - /* Do leftovers. */ - while (len--) - { - INT16 src = *sptr++; - *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0); - } - - return PRIMITIVES_SUCCESS; -} -#endif /* WITH_SSE2 */ - /* ------------------------------------------------------------------------- */ void primitives_init_sign( const primitives_hints_t *hints, @@ -157,15 +47,8 @@ void primitives_init_sign( { /* Start with the default. */ prims->sign_16s = general_sign_16s; - /* Pick tuned versions if possible. */ - /* I didn't spot an IPP version of this. */ -#if defined(WITH_SSE2) - if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE) - && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) - { - prims->sign_16s = ssse3_sign_16s; - } -#endif + + primitives_init_sign_opt(hints, prims); } /* ------------------------------------------------------------------------- */ @@ -174,3 +57,4 @@ void primitives_deinit_sign( { /* Nothing to do. */ } + diff --git a/libfreerdp/primitives/prim_sign.h b/libfreerdp/primitives/prim_sign.h new file mode 100644 index 000000000..3592990ec --- /dev/null +++ b/libfreerdp/primitives/prim_sign.h @@ -0,0 +1,30 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Sign operations. + * vi:ts=4 sw=4 + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. Algorithms used by + * this code may be covered by patents by HP, Microsoft, or other parties. + * + */ + +#ifdef __GNUC__ +# pragma once +#endif + +#ifndef __PRIM_SIGN_H_INCLUDED__ +#define __PRIM_SIGN_H_INCLUDED__ + +pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, INT32 len); + +void primitives_init_sign_opt(const primitives_hints_t *hints, primitives_t *prims); + +#endif /* !__PRIM_SIGN_H_INCLUDED__ */ + diff --git a/libfreerdp/primitives/prim_sign_opt.c b/libfreerdp/primitives/prim_sign_opt.c new file mode 100644 index 000000000..81842b9bd --- /dev/null +++ b/libfreerdp/primitives/prim_sign_opt.c @@ -0,0 +1,149 @@ +/* FreeRDP: A Remote Desktop Protocol Client + * Optimized sign operations. + * vi:ts=4 sw=4: + * + * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain + * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#ifdef WITH_SSE2 +#include +#include +#endif /* WITH_SSE2 */ + +#include "prim_internal.h" +#include "prim_sign.h" + + +#ifdef WITH_SSE2 +/* ------------------------------------------------------------------------- */ +pstatus_t ssse3_sign_16s( + const INT16 *pSrc, + INT16 *pDst, + INT32 len) +{ + const INT16 *sptr = (const INT16 *) pSrc; + INT16 *dptr = (INT16 *) pDst; + size_t count; + + if (len < 16) + { + return general_sign_16s(pSrc, pDst, len); + } + + /* Check for 16-byte alignment (eventually). */ + if ((ULONG_PTR) pDst & 0x01) + { + return general_sign_16s(pSrc, pDst, len); + } + + /* Seek 16-byte alignment. */ + while ((ULONG_PTR) dptr & 0x0f) + { + INT16 src = *sptr++; + *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); + if (--len == 0) return PRIMITIVES_SUCCESS; + } + + /* Do 32-short chunks using 8 XMM registers. */ + count = len >> 5; /* / 32 */ + len -= count << 5; /* * 32 */ + if ((ULONG_PTR) sptr & 0x0f) + { + /* Unaligned */ + while (count--) + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + xmm0 = _mm_set1_epi16(0x0001U); + xmm1 = _mm_set1_epi16(0x0001U); + xmm2 = _mm_set1_epi16(0x0001U); + xmm3 = _mm_set1_epi16(0x0001U); + xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm4); + xmm1 = _mm_sign_epi16(xmm1, xmm5); + xmm2 = _mm_sign_epi16(xmm2, xmm6); + xmm3 = _mm_sign_epi16(xmm3, xmm7); + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; + } + } + else + { + /* Aligned */ + while (count--) + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + xmm0 = _mm_set1_epi16(0x0001U); + xmm1 = _mm_set1_epi16(0x0001U); + xmm2 = _mm_set1_epi16(0x0001U); + xmm3 = _mm_set1_epi16(0x0001U); + xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm4); + xmm1 = _mm_sign_epi16(xmm1, xmm5); + xmm2 = _mm_sign_epi16(xmm2, xmm6); + xmm3 = _mm_sign_epi16(xmm3, xmm7); + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; + _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; + } + } + + /* Do 8-short chunks using two XMM registers. */ + count = len >> 3; + len -= count << 3; + while (count--) + { + __m128i xmm0 = _mm_set1_epi16(0x0001U); + __m128i xmm1 = LOAD_SI128(sptr); sptr += 8; + xmm0 = _mm_sign_epi16(xmm0, xmm1); + _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; + } + + /* Do leftovers. */ + while (len--) + { + INT16 src = *sptr++; + *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0); + } + + return PRIMITIVES_SUCCESS; +} +#endif /* WITH_SSE2 */ + +/* ------------------------------------------------------------------------- */ +void primitives_init_sign_opt(const primitives_hints_t *hints, primitives_t *prims) +{ + /* Pick tuned versions if possible. */ + /* I didn't spot an IPP version of this. */ +#if defined(WITH_SSE2) + if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE) + && (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) + { + prims->sign_16s = ssse3_sign_16s; + } +#endif +} + diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h index c0b6ac10d..b530637b7 100644 --- a/libfreerdp/primitives/prim_templates.h +++ b/libfreerdp/primitives/prim_templates.h @@ -44,7 +44,7 @@ * SCD = Source, Constant, Destination */ #define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ -PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \ +pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \ { \ int shifts; \ UINT32 offBeatMask; \ @@ -188,7 +188,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 * PRE = preload xmm0 with the constant. */ #define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ -PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \ +pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \ { \ int shifts; \ UINT32 offBeatMask; \ @@ -293,7 +293,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 * SSD = Source1, Source2, Destination */ #define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ -PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \ +pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \ { \ int shifts; \ UINT32 offBeatMask; \