From f19098da836266f254fdac3c795af4a4d9ae0d3c Mon Sep 17 00:00:00 2001
From: akallabeth
Date: Fri, 7 Jun 2024 11:54:03 +0200
Subject: [PATCH] [primitives,prim_add] add new add_16s_inplace

---
 include/freerdp/primitives.h         |   5 +
 libfreerdp/primitives/prim_add.c     |  37 ++++++-
 libfreerdp/primitives/prim_add_opt.c | 138 +++++++++++++++++++++++++--
 3 files changed, 166 insertions(+), 14 deletions(-)

diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h
index 29f103aa6..b00011bc7 100644
--- a/include/freerdp/primitives.h
+++ b/include/freerdp/primitives.h
@@ -102,6 +102,10 @@ typedef pstatus_t (*__alphaComp_argb_t)(const BYTE* WINPR_RESTRICT pSrc1, UINT32
 typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
                                  const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
                                  UINT32 len);
+/* In-place saturating add: pSrcDst[i] = clamp16(pSrcDst[i] + pSrc[i]) for i < len.
+ * Both pointers are WINPR_RESTRICT-qualified; callers are expected to pass non-overlapping buffers. */
+typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst,
+                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len);
 typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
@@ -183,6 +187,7 @@ typedef struct
 	__zero_t zero; /* bzero or faster */
 	/* Arithmetic functions */
 	__add_16s_t add_16s;
+	__add_16s_inplace_t add_16s_inplace;
 	/* And/or */
 	__andC_32u_t andC_32u;
 	__orC_32u_t orC_32u;
diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c
index 674e04f60..b1d4602c4 100644
--- a/libfreerdp/primitives/prim_add.c
+++ b/libfreerdp/primitives/prim_add.c
@@ -16,6 +16,8 @@
 
 #include <freerdp/config.h>
 
+#include <stdint.h>
+
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
 
@@ -24,16 +26,18 @@
 /* ----------------------------------------------------------------------------
  * 16-bit signed add with saturation (under and over).
  */
-static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, UINT32 len)
+static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
+                                 const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
+                                 UINT32 len)
 {
 	while (len--)
 	{
 		INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++);
 
-		if (k > 32767)
-			*pDst++ = ((INT16)32767);
-		else if (k < -32768)
-			*pDst++ = ((INT16)-32768);
+		if (k > INT16_MAX)
+			*pDst++ = ((INT16)INT16_MAX);
+		else if (k < INT16_MIN)
+			*pDst++ = ((INT16)INT16_MIN);
 		else
 			*pDst++ = (INT16)k;
 	}
@@ -41,8 +45,31 @@ static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16*
 	return PRIMITIVES_SUCCESS;
 }
 
+/* ----------------------------------------------------------------------------
+ * 16-bit signed in-place add with saturation (under and over):
+ * pSrcDst[i] = clamp(pSrcDst[i] + pSrc[i]) for every i in [0, len).
+ */
+static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
+                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len)
+{
+	while (len--)
+	{
+		INT32 k = (INT32)(*pSrcDst) + (INT32)(*pSrc++);
+
+		if (k > INT16_MAX)
+			*pSrcDst++ = ((INT16)INT16_MAX);
+		else if (k < INT16_MIN)
+			*pSrcDst++ = ((INT16)INT16_MIN);
+		else
+			*pSrcDst++ = (INT16)k;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
 /* ------------------------------------------------------------------------- */
 void primitives_init_add(primitives_t* prims)
 {
 	prims->add_16s = general_add_16s;
+	prims->add_16s_inplace = general_add_16s_inplace;
 }
diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/prim_add_opt.c
index 88c8b6671..87a3ea620 100644
--- a/libfreerdp/primitives/prim_add_opt.c
+++ b/libfreerdp/primitives/prim_add_opt.c
@@ -25,21 +25,142 @@
 #include <pmmintrin.h>
 #endif /* WITH_SSE2 */
 
-#ifdef WITH_IPP
-#include <ipps.h>
-#endif /* WITH_IPP */
-
 #include "prim_internal.h"
 #include "prim_templates.h"
 
 static primitives_t* generic = NULL;
 
 #ifdef WITH_SSE2
-#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
 /* ------------------------------------------------------------------------- */
 SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                  generic->add_16s(sptr1++, sptr2++, dptr++, 1))
-#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+
+/* ----------------------------------------------------------------------------
+ * SSE variant of add_16s_inplace: pSrcDst[i] = saturate(pSrcDst[i] + pSrc[i]).
+ *
+ * Inputs shorter than 16 elements or with an odd pSrcDst address are passed to
+ * the generic implementation. Otherwise the generic routine first advances
+ * pSrcDst to a 16-byte boundary, SSE handles the bulk (32, then 8 elements per
+ * iteration) and the generic routine finishes the remaining elements.
+ */
+static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
+                                      const INT16* WINPR_RESTRICT pSrc, UINT32 len)
+{
+	const int shifts = 2;
+	UINT32 offBeatMask;
+	INT16* dptr = pSrcDst;
+	const INT16* sptr = pSrc;
+
+	size_t count;
+	if (len < 16) /* pointless if too small */
+		return generic->add_16s_inplace(pSrcDst, pSrc, len);
+
+	offBeatMask = (1 << (shifts - 1)) - 1;
+	if ((ULONG_PTR)pSrcDst & offBeatMask)
+	{
+		/* Incrementing the pointer skips over 16-byte boundary. */
+		return generic->add_16s_inplace(pSrcDst, pSrc, len);
+	}
+	/* Get to the 16-byte boundary now. */
+	const size_t rem = (ULONG_PTR)dptr & 0x0f;
+	if (rem != 0)
+	{
+		/* rem is a byte offset; convert the gap to the boundary into INT16 elements. */
+		const UINT32 head = (UINT32)((16 - rem) / sizeof(INT16));
+		pstatus_t status = generic->add_16s_inplace(dptr, sptr, head);
+		if (status != PRIMITIVES_SUCCESS)
+			return status;
+		dptr += head;
+		sptr += head;
+		len -= head; /* head <= 7 < 16 <= len, so this cannot wrap */
+	}
+	/* Use 4 128-bit SSE registers. */
+	count = len >> (7 - shifts);
+	len -= count << (7 - shifts);
+	if (((const ULONG_PTR)dptr & 0x0f) || ((const ULONG_PTR)sptr & 0x0f))
+	{
+		/* Unaligned loads; dptr was aligned above, so aligned stores remain valid */
+		while (count--)
+		{
+			const __m128i* sptr1 = (const __m128i*)dptr;
+			const __m128i* sptr2 = (const __m128i*)sptr;
+			__m128i* dptr1 = (__m128i*)dptr;
+			sptr += 4 * sizeof(__m128i) / sizeof(INT16); /* advance in elements, not bytes */
+			dptr += 4 * sizeof(__m128i) / sizeof(INT16);
+
+			__m128i xmm0 = _mm_lddqu_si128(sptr1++);
+			__m128i xmm1 = _mm_lddqu_si128(sptr1++);
+			__m128i xmm2 = _mm_lddqu_si128(sptr1++);
+			__m128i xmm3 = _mm_lddqu_si128(sptr1++);
+			__m128i xmm4 = _mm_lddqu_si128(sptr2++);
+			__m128i xmm5 = _mm_lddqu_si128(sptr2++);
+			__m128i xmm6 = _mm_lddqu_si128(sptr2++);
+			__m128i xmm7 = _mm_lddqu_si128(sptr2++);
+
+			xmm0 = _mm_adds_epi16(xmm0, xmm4);
+			xmm1 = _mm_adds_epi16(xmm1, xmm5);
+			xmm2 = _mm_adds_epi16(xmm2, xmm6);
+			xmm3 = _mm_adds_epi16(xmm3, xmm7);
+
+			_mm_store_si128(dptr1++, xmm0);
+			_mm_store_si128(dptr1++, xmm1);
+			_mm_store_si128(dptr1++, xmm2);
+			_mm_store_si128(dptr1++, xmm3);
+		}
+	}
+	else
+	{
+		/* Aligned loads */
+		while (count--)
+		{
+			const __m128i* sptr1 = (const __m128i*)dptr;
+			const __m128i* sptr2 = (const __m128i*)sptr;
+			__m128i* dptr1 = (__m128i*)dptr;
+			sptr += 4 * sizeof(__m128i) / sizeof(INT16); /* advance in elements, not bytes */
+			dptr += 4 * sizeof(__m128i) / sizeof(INT16);
+
+			__m128i xmm0 = _mm_load_si128(sptr1++);
+			__m128i xmm1 = _mm_load_si128(sptr1++);
+			__m128i xmm2 = _mm_load_si128(sptr1++);
+			__m128i xmm3 = _mm_load_si128(sptr1++);
+			__m128i xmm4 = _mm_load_si128(sptr2++);
+			__m128i xmm5 = _mm_load_si128(sptr2++);
+			__m128i xmm6 = _mm_load_si128(sptr2++);
+			__m128i xmm7 = _mm_load_si128(sptr2++);
+
+			xmm0 = _mm_adds_epi16(xmm0, xmm4);
+			xmm1 = _mm_adds_epi16(xmm1, xmm5);
+			xmm2 = _mm_adds_epi16(xmm2, xmm6);
+			xmm3 = _mm_adds_epi16(xmm3, xmm7);
+
+			_mm_store_si128(dptr1++, xmm0);
+			_mm_store_si128(dptr1++, xmm1);
+			_mm_store_si128(dptr1++, xmm2);
+			_mm_store_si128(dptr1++, xmm3);
+		}
+	}
+	/* Use a single 128-bit SSE register. */
+	count = len >> (5 - shifts);
+	len -= count << (5 - shifts);
+	while (count--)
+	{
+		const __m128i* sptr1 = (const __m128i*)sptr;
+		__m128i* dptr1 = (__m128i*)dptr;
+		sptr += sizeof(__m128i) / sizeof(INT16); /* 8 elements per register */
+		dptr += sizeof(__m128i) / sizeof(INT16);
+
+		__m128i xmm0 = LOAD_SI128(sptr1);
+		__m128i xmm1 = LOAD_SI128(dptr1);
+		xmm0 = _mm_adds_epi16(xmm0, xmm1);
+		_mm_store_si128(dptr1, xmm0); /* write back to the slot just read */
+	}
+	/* Finish off the remainder. */
+	if (len > 0)
+		return generic->add_16s_inplace(dptr, sptr, len);
+
+	return PRIMITIVES_SUCCESS;
+}
+
 #endif
 
 /* ------------------------------------------------------------------------- */
@@ -47,14 +168,13 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
 {
 	generic = primitives_get_generic();
 	primitives_init_add(prims);
-#ifdef WITH_IPP
-	prims->add_16s = (__add_16s_t)ippsAdd_16s;
-#elif defined(WITH_SSE2)
+#if defined(WITH_SSE2)
 
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
 	{
 		prims->add_16s = sse3_add_16s;
+		prims->add_16s_inplace = sse3_add_16s_inplace;
 	}
 
 #endif