[primitives,prim_add] add new add_16s_inplace

This commit is contained in:
akallabeth 2024-06-07 11:54:03 +02:00
parent c780210624
commit f19098da83
No known key found for this signature in database
GPG Key ID: A49454A3FC909FD5
3 changed files with 149 additions and 14 deletions

View File

@ -102,6 +102,8 @@ typedef pstatus_t (*__alphaComp_argb_t)(const BYTE* WINPR_RESTRICT pSrc1, UINT32
typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1, typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst, const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
UINT32 len); UINT32 len);
/* In-place 16-bit signed add with saturation: each pSrcDst[i] is replaced by
 * the clamped sum of pSrcDst[i] and pSrc[i], for len elements. */
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst,
                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len);
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len); typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len); typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len); typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
@ -183,6 +185,7 @@ typedef struct
__zero_t zero; /* bzero or faster */ __zero_t zero; /* bzero or faster */
/* Arithmetic functions */ /* Arithmetic functions */
__add_16s_t add_16s; __add_16s_t add_16s;
__add_16s_inplace_t add_16s_inplace;
/* And/or */ /* And/or */
__andC_32u_t andC_32u; __andC_32u_t andC_32u;
__orC_32u_t orC_32u; __orC_32u_t orC_32u;

View File

@ -16,6 +16,8 @@
#include <freerdp/config.h> #include <freerdp/config.h>
#include <stdint.h>
#include <freerdp/types.h> #include <freerdp/types.h>
#include <freerdp/primitives.h> #include <freerdp/primitives.h>
@ -24,16 +26,18 @@
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* 16-bit signed add with saturation (under and over). * 16-bit signed add with saturation (under and over).
*/ */
static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, UINT32 len) static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
UINT32 len)
{ {
while (len--) while (len--)
{ {
INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++); INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++);
if (k > 32767) if (k > INT16_MAX)
*pDst++ = ((INT16)32767); *pDst++ = ((INT16)INT16_MAX);
else if (k < -32768) else if (k < INT16_MIN)
*pDst++ = ((INT16)-32768); *pDst++ = ((INT16)INT16_MIN);
else else
*pDst++ = (INT16)k; *pDst++ = (INT16)k;
} }
@ -41,8 +45,27 @@ static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16*
return PRIMITIVES_SUCCESS; return PRIMITIVES_SUCCESS;
} }
/* ----------------------------------------------------------------------------
 * 16-bit signed in-place add with saturation (under and over).
 *
 * For every index i in [0, len): pSrcDst[i] = clamp(pSrcDst[i] + pSrc[i]),
 * where the clamp range is [INT16_MIN, INT16_MAX].
 * Always returns PRIMITIVES_SUCCESS.
 */
static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len)
{
	for (UINT32 i = 0; i < len; i++)
	{
		/* Widen to 32 bits so the sum cannot overflow before it is clamped. */
		const INT32 sum = (INT32)pSrcDst[i] + (INT32)pSrc[i];
		INT16 clamped;

		if (sum < INT16_MIN)
			clamped = ((INT16)INT16_MIN);
		else if (sum > INT16_MAX)
			clamped = ((INT16)INT16_MAX);
		else
			clamped = (INT16)sum;

		pSrcDst[i] = clamped;
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
void primitives_init_add(primitives_t* prims) void primitives_init_add(primitives_t* prims)
{ {
prims->add_16s = general_add_16s; prims->add_16s = general_add_16s;
prims->add_16s_inplace = general_add_16s_inplace;
} }

View File

@ -25,21 +25,131 @@
#include <pmmintrin.h> #include <pmmintrin.h>
#endif /* WITH_SSE2 */ #endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h" #include "prim_internal.h"
#include "prim_templates.h" #include "prim_templates.h"
static primitives_t* generic = NULL; static primitives_t* generic = NULL;
#ifdef WITH_SSE2 #ifdef WITH_SSE2
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16, SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
generic->add_16s(sptr1++, sptr2++, dptr++, 1)) generic->add_16s(sptr1++, sptr2++, dptr++, 1))
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
/* ----------------------------------------------------------------------------
 * SSE2/SSE3 accelerated 16-bit signed in-place add with saturation:
 * pSrcDst[i] = saturate(pSrcDst[i] + pSrc[i]) for i in [0, len).
 *
 * Strategy:
 *   1. Short inputs (< 16 elements) and destinations that are not even
 *      2-byte aligned are handed to the generic implementation.
 *   2. A scalar head brings the destination up to a 16-byte boundary.
 *   3. The bulk is processed 4 registers (32 elements) at a time, then
 *      1 register (8 elements) at a time.
 *   4. Whatever is left (< 8 elements) is finished by the generic routine.
 *
 * Returns PRIMITIVES_SUCCESS, or the status of the generic routine if it
 * reports a failure.
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
                                      const INT16* WINPR_RESTRICT pSrc, UINT32 len)
{
	const int shifts = 2;
	/* Number of INT16 elements held by one 128-bit register. */
	const size_t vecElems = sizeof(__m128i) / sizeof(INT16);
	INT16* dptr = pSrcDst;
	const INT16* sptr = pSrc;

	if (len < 16) /* pointless if too small */
		return generic->add_16s_inplace(pSrcDst, pSrc, len);

	const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* Incrementing the pointer skips over 16-byte boundary. */
		return generic->add_16s_inplace(pSrcDst, pSrc, len);
	}

	/* Get to the 16-byte boundary now.
	 * misalign is a BYTE offset; convert the distance to the next boundary
	 * into an element count (at most 7, so it is always smaller than len). */
	const size_t misalign = (ULONG_PTR)dptr & 0x0f;
	if (misalign != 0)
	{
		const UINT32 head = (UINT32)((sizeof(__m128i) - misalign) / sizeof(INT16));
		pstatus_t status = generic->add_16s_inplace(dptr, sptr, head);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		dptr += head;
		sptr += head;
		len -= head; /* the head elements are done, do not process them twice */
	}

	/* Use 4 128-bit SSE registers.
	 * dptr is 16-byte aligned from here on, so only the source alignment
	 * decides between the aligned and the unaligned load. */
	size_t count = len >> (7 - shifts);
	len -= (UINT32)(count << (7 - shifts));
	if ((ULONG_PTR)sptr & 0x0f)
	{
		/* Unaligned source loads */
		while (count--)
		{
			const __m128i* vsrc = (const __m128i*)sptr;
			__m128i* vdst = (__m128i*)dptr;

			__m128i xmm0 = _mm_load_si128(vdst + 0);
			__m128i xmm1 = _mm_load_si128(vdst + 1);
			__m128i xmm2 = _mm_load_si128(vdst + 2);
			__m128i xmm3 = _mm_load_si128(vdst + 3);
			__m128i xmm4 = _mm_lddqu_si128(vsrc + 0);
			__m128i xmm5 = _mm_lddqu_si128(vsrc + 1);
			__m128i xmm6 = _mm_lddqu_si128(vsrc + 2);
			__m128i xmm7 = _mm_lddqu_si128(vsrc + 3);

			xmm0 = _mm_adds_epi16(xmm0, xmm4);
			xmm1 = _mm_adds_epi16(xmm1, xmm5);
			xmm2 = _mm_adds_epi16(xmm2, xmm6);
			xmm3 = _mm_adds_epi16(xmm3, xmm7);

			_mm_store_si128(vdst + 0, xmm0);
			_mm_store_si128(vdst + 1, xmm1);
			_mm_store_si128(vdst + 2, xmm2);
			_mm_store_si128(vdst + 3, xmm3);

			/* Advance in ELEMENTS, not bytes. */
			sptr += 4 * vecElems;
			dptr += 4 * vecElems;
		}
	}
	else
	{
		/* Aligned source loads */
		while (count--)
		{
			const __m128i* vsrc = (const __m128i*)sptr;
			__m128i* vdst = (__m128i*)dptr;

			__m128i xmm0 = _mm_load_si128(vdst + 0);
			__m128i xmm1 = _mm_load_si128(vdst + 1);
			__m128i xmm2 = _mm_load_si128(vdst + 2);
			__m128i xmm3 = _mm_load_si128(vdst + 3);
			__m128i xmm4 = _mm_load_si128(vsrc + 0);
			__m128i xmm5 = _mm_load_si128(vsrc + 1);
			__m128i xmm6 = _mm_load_si128(vsrc + 2);
			__m128i xmm7 = _mm_load_si128(vsrc + 3);

			xmm0 = _mm_adds_epi16(xmm0, xmm4);
			xmm1 = _mm_adds_epi16(xmm1, xmm5);
			xmm2 = _mm_adds_epi16(xmm2, xmm6);
			xmm3 = _mm_adds_epi16(xmm3, xmm7);

			_mm_store_si128(vdst + 0, xmm0);
			_mm_store_si128(vdst + 1, xmm1);
			_mm_store_si128(vdst + 2, xmm2);
			_mm_store_si128(vdst + 3, xmm3);

			sptr += 4 * vecElems;
			dptr += 4 * vecElems;
		}
	}

	/* Use a single 128-bit SSE register. */
	count = len >> (5 - shifts);
	len -= (UINT32)(count << (5 - shifts));
	while (count--)
	{
		const __m128i* vsrc = (const __m128i*)sptr;
		__m128i* vdst = (__m128i*)dptr;

		__m128i xmm0 = LOAD_SI128(vsrc);
		__m128i xmm1 = _mm_load_si128(vdst);
		xmm0 = _mm_adds_epi16(xmm0, xmm1);
		/* Store back to the location that was just read, then advance. */
		_mm_store_si128(vdst, xmm0);

		sptr += vecElems;
		dptr += vecElems;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->add_16s_inplace(dptr, sptr, len);

	return PRIMITIVES_SUCCESS;
}
#endif #endif
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
@ -47,14 +157,13 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
{ {
generic = primitives_get_generic(); generic = primitives_get_generic();
primitives_init_add(prims); primitives_init_add(prims);
#ifdef WITH_IPP
prims->add_16s = (__add_16s_t)ippsAdd_16s;
#elif defined(WITH_SSE2)
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */ IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
{ {
prims->add_16s = sse3_add_16s; prims->add_16s = sse3_add_16s;
prims->add_16s_inplace = sse3_add_16s_inplace;
} }
#endif #endif