[primitives,prim_add] add new add_16s_inplace

This commit is contained in:
akallabeth 2024-06-07 11:54:03 +02:00
parent c780210624
commit f19098da83
No known key found for this signature in database
GPG Key ID: A49454A3FC909FD5
3 changed files with 149 additions and 14 deletions

View File

@ -102,6 +102,8 @@ typedef pstatus_t (*__alphaComp_argb_t)(const BYTE* WINPR_RESTRICT pSrc1, UINT32
/* Signature of the add_16s primitive: combines the 16-bit signed arrays pSrc1 and pSrc2
 * element by element into pDst, processing len elements, and returns a pstatus_t.
 * The three buffers are declared WINPR_RESTRICT, i.e. callers must not pass
 * overlapping memory. */
typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
UINT32 len);
/* Signature of the add_16s_inplace primitive: pSrcDst is both the first operand and
 * the destination, pSrc is the second operand, len is the number of INT16 elements.
 * Both buffers are declared WINPR_RESTRICT, i.e. callers must not pass overlapping
 * memory. */
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst,
                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len);
/* Signatures of the constant-shift primitives on 16-bit arrays. Judging by the names
 * these are left (lShiftC) and right (rShiftC) shifts of len elements by val bits,
 * read from pSrc and written to the second pointer - NOTE(review): semantics are
 * inferred from the identifiers only, confirm against the implementations. */
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
@ -183,6 +185,7 @@ typedef struct
__zero_t zero; /* bzero or faster */
/* Arithmetic functions */
__add_16s_t add_16s;
__add_16s_inplace_t add_16s_inplace;
/* And/or */
__andC_32u_t andC_32u;
__orC_32u_t orC_32u;

View File

@ -16,6 +16,8 @@
#include <freerdp/config.h>
#include <stdint.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
@ -24,16 +26,18 @@
/* ----------------------------------------------------------------------------
* 16-bit signed add with saturation (under and over).
*/
static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, UINT32 len)
static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
UINT32 len)
{
while (len--)
{
INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++);
if (k > 32767)
*pDst++ = ((INT16)32767);
else if (k < -32768)
*pDst++ = ((INT16)-32768);
if (k > INT16_MAX)
*pDst++ = ((INT16)INT16_MAX);
else if (k < INT16_MIN)
*pDst++ = ((INT16)INT16_MIN);
else
*pDst++ = (INT16)k;
}
@ -41,8 +45,27 @@ static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16*
return PRIMITIVES_SUCCESS;
}
/**
 * Portable in-place 16-bit signed add with saturation.
 *
 * Each pSrcDst[i] is replaced by pSrcDst[i] + pSrc[i], clamped to
 * [INT16_MIN, INT16_MAX], for the first len elements.
 *
 * @param pSrcDst first operand, overwritten with the result
 * @param pSrc    second operand
 * @param len     number of INT16 elements to process
 *
 * @return PRIMITIVES_SUCCESS
 */
static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len)
{
	for (UINT32 i = 0; i < len; i++)
	{
		/* 32-bit intermediate: the sum of two 16-bit values always fits. */
		const INT32 sum = (INT32)pSrcDst[i] + (INT32)pSrc[i];
		INT16 clamped;

		if (sum < INT16_MIN)
			clamped = (INT16)INT16_MIN;
		else if (sum > INT16_MAX)
			clamped = (INT16)INT16_MAX;
		else
			clamped = (INT16)sum;

		pSrcDst[i] = clamped;
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Install the portable (non-SIMD) add primitives into the given table. */
void primitives_init_add(primitives_t* prims)
{
	/* The two slots are independent of each other. */
	prims->add_16s_inplace = general_add_16s_inplace;
	prims->add_16s = general_add_16s;
}

View File

@ -25,21 +25,131 @@
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
static primitives_t* generic = NULL;
#ifdef WITH_SSE2
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
/* Generates sse3_add_16s (installed as prims->add_16s further down) from the
 * SSE3_SSD_ROUTINE template, using _mm_adds_epi16 as the vector operation and the
 * generic add_16s both as whole-call fallback and for single leftover elements.
 * NOTE(review): the template body is not part of this file (presumably
 * prim_templates.h) - confirm the exact fallback conditions there. */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
/**
 * SSE3 implementation of the in-place saturating 16-bit signed add:
 * pSrcDst[i] = saturate(pSrcDst[i] + pSrc[i]) for the first len elements.
 *
 * Inputs shorter than 16 elements and destinations at an odd address are handed to
 * the generic routine. Otherwise the work is split into:
 *  1. a scalar head (generic routine) that brings the destination to a 16-byte boundary,
 *  2. a main loop handling four 128-bit registers (32 elements) per iteration,
 *  3. a loop handling one 128-bit register (8 elements) per iteration,
 *  4. a scalar tail (generic routine) for what is left.
 *
 * @param pSrcDst first operand, overwritten with the result
 * @param pSrc    second operand
 * @param len     number of INT16 elements to process
 *
 * @return PRIMITIVES_SUCCESS, or the status of the generic routine if it fails
 */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
                                      const INT16* WINPR_RESTRICT pSrc, UINT32 len)
{
	/* Number of INT16 elements held by one 128-bit register. */
	const UINT32 lanes = (UINT32)(sizeof(__m128i) / sizeof(INT16));
	INT16* dptr = pSrcDst;
	const INT16* sptr = pSrc;
	UINT32 count;

	if (len < 16) /* pointless if too small */
		return generic->add_16s_inplace(pSrcDst, pSrc, len);

	if ((ULONG_PTR)pSrcDst & (sizeof(INT16) - 1))
	{
		/* Odd destination address: stepping in whole elements can never reach a
		 * 16-byte boundary, so the aligned stores below are not possible. */
		return generic->add_16s_inplace(pSrcDst, pSrc, len);
	}

	/* Get the destination to the 16-byte boundary now. */
	const UINT32 misalign = (UINT32)((ULONG_PTR)dptr & 0x0f);
	if (misalign != 0)
	{
		/* Convert the missing BYTES up to the boundary into an ELEMENT count. */
		const UINT32 head = (UINT32)((sizeof(__m128i) - misalign) / sizeof(INT16));
		const pstatus_t status = generic->add_16s_inplace(dptr, sptr, head);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		dptr += head;
		sptr += head;
		len -= head; /* head is at most 7 and len is at least 16 here */
	}

	/* Use 4 128-bit SSE registers per operand. The destination is 16-byte aligned
	 * from here on, only the source alignment still has to be checked. */
	count = len / (4 * lanes);
	len -= count * (4 * lanes);
	if ((ULONG_PTR)sptr & 0x0f)
	{
		/* Unaligned source loads */
		while (count--)
		{
			const __m128i* vsrc = (const __m128i*)sptr;
			__m128i* vdst = (__m128i*)dptr;

			__m128i xmm0 = _mm_load_si128(vdst + 0);
			__m128i xmm1 = _mm_load_si128(vdst + 1);
			__m128i xmm2 = _mm_load_si128(vdst + 2);
			__m128i xmm3 = _mm_load_si128(vdst + 3);
			const __m128i xmm4 = _mm_lddqu_si128(vsrc + 0);
			const __m128i xmm5 = _mm_lddqu_si128(vsrc + 1);
			const __m128i xmm6 = _mm_lddqu_si128(vsrc + 2);
			const __m128i xmm7 = _mm_lddqu_si128(vsrc + 3);

			xmm0 = _mm_adds_epi16(xmm0, xmm4);
			xmm1 = _mm_adds_epi16(xmm1, xmm5);
			xmm2 = _mm_adds_epi16(xmm2, xmm6);
			xmm3 = _mm_adds_epi16(xmm3, xmm7);

			_mm_store_si128(vdst + 0, xmm0);
			_mm_store_si128(vdst + 1, xmm1);
			_mm_store_si128(vdst + 2, xmm2);
			_mm_store_si128(vdst + 3, xmm3);

			/* Pointer arithmetic on INT16* counts elements, not bytes. */
			sptr += 4 * lanes;
			dptr += 4 * lanes;
		}
	}
	else
	{
		/* Aligned source loads */
		while (count--)
		{
			const __m128i* vsrc = (const __m128i*)sptr;
			__m128i* vdst = (__m128i*)dptr;

			__m128i xmm0 = _mm_load_si128(vdst + 0);
			__m128i xmm1 = _mm_load_si128(vdst + 1);
			__m128i xmm2 = _mm_load_si128(vdst + 2);
			__m128i xmm3 = _mm_load_si128(vdst + 3);
			const __m128i xmm4 = _mm_load_si128(vsrc + 0);
			const __m128i xmm5 = _mm_load_si128(vsrc + 1);
			const __m128i xmm6 = _mm_load_si128(vsrc + 2);
			const __m128i xmm7 = _mm_load_si128(vsrc + 3);

			xmm0 = _mm_adds_epi16(xmm0, xmm4);
			xmm1 = _mm_adds_epi16(xmm1, xmm5);
			xmm2 = _mm_adds_epi16(xmm2, xmm6);
			xmm3 = _mm_adds_epi16(xmm3, xmm7);

			/* Each result goes to its own 16-byte slot. */
			_mm_store_si128(vdst + 0, xmm0);
			_mm_store_si128(vdst + 1, xmm1);
			_mm_store_si128(vdst + 2, xmm2);
			_mm_store_si128(vdst + 3, xmm3);

			sptr += 4 * lanes;
			dptr += 4 * lanes;
		}
	}

	/* Use a single 128-bit SSE register per operand. */
	count = len / lanes;
	len -= count * lanes;
	while (count--)
	{
		/* lddqu is valid for aligned and unaligned source addresses alike. */
		const __m128i src = _mm_lddqu_si128((const __m128i*)sptr);
		const __m128i dst = _mm_load_si128((const __m128i*)dptr);

		/* Store to the slot that was just read, then advance. */
		_mm_store_si128((__m128i*)dptr, _mm_adds_epi16(dst, src));

		sptr += lanes;
		dptr += lanes;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->add_16s_inplace(dptr, sptr, len);

	return PRIMITIVES_SUCCESS;
}
#endif
/* ------------------------------------------------------------------------- */
@ -47,14 +157,13 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
{
generic = primitives_get_generic();
primitives_init_add(prims);
#ifdef WITH_IPP
prims->add_16s = (__add_16s_t)ippsAdd_16s;
#elif defined(WITH_SSE2)
#if defined(WITH_SSE2)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
{
prims->add_16s = sse3_add_16s;
prims->add_16s_inplace = sse3_add_16s_inplace;
}
#endif