[primitives,prim_add] add new add_16s_inplace
This commit is contained in:
parent
c780210624
commit
f19098da83
@ -102,6 +102,8 @@ typedef pstatus_t (*__alphaComp_argb_t)(const BYTE* WINPR_RESTRICT pSrc1, UINT32
|
||||
typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
|
||||
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
|
||||
UINT32 len);
|
||||
/* Function pointer type for the in-place 16-bit signed add primitive:
 * pSrcDst is the read/write buffer, pSrc the read-only buffer and len the
 * element count. Both pointers are restrict-qualified, so callers must not
 * pass overlapping buffers. */
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst,
                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len);
|
||||
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||
@ -183,6 +185,7 @@ typedef struct
|
||||
__zero_t zero; /* bzero or faster */
|
||||
/* Arithmetic functions */
|
||||
__add_16s_t add_16s;
|
||||
__add_16s_inplace_t add_16s_inplace;
|
||||
/* And/or */
|
||||
__andC_32u_t andC_32u;
|
||||
__orC_32u_t orC_32u;
|
||||
|
@ -16,6 +16,8 @@
|
||||
|
||||
#include <freerdp/config.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <freerdp/types.h>
|
||||
#include <freerdp/primitives.h>
|
||||
|
||||
@ -24,16 +26,18 @@
|
||||
/* ----------------------------------------------------------------------------
|
||||
* 16-bit signed add with saturation (under and over).
|
||||
*/
|
||||
static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, UINT32 len)
|
||||
static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
|
||||
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
|
||||
UINT32 len)
|
||||
{
|
||||
while (len--)
|
||||
{
|
||||
INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++);
|
||||
|
||||
if (k > 32767)
|
||||
*pDst++ = ((INT16)32767);
|
||||
else if (k < -32768)
|
||||
*pDst++ = ((INT16)-32768);
|
||||
if (k > INT16_MAX)
|
||||
*pDst++ = ((INT16)INT16_MAX);
|
||||
else if (k < INT16_MIN)
|
||||
*pDst++ = ((INT16)INT16_MIN);
|
||||
else
|
||||
*pDst++ = (INT16)k;
|
||||
}
|
||||
@ -41,8 +45,27 @@ static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16*
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
|
||||
const INT16* WINPR_RESTRICT pSrc, UINT32 len)
|
||||
{
|
||||
while (len--)
|
||||
{
|
||||
INT32 k = (INT32)(*pSrcDst) + (INT32)(*pSrc++);
|
||||
|
||||
if (k > INT16_MAX)
|
||||
*pSrcDst++ = ((INT16)INT16_MAX);
|
||||
else if (k < INT16_MIN)
|
||||
*pSrcDst++ = ((INT16)INT16_MIN);
|
||||
else
|
||||
*pSrcDst++ = (INT16)k;
|
||||
}
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Installs the portable C add routines into the given primitives table. */
void primitives_init_add(primitives_t* prims)
{
	prims->add_16s_inplace = general_add_16s_inplace;
	prims->add_16s = general_add_16s;
}
|
||||
|
@ -25,21 +25,131 @@
|
||||
#include <pmmintrin.h>
|
||||
#endif /* WITH_SSE2 */
|
||||
|
||||
#ifdef WITH_IPP
|
||||
#include <ipps.h>
|
||||
#endif /* WITH_IPP */
|
||||
|
||||
#include "prim_internal.h"
|
||||
#include "prim_templates.h"
|
||||
|
||||
static primitives_t* generic = NULL;
|
||||
|
||||
#ifdef WITH_SSE2
|
||||
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/* Defines sse3_add_16s by expanding the SSE3_SSD_ROUTINE template (its
 * definition is outside this view, presumably in prim_templates.h).
 * Arguments passed here: element type INT16, generic->add_16s, the
 * _mm_adds_epi16 saturating-add intrinsic, and a one-element
 * generic->add_16s call that post-increments sptr1, sptr2 and dptr.
 * NOTE(review): how the template uses each argument cannot be seen from
 * here - confirm against the macro definition. */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
|
||||
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
||||
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||
|
||||
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
|
||||
const INT16* WINPR_RESTRICT pSrc, UINT32 len)
|
||||
{
|
||||
const int shifts = 2;
|
||||
UINT32 offBeatMask;
|
||||
INT16* dptr = pSrcDst;
|
||||
const INT16* sptr = pSrc;
|
||||
|
||||
size_t count;
|
||||
if (len < 16) /* pointless if too small */
|
||||
return generic->add_16s_inplace(pSrcDst, pSrc, len);
|
||||
|
||||
offBeatMask = (1 << (shifts - 1)) - 1;
|
||||
if ((ULONG_PTR)pSrcDst & offBeatMask)
|
||||
{
|
||||
/* Incrementing the pointer skips over 16-byte boundary. */
|
||||
return generic->add_16s_inplace(pSrcDst, pSrc, len);
|
||||
}
|
||||
/* Get to the 16-byte boundary now. */
|
||||
const size_t rem = (ULONG_PTR)dptr & 0x0f;
|
||||
if (rem != 0)
|
||||
{
|
||||
pstatus_t status = generic->add_16s_inplace(dptr, sptr, rem);
|
||||
if (status != PRIMITIVES_SUCCESS)
|
||||
return status;
|
||||
dptr += rem;
|
||||
sptr += rem;
|
||||
}
|
||||
/* Use 4 128-bit SSE registers. */
|
||||
count = len >> (7 - shifts);
|
||||
len -= count << (7 - shifts);
|
||||
if (((const ULONG_PTR)dptr & 0x0f) || ((const ULONG_PTR)sptr & 0x0f))
|
||||
{
|
||||
/* Unaligned loads */
|
||||
while (count--)
|
||||
{
|
||||
const __m128i* sptr1 = dptr;
|
||||
const __m128i* sptr2 = sptr;
|
||||
__m128i* dptr1 = dptr;
|
||||
sptr += 4 * sizeof(__m128i);
|
||||
dptr += 4 * sizeof(__m128i);
|
||||
|
||||
__m128i xmm0 = _mm_lddqu_si128(sptr1++);
|
||||
__m128i xmm1 = _mm_lddqu_si128(sptr1++);
|
||||
__m128i xmm2 = _mm_lddqu_si128(sptr1++);
|
||||
__m128i xmm3 = _mm_lddqu_si128(sptr1++);
|
||||
__m128i xmm4 = _mm_lddqu_si128(sptr2++);
|
||||
__m128i xmm5 = _mm_lddqu_si128(sptr2++);
|
||||
__m128i xmm6 = _mm_lddqu_si128(sptr2++);
|
||||
__m128i xmm7 = _mm_lddqu_si128(sptr2++);
|
||||
|
||||
xmm0 = _mm_adds_epi16(xmm0, xmm4);
|
||||
xmm1 = _mm_adds_epi16(xmm1, xmm5);
|
||||
xmm2 = _mm_adds_epi16(xmm2, xmm6);
|
||||
xmm3 = _mm_adds_epi16(xmm3, xmm7);
|
||||
|
||||
_mm_store_si128(dptr1++, xmm0);
|
||||
_mm_store_si128(dptr1++, xmm1);
|
||||
_mm_store_si128(dptr1++, xmm2);
|
||||
_mm_store_si128(dptr1++, xmm3);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Aligned loads */
|
||||
while (count--)
|
||||
{
|
||||
const __m128i* sptr1 = dptr;
|
||||
const __m128i* sptr2 = sptr;
|
||||
__m128i* dptr1 = dptr;
|
||||
sptr += 4 * sizeof(__m128i);
|
||||
dptr += 4 * sizeof(__m128i);
|
||||
|
||||
__m128i xmm0 = _mm_load_si128(sptr1++);
|
||||
__m128i xmm1 = _mm_load_si128(sptr1++);
|
||||
__m128i xmm2 = _mm_load_si128(sptr1++);
|
||||
__m128i xmm3 = _mm_load_si128(sptr1++);
|
||||
__m128i xmm4 = _mm_load_si128(sptr2++);
|
||||
__m128i xmm5 = _mm_load_si128(sptr2++);
|
||||
__m128i xmm6 = _mm_load_si128(sptr2++);
|
||||
__m128i xmm7 = _mm_load_si128(sptr2++);
|
||||
|
||||
xmm0 = _mm_adds_epi16(xmm0, xmm4);
|
||||
xmm1 = _mm_adds_epi16(xmm1, xmm5);
|
||||
xmm2 = _mm_adds_epi16(xmm2, xmm6);
|
||||
xmm3 = _mm_adds_epi16(xmm3, xmm7);
|
||||
|
||||
_mm_store_si128(dptr1, xmm0);
|
||||
_mm_store_si128(dptr1, xmm1);
|
||||
_mm_store_si128(dptr1, xmm2);
|
||||
_mm_store_si128(dptr1, xmm3);
|
||||
}
|
||||
}
|
||||
/* Use a single 128-bit SSE register. */
|
||||
count = len >> (5 - shifts);
|
||||
len -= count << (5 - shifts);
|
||||
while (count--)
|
||||
{
|
||||
const __m128i* sptr1 = sptr;
|
||||
__m128i* dptr1 = dptr;
|
||||
sptr += sizeof(__m128i);
|
||||
dptr += sizeof(__m128i);
|
||||
|
||||
__m128i xmm0 = LOAD_SI128(sptr1);
|
||||
__m128i xmm1 = LOAD_SI128(dptr1);
|
||||
xmm0 = _mm_adds_epi16(xmm0, xmm1);
|
||||
_mm_store_si128(dptr, xmm0);
|
||||
}
|
||||
/* Finish off the remainder. */
|
||||
if (len > 0)
|
||||
return generic->add_16s_inplace(dptr, sptr, len);
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
@ -47,14 +157,13 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
{
|
||||
generic = primitives_get_generic();
|
||||
primitives_init_add(prims);
|
||||
#ifdef WITH_IPP
|
||||
prims->add_16s = (__add_16s_t)ippsAdd_16s;
|
||||
#elif defined(WITH_SSE2)
|
||||
|
||||
#if defined(WITH_SSE2)
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
|
||||
{
|
||||
prims->add_16s = sse3_add_16s;
|
||||
prims->add_16s_inplace = sse3_add_16s_inplace;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user