[primitives,prim_add] add new add_16s_inplace
This commit is contained in:
parent
c780210624
commit
f19098da83
@ -102,6 +102,8 @@ typedef pstatus_t (*__alphaComp_argb_t)(const BYTE* WINPR_RESTRICT pSrc1, UINT32
|
|||||||
typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
|
typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
|
||||||
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
|
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
|
||||||
UINT32 len);
|
UINT32 len);
|
||||||
|
/* Signature of the in-place 16-bit signed add primitive: pSrcDst is read as the
 * first operand and overwritten with the result, pSrc is the second operand and
 * len is the number of INT16 elements in each buffer. */
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst,
                                         const INT16* WINPR_RESTRICT pSrc, UINT32 len);
|
||||||
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||||
@ -183,6 +185,7 @@ typedef struct
|
|||||||
__zero_t zero; /* bzero or faster */
|
__zero_t zero; /* bzero or faster */
|
||||||
/* Arithmetic functions */
|
/* Arithmetic functions */
|
||||||
__add_16s_t add_16s;
|
__add_16s_t add_16s;
|
||||||
|
__add_16s_inplace_t add_16s_inplace;
|
||||||
/* And/or */
|
/* And/or */
|
||||||
__andC_32u_t andC_32u;
|
__andC_32u_t andC_32u;
|
||||||
__orC_32u_t orC_32u;
|
__orC_32u_t orC_32u;
|
||||||
|
@ -16,6 +16,8 @@
|
|||||||
|
|
||||||
#include <freerdp/config.h>
|
#include <freerdp/config.h>
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
#include <freerdp/types.h>
|
#include <freerdp/types.h>
|
||||||
#include <freerdp/primitives.h>
|
#include <freerdp/primitives.h>
|
||||||
|
|
||||||
@ -24,16 +26,18 @@
|
|||||||
/* ----------------------------------------------------------------------------
|
/* ----------------------------------------------------------------------------
|
||||||
* 16-bit signed add with saturation (under and over).
|
* 16-bit signed add with saturation (under and over).
|
||||||
*/
|
*/
|
||||||
static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, UINT32 len)
|
static pstatus_t general_add_16s(const INT16* WINPR_RESTRICT pSrc1,
|
||||||
|
const INT16* WINPR_RESTRICT pSrc2, INT16* WINPR_RESTRICT pDst,
|
||||||
|
UINT32 len)
|
||||||
{
|
{
|
||||||
while (len--)
|
while (len--)
|
||||||
{
|
{
|
||||||
INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++);
|
INT32 k = (INT32)(*pSrc1++) + (INT32)(*pSrc2++);
|
||||||
|
|
||||||
if (k > 32767)
|
if (k > INT16_MAX)
|
||||||
*pDst++ = ((INT16)32767);
|
*pDst++ = ((INT16)INT16_MAX);
|
||||||
else if (k < -32768)
|
else if (k < INT16_MIN)
|
||||||
*pDst++ = ((INT16)-32768);
|
*pDst++ = ((INT16)INT16_MIN);
|
||||||
else
|
else
|
||||||
*pDst++ = (INT16)k;
|
*pDst++ = (INT16)k;
|
||||||
}
|
}
|
||||||
@ -41,8 +45,27 @@ static pstatus_t general_add_16s(const INT16* pSrc1, const INT16* pSrc2, INT16*
|
|||||||
return PRIMITIVES_SUCCESS;
|
return PRIMITIVES_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static pstatus_t general_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
|
||||||
|
const INT16* WINPR_RESTRICT pSrc, UINT32 len)
|
||||||
|
{
|
||||||
|
while (len--)
|
||||||
|
{
|
||||||
|
INT32 k = (INT32)(*pSrcDst) + (INT32)(*pSrc++);
|
||||||
|
|
||||||
|
if (k > INT16_MAX)
|
||||||
|
*pSrcDst++ = ((INT16)INT16_MAX);
|
||||||
|
else if (k < INT16_MIN)
|
||||||
|
*pSrcDst++ = ((INT16)INT16_MIN);
|
||||||
|
else
|
||||||
|
*pSrcDst++ = (INT16)k;
|
||||||
|
}
|
||||||
|
|
||||||
|
return PRIMITIVES_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
void primitives_init_add(primitives_t* prims)
|
void primitives_init_add(primitives_t* prims)
|
||||||
{
|
{
|
||||||
prims->add_16s = general_add_16s;
|
prims->add_16s = general_add_16s;
|
||||||
|
prims->add_16s_inplace = general_add_16s_inplace;
|
||||||
}
|
}
|
||||||
|
@ -25,21 +25,131 @@
|
|||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
#endif /* WITH_SSE2 */
|
#endif /* WITH_SSE2 */
|
||||||
|
|
||||||
#ifdef WITH_IPP
|
|
||||||
#include <ipps.h>
|
|
||||||
#endif /* WITH_IPP */
|
|
||||||
|
|
||||||
#include "prim_internal.h"
|
#include "prim_internal.h"
|
||||||
#include "prim_templates.h"
|
#include "prim_templates.h"
|
||||||
|
|
||||||
static primitives_t* generic = NULL;
|
static primitives_t* generic = NULL;
|
||||||
|
|
||||||
#ifdef WITH_SSE2
|
#ifdef WITH_SSE2
|
||||||
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
|
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
|
||||||
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
||||||
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
|
||||||
|
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst,
|
||||||
|
const INT16* WINPR_RESTRICT pSrc, UINT32 len)
|
||||||
|
{
|
||||||
|
const int shifts = 2;
|
||||||
|
UINT32 offBeatMask;
|
||||||
|
INT16* dptr = pSrcDst;
|
||||||
|
const INT16* sptr = pSrc;
|
||||||
|
|
||||||
|
size_t count;
|
||||||
|
if (len < 16) /* pointless if too small */
|
||||||
|
return generic->add_16s_inplace(pSrcDst, pSrc, len);
|
||||||
|
|
||||||
|
offBeatMask = (1 << (shifts - 1)) - 1;
|
||||||
|
if ((ULONG_PTR)pSrcDst & offBeatMask)
|
||||||
|
{
|
||||||
|
/* Incrementing the pointer skips over 16-byte boundary. */
|
||||||
|
return generic->add_16s_inplace(pSrcDst, pSrc, len);
|
||||||
|
}
|
||||||
|
/* Get to the 16-byte boundary now. */
|
||||||
|
const size_t rem = (ULONG_PTR)dptr & 0x0f;
|
||||||
|
if (rem != 0)
|
||||||
|
{
|
||||||
|
pstatus_t status = generic->add_16s_inplace(dptr, sptr, rem);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return status;
|
||||||
|
dptr += rem;
|
||||||
|
sptr += rem;
|
||||||
|
}
|
||||||
|
/* Use 4 128-bit SSE registers. */
|
||||||
|
count = len >> (7 - shifts);
|
||||||
|
len -= count << (7 - shifts);
|
||||||
|
if (((const ULONG_PTR)dptr & 0x0f) || ((const ULONG_PTR)sptr & 0x0f))
|
||||||
|
{
|
||||||
|
/* Unaligned loads */
|
||||||
|
while (count--)
|
||||||
|
{
|
||||||
|
const __m128i* sptr1 = dptr;
|
||||||
|
const __m128i* sptr2 = sptr;
|
||||||
|
__m128i* dptr1 = dptr;
|
||||||
|
sptr += 4 * sizeof(__m128i);
|
||||||
|
dptr += 4 * sizeof(__m128i);
|
||||||
|
|
||||||
|
__m128i xmm0 = _mm_lddqu_si128(sptr1++);
|
||||||
|
__m128i xmm1 = _mm_lddqu_si128(sptr1++);
|
||||||
|
__m128i xmm2 = _mm_lddqu_si128(sptr1++);
|
||||||
|
__m128i xmm3 = _mm_lddqu_si128(sptr1++);
|
||||||
|
__m128i xmm4 = _mm_lddqu_si128(sptr2++);
|
||||||
|
__m128i xmm5 = _mm_lddqu_si128(sptr2++);
|
||||||
|
__m128i xmm6 = _mm_lddqu_si128(sptr2++);
|
||||||
|
__m128i xmm7 = _mm_lddqu_si128(sptr2++);
|
||||||
|
|
||||||
|
xmm0 = _mm_adds_epi16(xmm0, xmm4);
|
||||||
|
xmm1 = _mm_adds_epi16(xmm1, xmm5);
|
||||||
|
xmm2 = _mm_adds_epi16(xmm2, xmm6);
|
||||||
|
xmm3 = _mm_adds_epi16(xmm3, xmm7);
|
||||||
|
|
||||||
|
_mm_store_si128(dptr1++, xmm0);
|
||||||
|
_mm_store_si128(dptr1++, xmm1);
|
||||||
|
_mm_store_si128(dptr1++, xmm2);
|
||||||
|
_mm_store_si128(dptr1++, xmm3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Aligned loads */
|
||||||
|
while (count--)
|
||||||
|
{
|
||||||
|
const __m128i* sptr1 = dptr;
|
||||||
|
const __m128i* sptr2 = sptr;
|
||||||
|
__m128i* dptr1 = dptr;
|
||||||
|
sptr += 4 * sizeof(__m128i);
|
||||||
|
dptr += 4 * sizeof(__m128i);
|
||||||
|
|
||||||
|
__m128i xmm0 = _mm_load_si128(sptr1++);
|
||||||
|
__m128i xmm1 = _mm_load_si128(sptr1++);
|
||||||
|
__m128i xmm2 = _mm_load_si128(sptr1++);
|
||||||
|
__m128i xmm3 = _mm_load_si128(sptr1++);
|
||||||
|
__m128i xmm4 = _mm_load_si128(sptr2++);
|
||||||
|
__m128i xmm5 = _mm_load_si128(sptr2++);
|
||||||
|
__m128i xmm6 = _mm_load_si128(sptr2++);
|
||||||
|
__m128i xmm7 = _mm_load_si128(sptr2++);
|
||||||
|
|
||||||
|
xmm0 = _mm_adds_epi16(xmm0, xmm4);
|
||||||
|
xmm1 = _mm_adds_epi16(xmm1, xmm5);
|
||||||
|
xmm2 = _mm_adds_epi16(xmm2, xmm6);
|
||||||
|
xmm3 = _mm_adds_epi16(xmm3, xmm7);
|
||||||
|
|
||||||
|
_mm_store_si128(dptr1, xmm0);
|
||||||
|
_mm_store_si128(dptr1, xmm1);
|
||||||
|
_mm_store_si128(dptr1, xmm2);
|
||||||
|
_mm_store_si128(dptr1, xmm3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Use a single 128-bit SSE register. */
|
||||||
|
count = len >> (5 - shifts);
|
||||||
|
len -= count << (5 - shifts);
|
||||||
|
while (count--)
|
||||||
|
{
|
||||||
|
const __m128i* sptr1 = sptr;
|
||||||
|
__m128i* dptr1 = dptr;
|
||||||
|
sptr += sizeof(__m128i);
|
||||||
|
dptr += sizeof(__m128i);
|
||||||
|
|
||||||
|
__m128i xmm0 = LOAD_SI128(sptr1);
|
||||||
|
__m128i xmm1 = LOAD_SI128(dptr1);
|
||||||
|
xmm0 = _mm_adds_epi16(xmm0, xmm1);
|
||||||
|
_mm_store_si128(dptr, xmm0);
|
||||||
|
}
|
||||||
|
/* Finish off the remainder. */
|
||||||
|
if (len > 0)
|
||||||
|
return generic->add_16s_inplace(dptr, sptr, len);
|
||||||
|
|
||||||
|
return PRIMITIVES_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
@ -47,14 +157,13 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
|
|||||||
{
|
{
|
||||||
generic = primitives_get_generic();
|
generic = primitives_get_generic();
|
||||||
primitives_init_add(prims);
|
primitives_init_add(prims);
|
||||||
#ifdef WITH_IPP
|
|
||||||
prims->add_16s = (__add_16s_t)ippsAdd_16s;
|
|
||||||
#elif defined(WITH_SSE2)
|
|
||||||
|
|
||||||
|
#if defined(WITH_SSE2)
|
||||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
|
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
|
||||||
{
|
{
|
||||||
prims->add_16s = sse3_add_16s;
|
prims->add_16s = sse3_add_16s;
|
||||||
|
prims->add_16s_inplace = sse3_add_16s_inplace;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user