[primitives] add lShiftC_16s_inplace
This commit is contained in:
parent
dee9019e7c
commit
a5bb0bf203
@ -104,14 +104,14 @@ typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
|
||||
UINT32 len);
|
||||
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1,
|
||||
INT16* WINPR_RESTRICT pSrcDst2, UINT32 len);
|
||||
typedef pstatus_t (*__lShiftC_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len);
|
||||
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__rShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__shiftC_16s_t)(const INT16* pSrc, INT32 val, INT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__shiftC_16u_t)(const UINT16* pSrc, INT32 val, UINT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__sign_16s_t)(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
|
||||
UINT32 len);
|
||||
typedef pstatus_t (*__sign_16s_t)(const INT16* pSrc, INT16* pSrcDst, UINT32 len);
|
||||
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(const INT16* const WINPR_RESTRICT pSrc[3],
|
||||
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
|
||||
UINT32 dstStep, UINT32 DstFormat,
|
||||
@ -221,6 +221,7 @@ typedef struct
|
||||
* pSrcDst1 = pSrcDst2 = pSrcDst1 + pSrcDst2
|
||||
*/
|
||||
__add_16s_inplace_t add_16s_inplace;
|
||||
__lShiftC_16s_inplace_t lShiftC_16s_inplace;
|
||||
} primitives_t;
|
||||
|
||||
typedef enum
|
||||
|
@ -867,7 +867,7 @@ static INLINE void progressive_rfx_decode_block(const primitives_t* prims,
|
||||
if (!shift)
|
||||
return;
|
||||
|
||||
prims->lShiftC_16s(buffer, shift, buffer, length);
|
||||
prims->lShiftC_16s_inplace(buffer, shift, length);
|
||||
}
|
||||
|
||||
static INLINE int progressive_rfx_decode_component(
|
||||
|
@ -48,7 +48,7 @@ static INLINE void rfx_quantization_decode_block(const primitives_t* WINPR_RESTR
|
||||
if (factor == 0)
|
||||
return;
|
||||
|
||||
prims->lShiftC_16s(buffer, factor, buffer, buffer_size);
|
||||
prims->lShiftC_16s_inplace(buffer, factor, buffer_size);
|
||||
}
|
||||
|
||||
void rfx_quantization_decode(INT16* WINPR_RESTRICT buffer, const UINT32* WINPR_RESTRICT quantVals)
|
||||
|
@ -20,6 +20,25 @@
|
||||
|
||||
#include "prim_internal.h"
|
||||
/* ------------------------------------------------------------------------- */
|
||||
/**
 * Left-shift one signed 16-bit sample.
 *
 * The shift is performed on the unsigned bit pattern of @p val: left-shifting
 * a negative signed value is undefined behaviour in C, and samples handled
 * here may be negative. Only the low 16 bits of the result are kept.
 *
 * @param val sample to shift
 * @param sh  number of bit positions to shift by; expected to be < 16
 *            (the callers visible in this file reject larger counts)
 *
 * @return the low 16 bits of (val << sh), reinterpreted as INT16
 */
static INLINE INT16 shift(INT16 val, UINT32 sh)
{
	const UINT32 bits = (UINT32)(UINT16)val << sh;
	return (INT16)(UINT16)(bits & 0xFFFFu);
}
|
||||
|
||||
/**
 * Portable in-place left shift of a buffer of signed 16-bit samples.
 *
 * @param pSrcDst buffer whose elements are replaced by their shifted value
 * @param val     shift count; 0 leaves the buffer untouched, values >= 16
 *                are rejected
 * @param len     number of INT16 elements in @p pSrcDst
 *
 * @return PRIMITIVES_SUCCESS on success, -1 if @p val is >= 16
 */
static INLINE pstatus_t general_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val,
                                                    UINT32 len)
{
	if (val >= 16)
		return -1;

	/* A zero shift is a no-op, so the buffer is only walked for 1..15. */
	if (val != 0)
	{
		while (len-- > 0)
		{
			*pSrcDst = shift(*pSrcDst, val);
			pSrcDst++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len)
|
||||
{
|
||||
if (val == 0)
|
||||
@ -27,8 +46,8 @@ static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16
|
||||
if (val >= 16)
|
||||
return -1;
|
||||
|
||||
while (len--)
|
||||
*pDst++ = (INT16)((UINT16)*pSrc++ << val);
|
||||
for (UINT32 x = 0; x < len; x++)
|
||||
pDst[x] = shift(pSrc[x], val);
|
||||
|
||||
return PRIMITIVES_SUCCESS;
|
||||
}
|
||||
@ -105,6 +124,7 @@ static INLINE pstatus_t general_shiftC_16u(const UINT16* pSrc, INT32 val, UINT16
|
||||
void primitives_init_shift(primitives_t* prims)
|
||||
{
|
||||
/* Start with the default. */
|
||||
prims->lShiftC_16s_inplace = general_lShiftC_16s_inplace;
|
||||
prims->lShiftC_16s = general_lShiftC_16s;
|
||||
prims->rShiftC_16s = general_rShiftC_16s;
|
||||
prims->lShiftC_16u = general_lShiftC_16u;
|
||||
|
@ -42,6 +42,98 @@ SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
|
||||
/* ------------------------------------------------------------------------- */
|
||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
|
||||
*dptr++ = *sptr++ >> val)
|
||||
|
||||
/**
 * SSE2 in-place left shift of a buffer of signed 16-bit samples.
 *
 * Buffers that are short, or whose address is not a multiple of the element
 * size, are handed to the generic implementation. Otherwise a scalar lead-in
 * brings the pointer to a 16-byte boundary, the bulk is processed 64 elements
 * (eight 128-bit registers) and then 8 elements (one register) at a time, and
 * any leftover elements are finished by the generic implementation.
 *
 * @param pSrcDst buffer whose elements are replaced by their shifted value
 * @param val     shift count; 0 leaves the buffer untouched, values >= 16
 *                are rejected
 * @param len     number of INT16 elements in @p pSrcDst
 *
 * @return PRIMITIVES_SUCCESS on success, -1 if @p val is >= 16, otherwise the
 *         status returned by the generic implementation
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
{
	/* Basis for the alignment mask and for the per-pass element counts below. */
	const INT32 shifts = 2;

	if (val == 0)
		return PRIMITIVES_SUCCESS;
	if (val >= 16)
		return -1;
	if (len < 16) /* pointless if too small */
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);

	const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* The address is odd: advancing by whole INT16 elements can never
		 * reach a 16-byte boundary, so the aligned path is unusable. */
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);
	}

	/* Get to the 16-byte boundary now.
	 * rem is the number of elements past the previous boundary (0..7). */
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		/* One 16-byte line holds 16 / sizeof(INT16) elements; only the
		 * elements up to the next boundary need the scalar path.
		 * len >= 16 here, so len cannot underflow. */
		const UINT32 add = (UINT32)(16 / sizeof(INT16)) - rem;
		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		pSrcDst += add;
		len -= add;
	}

	/* Use 8 128-bit SSE registers.
	 * The counter is unsigned so that count << n cannot overflow. */
	UINT32 count = len >> (8 - shifts);
	len -= count << (8 - shifts);

	for (; count > 0; count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;

		__m128i xmm0 = _mm_load_si128(src++);
		__m128i xmm1 = _mm_load_si128(src++);
		__m128i xmm2 = _mm_load_si128(src++);
		__m128i xmm3 = _mm_load_si128(src++);
		__m128i xmm4 = _mm_load_si128(src++);
		__m128i xmm5 = _mm_load_si128(src++);
		__m128i xmm6 = _mm_load_si128(src++);
		__m128i xmm7 = _mm_load_si128(src);

		xmm0 = _mm_slli_epi16(xmm0, (int)val);
		xmm1 = _mm_slli_epi16(xmm1, (int)val);
		xmm2 = _mm_slli_epi16(xmm2, (int)val);
		xmm3 = _mm_slli_epi16(xmm3, (int)val);
		xmm4 = _mm_slli_epi16(xmm4, (int)val);
		xmm5 = _mm_slli_epi16(xmm5, (int)val);
		xmm6 = _mm_slli_epi16(xmm6, (int)val);
		xmm7 = _mm_slli_epi16(xmm7, (int)val);

		__m128i* dst = (__m128i*)pSrcDst;

		_mm_store_si128(dst++, xmm0);
		_mm_store_si128(dst++, xmm1);
		_mm_store_si128(dst++, xmm2);
		_mm_store_si128(dst++, xmm3);
		_mm_store_si128(dst++, xmm4);
		_mm_store_si128(dst++, xmm5);
		_mm_store_si128(dst++, xmm6);
		_mm_store_si128(dst++, xmm7);

		pSrcDst = (INT16*)dst;
	}

	/* Use a single 128-bit SSE register. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);

	for (; count > 0; count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;
		__m128i xmm0 = LOAD_SI128(src);

		xmm0 = _mm_slli_epi16(xmm0, (int)val);

		__m128i* dst = (__m128i*)pSrcDst;
		_mm_store_si128(dst++, xmm0);
		pSrcDst = (INT16*)dst;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);

	return PRIMITIVES_SUCCESS;
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
|
||||
@ -59,6 +151,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
|
||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||
{
|
||||
prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
|
||||
prims->lShiftC_16s = sse2_lShiftC_16s;
|
||||
prims->rShiftC_16s = sse2_rShiftC_16s;
|
||||
prims->lShiftC_16u = sse2_lShiftC_16u;
|
||||
|
Loading…
Reference in New Issue
Block a user