[primitives] add lShiftC_16s_inplace
This commit is contained in:
parent
dee9019e7c
commit
a5bb0bf203
@ -104,14 +104,14 @@ typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
|
|||||||
UINT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1,
|
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1,
|
||||||
INT16* WINPR_RESTRICT pSrcDst2, UINT32 len);
|
INT16* WINPR_RESTRICT pSrcDst2, UINT32 len);
|
||||||
|
/* Signature of the in-place 16-bit signed left-shift primitive: pSrcDst is the
 * buffer that is both read and written, val is the shift count in bits and len
 * is the number of INT16 elements to process. */
typedef pstatus_t (*__lShiftC_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len);
|
||||||
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__rShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__rShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__shiftC_16s_t)(const INT16* pSrc, INT32 val, INT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__shiftC_16s_t)(const INT16* pSrc, INT32 val, INT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__shiftC_16u_t)(const UINT16* pSrc, INT32 val, UINT16* pSrcDst, UINT32 len);
|
typedef pstatus_t (*__shiftC_16u_t)(const UINT16* pSrc, INT32 val, UINT16* pSrcDst, UINT32 len);
|
||||||
typedef pstatus_t (*__sign_16s_t)(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
|
typedef pstatus_t (*__sign_16s_t)(const INT16* pSrc, INT16* pSrcDst, UINT32 len);
|
||||||
UINT32 len);
|
|
||||||
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(const INT16* const WINPR_RESTRICT pSrc[3],
|
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(const INT16* const WINPR_RESTRICT pSrc[3],
|
||||||
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
|
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
|
||||||
UINT32 dstStep, UINT32 DstFormat,
|
UINT32 dstStep, UINT32 DstFormat,
|
||||||
@ -221,6 +221,7 @@ typedef struct
|
|||||||
* pSrcDst1 = pSrcDst2 = pSrcDst1 + pSrcDst2
|
* pSrcDst1 = pSrcDst2 = pSrcDst1 + pSrcDst2
|
||||||
*/
|
*/
|
||||||
__add_16s_inplace_t add_16s_inplace;
|
__add_16s_inplace_t add_16s_inplace;
|
||||||
|
__lShiftC_16s_inplace_t lShiftC_16s_inplace;
|
||||||
} primitives_t;
|
} primitives_t;
|
||||||
|
|
||||||
typedef enum
|
typedef enum
|
||||||
|
@ -867,7 +867,7 @@ static INLINE void progressive_rfx_decode_block(const primitives_t* prims,
|
|||||||
if (!shift)
|
if (!shift)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
prims->lShiftC_16s(buffer, shift, buffer, length);
|
prims->lShiftC_16s_inplace(buffer, shift, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE int progressive_rfx_decode_component(
|
static INLINE int progressive_rfx_decode_component(
|
||||||
|
@ -48,7 +48,7 @@ static INLINE void rfx_quantization_decode_block(const primitives_t* WINPR_RESTR
|
|||||||
if (factor == 0)
|
if (factor == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
prims->lShiftC_16s(buffer, factor, buffer, buffer_size);
|
prims->lShiftC_16s_inplace(buffer, factor, buffer_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void rfx_quantization_decode(INT16* WINPR_RESTRICT buffer, const UINT32* WINPR_RESTRICT quantVals)
|
void rfx_quantization_decode(INT16* WINPR_RESTRICT buffer, const UINT32* WINPR_RESTRICT quantVals)
|
||||||
|
@ -20,6 +20,25 @@
|
|||||||
|
|
||||||
#include "prim_internal.h"
|
#include "prim_internal.h"
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
|
/* Shift one signed 16-bit sample left by sh bits.
 *
 * The shift is carried out on the unsigned bit pattern: left-shifting a
 * negative signed value is undefined behaviour in C, whereas an unsigned
 * shift is fully defined. Bits moved beyond bit 15 are discarded by the
 * narrowing to UINT16 before the result is reinterpreted as INT16.
 *
 * Callers must pass sh < 16 (both users in this file reject larger counts
 * before calling).
 */
static INLINE INT16 shift(INT16 val, UINT32 sh)
{
	const UINT16 bits = (UINT16)val;
	return (INT16)(UINT16)((UINT32)bits << sh);
}
|
||||||
|
|
||||||
|
/* Portable in-place left shift of len INT16 elements by val bits.
 *
 * Returns PRIMITIVES_SUCCESS when val is 0 (buffer left untouched) or after
 * all elements were shifted, and -1 when val is 16 or larger.
 */
static INLINE pstatus_t general_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val,
                                                    UINT32 len)
{
	/* A zero shift changes nothing, so skip the walk entirely. */
	if (val == 0)
		return PRIMITIVES_SUCCESS;

	/* Shift counts of 16 or more are rejected. */
	if (val >= 16)
		return -1;

	INT16* cur = pSrcDst;
	UINT32 remaining = len;
	while (remaining > 0)
	{
		*cur = shift(*cur, val);
		cur++;
		remaining--;
	}

	return PRIMITIVES_SUCCESS;
}
|
||||||
|
|
||||||
static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len)
|
static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len)
|
||||||
{
|
{
|
||||||
if (val == 0)
|
if (val == 0)
|
||||||
@ -27,8 +46,8 @@ static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16
|
|||||||
if (val >= 16)
|
if (val >= 16)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
while (len--)
|
for (UINT32 x = 0; x < len; x++)
|
||||||
*pDst++ = (INT16)((UINT16)*pSrc++ << val);
|
pDst[x] = shift(pSrc[x], val);
|
||||||
|
|
||||||
return PRIMITIVES_SUCCESS;
|
return PRIMITIVES_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -105,6 +124,7 @@ static INLINE pstatus_t general_shiftC_16u(const UINT16* pSrc, INT32 val, UINT16
|
|||||||
void primitives_init_shift(primitives_t* prims)
|
void primitives_init_shift(primitives_t* prims)
|
||||||
{
|
{
|
||||||
/* Start with the default. */
|
/* Start with the default. */
|
||||||
|
prims->lShiftC_16s_inplace = general_lShiftC_16s_inplace;
|
||||||
prims->lShiftC_16s = general_lShiftC_16s;
|
prims->lShiftC_16s = general_lShiftC_16s;
|
||||||
prims->rShiftC_16s = general_rShiftC_16s;
|
prims->rShiftC_16s = general_rShiftC_16s;
|
||||||
prims->lShiftC_16u = general_lShiftC_16u;
|
prims->lShiftC_16u = general_lShiftC_16u;
|
||||||
|
@ -42,6 +42,98 @@ SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
|
|||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
|
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
|
||||||
*dptr++ = *sptr++ >> val)
|
*dptr++ = *sptr++ >> val)
|
||||||
|
|
||||||
|
/* SSE2 in-place left shift of len INT16 elements by val bits.
 *
 * Returns PRIMITIVES_SUCCESS for val == 0 without touching the buffer and -1
 * for val >= 16. Buffers shorter than 16 elements, odd-addressed buffers, the
 * unaligned head and the trailing remainder are delegated to
 * generic->lShiftC_16s_inplace; the aligned middle is processed with
 * _mm_slli_epi16.
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
{
	const INT32 shifts = 2;

	if (val == 0)
		return PRIMITIVES_SUCCESS;
	if (val >= 16)
		return -1;

	/* Too short to be worth vectorising. */
	if (len < 16)
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);

	/* With shifts == 2 the mask is 1, i.e. this tests for an odd address.
	 * Stepping in whole elements from such an address never lands on a
	 * 16-byte boundary (assuming 2-byte INT16), so let the generic code
	 * handle the entire buffer. */
	const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if (((ULONG_PTR)pSrcDst & offBeatMask) != 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);

	/* rem is the element offset past the previous 16-byte boundary. Advancing
	 * by 16 - rem elements ends on an aligned address; len >= 16 guarantees
	 * that many elements are available. */
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		const UINT32 head = 16 - rem;
		const pstatus_t rc = generic->lShiftC_16s_inplace(pSrcDst, val, head);
		if (rc != PRIMITIVES_SUCCESS)
			return rc;

		pSrcDst += head;
		len -= head;
	}

	/* Main loop: eight 128-bit vectors (64 elements) per iteration. All
	 * loads are issued before the first store, as eight separate values. */
	int blocks = len >> (8 - shifts);
	len -= blocks << (8 - shifts);

	for (; blocks > 0; blocks--)
	{
		__m128i* vec = (__m128i*)pSrcDst;

		const __m128i r0 = _mm_slli_epi16(_mm_load_si128(&vec[0]), val);
		const __m128i r1 = _mm_slli_epi16(_mm_load_si128(&vec[1]), val);
		const __m128i r2 = _mm_slli_epi16(_mm_load_si128(&vec[2]), val);
		const __m128i r3 = _mm_slli_epi16(_mm_load_si128(&vec[3]), val);
		const __m128i r4 = _mm_slli_epi16(_mm_load_si128(&vec[4]), val);
		const __m128i r5 = _mm_slli_epi16(_mm_load_si128(&vec[5]), val);
		const __m128i r6 = _mm_slli_epi16(_mm_load_si128(&vec[6]), val);
		const __m128i r7 = _mm_slli_epi16(_mm_load_si128(&vec[7]), val);

		_mm_store_si128(&vec[0], r0);
		_mm_store_si128(&vec[1], r1);
		_mm_store_si128(&vec[2], r2);
		_mm_store_si128(&vec[3], r3);
		_mm_store_si128(&vec[4], r4);
		_mm_store_si128(&vec[5], r5);
		_mm_store_si128(&vec[6], r6);
		_mm_store_si128(&vec[7], r7);

		pSrcDst = (INT16*)(vec + 8);
	}

	/* Secondary loop: one 128-bit vector (8 elements) per iteration. */
	int singles = len >> (5 - shifts);
	len -= singles << (5 - shifts);

	for (; singles > 0; singles--)
	{
		const __m128i* in = (const __m128i*)pSrcDst;
		__m128i* out = (__m128i*)pSrcDst;

		_mm_store_si128(out, _mm_slli_epi16(LOAD_SI128(in), val));
		pSrcDst = (INT16*)(out + 1);
	}

	/* Fewer than 8 elements may be left; the generic routine finishes them. */
	if (len == 0)
		return PRIMITIVES_SUCCESS;

	return generic->lShiftC_16s_inplace(pSrcDst, val, len);
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
|
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
|
||||||
@ -59,6 +151,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
|
|||||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
|
||||||
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||||
{
|
{
|
||||||
|
prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
|
||||||
prims->lShiftC_16s = sse2_lShiftC_16s;
|
prims->lShiftC_16s = sse2_lShiftC_16s;
|
||||||
prims->rShiftC_16s = sse2_rShiftC_16s;
|
prims->rShiftC_16s = sse2_rShiftC_16s;
|
||||||
prims->lShiftC_16u = sse2_lShiftC_16u;
|
prims->lShiftC_16u = sse2_lShiftC_16u;
|
||||||
|
Loading…
Reference in New Issue
Block a user