[primitives] add lShiftC_16s_inplace

This commit is contained in:
akallabeth 2024-06-07 14:35:06 +02:00
parent dee9019e7c
commit a5bb0bf203
No known key found for this signature in database
GPG Key ID: A49454A3FC909FD5
5 changed files with 120 additions and 6 deletions

View File

@ -104,14 +104,14 @@ typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
UINT32 len); UINT32 len);
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1, typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1,
INT16* WINPR_RESTRICT pSrcDst2, UINT32 len); INT16* WINPR_RESTRICT pSrcDst2, UINT32 len);
/* In-place left shift by a constant: pSrcDst is both the source and the
 * destination buffer, val is the shift count in bits and len the number of
 * INT16 elements to process. The implementations added alongside this
 * typedef treat val == 0 as a successful no-op and reject val >= 16. */
typedef pstatus_t (*__lShiftC_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len);
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len); typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len); typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len); typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__rShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len); typedef pstatus_t (*__rShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__shiftC_16s_t)(const INT16* pSrc, INT32 val, INT16* pSrcDst, UINT32 len); typedef pstatus_t (*__shiftC_16s_t)(const INT16* pSrc, INT32 val, INT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__shiftC_16u_t)(const UINT16* pSrc, INT32 val, UINT16* pSrcDst, UINT32 len); typedef pstatus_t (*__shiftC_16u_t)(const UINT16* pSrc, INT32 val, UINT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__sign_16s_t)(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst, typedef pstatus_t (*__sign_16s_t)(const INT16* pSrc, INT16* pSrcDst, UINT32 len);
UINT32 len);
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(const INT16* const WINPR_RESTRICT pSrc[3], typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(const INT16* const WINPR_RESTRICT pSrc[3],
UINT32 srcStep, BYTE* WINPR_RESTRICT pDst, UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
UINT32 dstStep, UINT32 DstFormat, UINT32 dstStep, UINT32 DstFormat,
@ -221,6 +221,7 @@ typedef struct
* pSrcDst1 = pSrcDst2 = pSrcDst1 + pSrcDst2 * pSrcDst1 = pSrcDst2 = pSrcDst1 + pSrcDst2
*/ */
__add_16s_inplace_t add_16s_inplace; __add_16s_inplace_t add_16s_inplace;
__lShiftC_16s_inplace_t lShiftC_16s_inplace;
} primitives_t; } primitives_t;
typedef enum typedef enum

View File

@ -867,7 +867,7 @@ static INLINE void progressive_rfx_decode_block(const primitives_t* prims,
if (!shift) if (!shift)
return; return;
prims->lShiftC_16s(buffer, shift, buffer, length); prims->lShiftC_16s_inplace(buffer, shift, length);
} }
static INLINE int progressive_rfx_decode_component( static INLINE int progressive_rfx_decode_component(

View File

@ -48,7 +48,7 @@ static INLINE void rfx_quantization_decode_block(const primitives_t* WINPR_RESTR
if (factor == 0) if (factor == 0)
return; return;
prims->lShiftC_16s(buffer, factor, buffer, buffer_size); prims->lShiftC_16s_inplace(buffer, factor, buffer_size);
} }
void rfx_quantization_decode(INT16* WINPR_RESTRICT buffer, const UINT32* WINPR_RESTRICT quantVals) void rfx_quantization_decode(INT16* WINPR_RESTRICT buffer, const UINT32* WINPR_RESTRICT quantVals)

View File

@ -20,6 +20,25 @@
#include "prim_internal.h" #include "prim_internal.h"
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
/* Left-shift one signed 16-bit sample by sh bits and keep the low 16 bits.
 *
 * The shift is carried out on an unsigned 32-bit value: left-shifting a
 * negative signed integer is undefined behaviour in C, and samples may be
 * negative. The callers visible in this file only pass sh < 16.
 *
 * @param val  sample to shift
 * @param sh   shift count in bits
 * @return the low 16 bits of (val << sh), reinterpreted as INT16
 */
static INLINE INT16 shift(INT16 val, UINT32 sh)
{
	const UINT32 bits = ((UINT32)val << sh) & 0xFFFFu;
	return (INT16)bits;
}
/* Portable in-place left shift: every one of the len INT16 values stored in
 * pSrcDst is replaced by itself shifted left by val bits.
 *
 * @return PRIMITIVES_SUCCESS on success (including the val == 0 no-op),
 *         -1 if val is 16 or larger.
 */
static INLINE pstatus_t general_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val,
                                                    UINT32 len)
{
	if (val >= 16)
		return -1;

	if (val != 0)
	{
		INT16* cur = pSrcDst;

		for (UINT32 remaining = len; remaining > 0; remaining--)
		{
			*cur = shift(*cur, val);
			cur++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len) static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len)
{ {
if (val == 0) if (val == 0)
@ -27,8 +46,8 @@ static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16
if (val >= 16) if (val >= 16)
return -1; return -1;
while (len--) for (UINT32 x = 0; x < len; x++)
*pDst++ = (INT16)((UINT16)*pSrc++ << val); pDst[x] = shift(pSrc[x], val);
return PRIMITIVES_SUCCESS; return PRIMITIVES_SUCCESS;
} }
@ -105,6 +124,7 @@ static INLINE pstatus_t general_shiftC_16u(const UINT16* pSrc, INT32 val, UINT16
void primitives_init_shift(primitives_t* prims) void primitives_init_shift(primitives_t* prims)
{ {
/* Start with the default. */ /* Start with the default. */
prims->lShiftC_16s_inplace = general_lShiftC_16s_inplace;
prims->lShiftC_16s = general_lShiftC_16s; prims->lShiftC_16s = general_lShiftC_16s;
prims->rShiftC_16s = general_rShiftC_16s; prims->rShiftC_16s = general_rShiftC_16s;
prims->lShiftC_16u = general_lShiftC_16u; prims->lShiftC_16u = general_lShiftC_16u;

View File

@ -42,6 +42,98 @@ SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
*dptr++ = *sptr++ >> val) *dptr++ = *sptr++ >> val)
/* SSE2 in-place left shift of len INT16 values in pSrcDst by val bits.
 *
 * Strategy:
 *  - val == 0 is a successful no-op, val >= 16 is rejected with -1.
 *  - Buffers shorter than 16 elements, or buffers whose address is odd (and
 *    can therefore never reach a 16-byte boundary by stepping whole INT16
 *    elements), are handed to the generic implementation.
 *  - Leading elements up to the next 16-byte boundary are processed by the
 *    generic implementation, then aligned blocks of 64 elements (8 registers)
 *    and 8 elements (1 register) are shifted with _mm_slli_epi16.
 *  - Whatever is left is processed by the generic implementation.
 *
 * @return PRIMITIVES_SUCCESS on success, -1 if val >= 16, or the status
 *         returned by the generic implementation for the delegated parts.
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
{
	const INT32 shifts = 2;
	int count;

	if (val == 0)
		return PRIMITIVES_SUCCESS;
	if (val >= 16)
		return -1;
	if (len < 16) /* pointless if too small */
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);

	/* With shifts == 2 this mask is 1, i.e. it tests for an odd address. */
	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst & offBeatMask)
	{
		/* Incrementing the pointer skips over 16-byte boundary. */
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);
	}

	/* Get to the 16-byte boundary now.
	 * rem is the number of INT16 elements past the previous boundary (0..7),
	 * so the distance to the next one is measured against the number of
	 * elements per 16-byte vector, not against the byte count. */
	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
	if (rem > 0)
	{
		const UINT32 perVector = (UINT32)(16 / sizeof(INT16));
		const UINT32 add = perVector - rem; /* 1..8, always < len (len >= 16) */
		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		pSrcDst += add;
		len -= add;
	}

	/* Use 8 128-bit SSE registers: 64 elements per iteration. */
	count = len >> (8 - shifts);
	len -= count << (8 - shifts);

	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;

		__m128i xmm0 = _mm_load_si128(src++);
		__m128i xmm1 = _mm_load_si128(src++);
		__m128i xmm2 = _mm_load_si128(src++);
		__m128i xmm3 = _mm_load_si128(src++);
		__m128i xmm4 = _mm_load_si128(src++);
		__m128i xmm5 = _mm_load_si128(src++);
		__m128i xmm6 = _mm_load_si128(src++);
		__m128i xmm7 = _mm_load_si128(src);

		xmm0 = _mm_slli_epi16(xmm0, val);
		xmm1 = _mm_slli_epi16(xmm1, val);
		xmm2 = _mm_slli_epi16(xmm2, val);
		xmm3 = _mm_slli_epi16(xmm3, val);
		xmm4 = _mm_slli_epi16(xmm4, val);
		xmm5 = _mm_slli_epi16(xmm5, val);
		xmm6 = _mm_slli_epi16(xmm6, val);
		xmm7 = _mm_slli_epi16(xmm7, val);

		__m128i* dst = (__m128i*)pSrcDst;

		_mm_store_si128(dst++, xmm0);
		_mm_store_si128(dst++, xmm1);
		_mm_store_si128(dst++, xmm2);
		_mm_store_si128(dst++, xmm3);
		_mm_store_si128(dst++, xmm4);
		_mm_store_si128(dst++, xmm5);
		_mm_store_si128(dst++, xmm6);
		_mm_store_si128(dst++, xmm7);

		pSrcDst = (INT16*)dst;
	}

	/* Use a single 128-bit SSE register: 8 elements per iteration. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);

	while (count--)
	{
		const __m128i* src = (const __m128i*)pSrcDst;

		__m128i xmm0 = LOAD_SI128(src);
		xmm0 = _mm_slli_epi16(xmm0, val);

		__m128i* dst = (__m128i*)pSrcDst;
		_mm_store_si128(dst++, xmm0);
		pSrcDst = (INT16*)dst;
	}

	/* Finish off the remainder. */
	if (len > 0)
		return generic->lShiftC_16s_inplace(pSrcDst, val, len);

	return PRIMITIVES_SUCCESS;
}
#endif #endif
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s /* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
@ -59,6 +151,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{ {
prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
prims->lShiftC_16s = sse2_lShiftC_16s; prims->lShiftC_16s = sse2_lShiftC_16s;
prims->rShiftC_16s = sse2_rShiftC_16s; prims->rShiftC_16s = sse2_rShiftC_16s;
prims->lShiftC_16u = sse2_lShiftC_16u; prims->lShiftC_16u = sse2_lShiftC_16u;