From a5bb0bf2030a041d4aacfc706bacba403eb94e24 Mon Sep 17 00:00:00 2001
From: akallabeth
Date: Fri, 7 Jun 2024 14:35:06 +0200
Subject: [PATCH] [primitives] add lShiftC_16s_inplace

---
 include/freerdp/primitives.h           |  5 +-
 libfreerdp/codec/progressive.c         |  2 +-
 libfreerdp/codec/rfx_quantization.c    |  2 +-
 libfreerdp/primitives/prim_shift.c     | 24 ++++++-
 libfreerdp/primitives/prim_shift_opt.c | 93 ++++++++++++++++++++++++++
 5 files changed, 120 insertions(+), 6 deletions(-)

diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h
index 20c92e59e..f3a6b3835 100644
--- a/include/freerdp/primitives.h
+++ b/include/freerdp/primitives.h
@@ -104,14 +104,14 @@ typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
                                  UINT32 len);
 typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1,
                                          INT16* WINPR_RESTRICT pSrcDst2, UINT32 len);
+typedef pstatus_t (*__lShiftC_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len);
 typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__rShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__rShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__shiftC_16s_t)(const INT16* pSrc, INT32 val, INT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__shiftC_16u_t)(const UINT16* pSrc, INT32 val, UINT16* pSrcDst, UINT32 len);
-typedef pstatus_t (*__sign_16s_t)(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
-                                  UINT32 len);
+typedef pstatus_t (*__sign_16s_t)(const INT16* pSrc, INT16* pSrcDst, UINT32 len);
 typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(const INT16* const WINPR_RESTRICT pSrc[3],
                                                  UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
                                                  UINT32 dstStep, UINT32 DstFormat,
@@ -221,6 +221,7 @@ typedef struct
 	 * pSrcDst1 = pSrcDst2 = pSrcDst1 + pSrcDst2
 	 */
 	__add_16s_inplace_t add_16s_inplace;
+	__lShiftC_16s_inplace_t lShiftC_16s_inplace;
 } primitives_t;
 
 typedef enum
diff --git a/libfreerdp/codec/progressive.c b/libfreerdp/codec/progressive.c
index 692731c6b..fff4b8998 100644
--- a/libfreerdp/codec/progressive.c
+++ b/libfreerdp/codec/progressive.c
@@ -867,7 +867,7 @@ static INLINE void progressive_rfx_decode_block(const primitives_t* prims,
 	if (!shift)
 		return;
 
-	prims->lShiftC_16s(buffer, shift, buffer, length);
+	prims->lShiftC_16s_inplace(buffer, shift, length);
 }
 
 static INLINE int progressive_rfx_decode_component(
diff --git a/libfreerdp/codec/rfx_quantization.c b/libfreerdp/codec/rfx_quantization.c
index 838f9bb7f..db9977547 100644
--- a/libfreerdp/codec/rfx_quantization.c
+++ b/libfreerdp/codec/rfx_quantization.c
@@ -48,7 +48,7 @@ static INLINE void rfx_quantization_decode_block(const primitives_t* WINPR_RESTR
 	if (factor == 0)
 		return;
 
-	prims->lShiftC_16s(buffer, factor, buffer, buffer_size);
+	prims->lShiftC_16s_inplace(buffer, factor, buffer_size);
 }
 
 void rfx_quantization_decode(INT16* WINPR_RESTRICT buffer, const UINT32* WINPR_RESTRICT quantVals)
diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c
index 3729266d3..3677fd113 100644
--- a/libfreerdp/primitives/prim_shift.c
+++ b/libfreerdp/primitives/prim_shift.c
@@ -20,6 +20,25 @@
 
 #include "prim_internal.h"
 /* ------------------------------------------------------------------------- */
+static INLINE INT16 shift(INT16 val, UINT32 sh)
+{
+	return (INT16)((UINT16)val << sh); /* shift as unsigned: '<<' on a negative value is UB */
+}
+
+static INLINE pstatus_t general_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val,
+                                                    UINT32 len)
+{
+	if (val == 0)
+		return PRIMITIVES_SUCCESS;
+	if (val >= 16)
+		return -1;
+
+	for (UINT32 x = 0; x < len; x++)
+		pSrcDst[x] = shift(pSrcDst[x], val);
+
+	return PRIMITIVES_SUCCESS;
+}
+
 static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16* pDst, UINT32 len)
 {
 	if (val == 0)
@@ -27,8 +46,8 @@ static INLINE pstatus_t general_lShiftC_16s(const INT16* pSrc, UINT32 val, INT16
 	if (val >= 16)
 		return -1;
 
-	while (len--)
-		*pDst++ = (INT16)((UINT16)*pSrc++ << val);
+	for (UINT32 x = 0; x < len; x++)
+		pDst[x] = shift(pSrc[x], val);
 
 	return PRIMITIVES_SUCCESS;
 }
@@ -105,6 +124,7 @@ static INLINE pstatus_t general_shiftC_16u(const UINT16* pSrc, INT32 val, UINT16
 void primitives_init_shift(primitives_t* prims)
 {
 	/* Start with the default. */
+	prims->lShiftC_16s_inplace = general_lShiftC_16s_inplace;
 	prims->lShiftC_16s = general_lShiftC_16s;
 	prims->rShiftC_16s = general_rShiftC_16s;
 	prims->lShiftC_16u = general_lShiftC_16u;
diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/prim_shift_opt.c
index 6a046b716..a4dd3c6f0 100644
--- a/libfreerdp/primitives/prim_shift_opt.c
+++ b/libfreerdp/primitives/prim_shift_opt.c
@@ -42,6 +42,98 @@ SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
 /* ------------------------------------------------------------------------- */
 SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
                  *dptr++ = *sptr++ >> val)
+
+static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
+{
+	const INT32 shifts = 2;
+	UINT32 count;
+	if (val == 0)
+		return PRIMITIVES_SUCCESS;
+	if (val >= 16)
+		return -1;
+	if (len < 16) /* pointless if too small */
+		return generic->lShiftC_16s_inplace(pSrcDst, val, len);
+
+	UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
+	if ((ULONG_PTR)pSrcDst & offBeatMask)
+	{
+		/* Incrementing the pointer skips over 16-byte boundary. */
+		return generic->lShiftC_16s_inplace(pSrcDst, val, len);
+	}
+	/* Get to the 16-byte boundary now. */
+	const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
+	if (rem > 0)
+	{
+		const UINT32 add = 16 - rem;
+		pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
+		if (status != PRIMITIVES_SUCCESS)
+			return status;
+		pSrcDst += add;
+		len -= add;
+	}
+
+	/* Use 8 128-bit SSE registers. */
+	count = len >> (8 - shifts);
+	len -= count << (8 - shifts);
+
+	while (count--)
+	{
+		const __m128i* src = (const __m128i*)pSrcDst;
+
+		__m128i xmm0 = _mm_load_si128(src++);
+		__m128i xmm1 = _mm_load_si128(src++);
+		__m128i xmm2 = _mm_load_si128(src++);
+		__m128i xmm3 = _mm_load_si128(src++);
+		__m128i xmm4 = _mm_load_si128(src++);
+		__m128i xmm5 = _mm_load_si128(src++);
+		__m128i xmm6 = _mm_load_si128(src++);
+		__m128i xmm7 = _mm_load_si128(src);
+
+		xmm0 = _mm_slli_epi16(xmm0, val);
+		xmm1 = _mm_slli_epi16(xmm1, val);
+		xmm2 = _mm_slli_epi16(xmm2, val);
+		xmm3 = _mm_slli_epi16(xmm3, val);
+		xmm4 = _mm_slli_epi16(xmm4, val);
+		xmm5 = _mm_slli_epi16(xmm5, val);
+		xmm6 = _mm_slli_epi16(xmm6, val);
+		xmm7 = _mm_slli_epi16(xmm7, val);
+
+		__m128i* dst = (__m128i*)pSrcDst;
+
+		_mm_store_si128(dst++, xmm0);
+		_mm_store_si128(dst++, xmm1);
+		_mm_store_si128(dst++, xmm2);
+		_mm_store_si128(dst++, xmm3);
+		_mm_store_si128(dst++, xmm4);
+		_mm_store_si128(dst++, xmm5);
+		_mm_store_si128(dst++, xmm6);
+		_mm_store_si128(dst++, xmm7);
+
+		pSrcDst = (INT16*)dst;
+	}
+
+	/* Use a single 128-bit SSE register. */
+	count = len >> (5 - shifts);
+	len -= count << (5 - shifts);
+	while (count--)
+	{
+		const __m128i* src = (const __m128i*)pSrcDst;
+		__m128i xmm0 = LOAD_SI128(src);
+
+		xmm0 = _mm_slli_epi16(xmm0, val);
+
+		__m128i* dst = (__m128i*)pSrcDst;
+		_mm_store_si128(dst++, xmm0);
+		pSrcDst = (INT16*)dst;
+	}
+
+	/* Finish off the remainder. */
+	if (len > 0)
+		return generic->lShiftC_16s_inplace(pSrcDst, val, len);
+
+	return PRIMITIVES_SUCCESS;
+}
+
 #endif
 
 /* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
@@ -59,6 +151,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
 	{
+		prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
 		prims->lShiftC_16s = sse2_lShiftC_16s;
 		prims->rShiftC_16s = sse2_rShiftC_16s;
 		prims->lShiftC_16u = sse2_lShiftC_16u;