FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
akallabeth 896ea3c445
[primitives,codec] guard SSE code with platform
SSE optimized code might be used in multiarch/universal builds.
So not only guard with WITH_SSE2 but also with architecture defines from
winpr/platform.h
2024-06-25 09:56:52 +02:00

166 lines
5.1 KiB
C

/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <freerdp/config.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_shift.h"
#include "prim_internal.h"
#include "prim_templates.h"
#if defined(SSE2_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static primitives_t* generic = NULL;
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16,
*dptr++ = (INT16)((UINT16)*sptr++ << val))
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16,
*dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
*dptr++ = (INT16)((UINT16)*sptr++ << val))
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
*dptr++ = *sptr++ >> val)
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
{
const INT32 shifts = 2;
int count;
if (val == 0)
return PRIMITIVES_SUCCESS;
if (val >= 16)
return -1;
if (len < 16) /* pointless if too small */
return generic->lShiftC_16s_inplace(pSrcDst, val, len);
UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
if ((ULONG_PTR)pSrcDst & offBeatMask)
{
/* Incrementing the pointer skips over 16-byte boundary. */
return generic->lShiftC_16s_inplace(pSrcDst, val, len);
}
/* Get to the 16-byte boundary now. */
const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
if (rem > 0)
{
const UINT32 add = 16 - rem;
pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
if (status != PRIMITIVES_SUCCESS)
return status;
pSrcDst += add;
len -= add;
}
/* Use 8 128-bit SSE registers. */
count = len >> (8 - shifts);
len -= count << (8 - shifts);
while (count--)
{
const __m128i* src = (const __m128i*)pSrcDst;
__m128i xmm0 = _mm_load_si128(src++);
__m128i xmm1 = _mm_load_si128(src++);
__m128i xmm2 = _mm_load_si128(src++);
__m128i xmm3 = _mm_load_si128(src++);
__m128i xmm4 = _mm_load_si128(src++);
__m128i xmm5 = _mm_load_si128(src++);
__m128i xmm6 = _mm_load_si128(src++);
__m128i xmm7 = _mm_load_si128(src);
xmm0 = _mm_slli_epi16(xmm0, val);
xmm1 = _mm_slli_epi16(xmm1, val);
xmm2 = _mm_slli_epi16(xmm2, val);
xmm3 = _mm_slli_epi16(xmm3, val);
xmm4 = _mm_slli_epi16(xmm4, val);
xmm5 = _mm_slli_epi16(xmm5, val);
xmm6 = _mm_slli_epi16(xmm6, val);
xmm7 = _mm_slli_epi16(xmm7, val);
__m128i* dst = (__m128i*)pSrcDst;
_mm_store_si128(dst++, xmm0);
_mm_store_si128(dst++, xmm1);
_mm_store_si128(dst++, xmm2);
_mm_store_si128(dst++, xmm3);
_mm_store_si128(dst++, xmm4);
_mm_store_si128(dst++, xmm5);
_mm_store_si128(dst++, xmm6);
_mm_store_si128(dst++, xmm7);
pSrcDst = (INT16*)dst;
}
/* Use a single 128-bit SSE register. */
count = len >> (5 - shifts);
len -= count << (5 - shifts);
while (count--)
{
const __m128i* src = (const __m128i*)pSrcDst;
__m128i xmm0 = LOAD_SI128(src);
xmm0 = _mm_slli_epi16(xmm0, val);
__m128i* dst = (__m128i*)pSrcDst;
_mm_store_si128(dst++, xmm0);
pSrcDst = (INT16*)dst;
}
/* Finish off the remainder. */
if (len > 0)
return generic->lShiftC_16s_inplace(pSrcDst, val, len);
return PRIMITIVES_SUCCESS;
}
#endif
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
generic = primitives_get_generic();
primitives_init_shift(prims);
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
prims->lShiftC_16s = sse2_lShiftC_16s;
prims->rShiftC_16s = sse2_rShiftC_16s;
prims->lShiftC_16u = sse2_lShiftC_16u;
prims->rShiftC_16u = sse2_rShiftC_16u;
}
#else
WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
WINPR_UNUSED(prims);
#endif
}