/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License. Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#ifdef __GNUC__
# pragma once
#endif

#ifndef __PRIM_TEMPLATES_H_INCLUDED__
#define __PRIM_TEMPLATES_H_INCLUDED__

/* These are prototypes for SSE (potentially NEON) routines that do a
 * simple SSE operation over an array of data. Since so much of this
 * code is shared except for the operation itself, these prototypes are
 * used rather than duplicating code. The naming convention depends on
 * the parameters: S=Source param; C=Constant; D=Destination.
 * All the macros have parameters for a fallback procedure if the data
 * is too small, and for an operation done "the slow way" at 16-byte edges.
 */

/* SSE3 note: If someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than use LDDQU for unaligned reads.
 */

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It can't easily do that if the value is stored in a variable,
 * so don't save it as an intermediate value.
 */
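
/* Note (an assumption, not stated in this header): the single-register loops
 * below use LOAD_SI128, which is expected to be provided by the including
 * file, e.g. mapped to _mm_load_si128 or _mm_lddqu_si128 depending on the
 * alignment guarantees it can make.
 */
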
/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
{ \
	int shifts; \
	UINT32 offBeatMask; \
	const _type_ *sptr = pSrc; \
	_type_ *dptr = pDst; \
	size_t count; \
	if (len < 16) /* pointless if too small */ \
	{ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	if (sizeof(_type_) == 1) shifts = 1; \
	else if (sizeof(_type_) == 2) shifts = 2; \
	else if (sizeof(_type_) == 4) shifts = 3; \
	else if (sizeof(_type_) == 8) shifts = 4; \
	offBeatMask = (1 << (shifts - 1)) - 1; \
	if ((ULONG_PTR) pDst & offBeatMask) \
	{ \
		/* Incrementing the pointer skips over 16-byte boundary. */ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* Get to the 16-byte boundary now. */ \
	while ((ULONG_PTR) dptr & 0x0f) \
	{ \
		_slowWay_; \
		if (--len == 0) return PRIMITIVES_SUCCESS; \
	} \
	/* Use 8 128-bit SSE registers. */ \
	count = len >> (8-shifts); \
	len -= count << (8-shifts); \
	if ((ULONG_PTR) sptr & 0x0f) \
	{ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, val); \
			xmm1 = _op_(xmm1, val); \
			xmm2 = _op_(xmm2, val); \
			xmm3 = _op_(xmm3, val); \
			xmm4 = _op_(xmm4, val); \
			xmm5 = _op_(xmm5, val); \
			xmm6 = _op_(xmm6, val); \
			xmm7 = _op_(xmm7, val); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm5); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm6); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm7); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	else \
	{ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm5 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm6 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm7 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, val); \
			xmm1 = _op_(xmm1, val); \
			xmm2 = _op_(xmm2, val); \
			xmm3 = _op_(xmm3, val); \
			xmm4 = _op_(xmm4, val); \
			xmm5 = _op_(xmm5, val); \
			xmm6 = _op_(xmm6, val); \
			xmm7 = _op_(xmm7, val); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm5); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm6); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm7); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	/* Use a single 128-bit SSE register. */ \
	count = len >> (5-shifts); \
	len -= count << (5-shifts); \
	while (count--) \
	{ \
		__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
		xmm0 = _op_(xmm0, val); \
		_mm_store_si128((__m128i *) dptr, xmm0); \
		dptr += (16/sizeof(_type_)); \
	} \
	/* Finish off the remainder. */ \
	while (len--) { _slowWay_; } \
	return PRIMITIVES_SUCCESS; \
}
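
/* Usage sketch (an illustration, not part of the original header): an SCD
 * instantiation pairs the template with a scalar fallback and a per-element
 * "slow way" that must match the SSE op's semantics. The routine names
 * sse3_lShiftC_16s and general_lShiftC_16s below are hypothetical.
 *
 * SSE3_SCD_ROUTINE(sse3_lShiftC_16s, INT16, general_lShiftC_16s,
 *     _mm_slli_epi16, *dptr++ = *sptr++ << val)
 */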

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
{ \
	int shifts; \
	UINT32 offBeatMask; \
	const _type_ *sptr = pSrc; \
	_type_ *dptr = pDst; \
	size_t count; \
	__m128i xmm0; \
	if (len < 16) /* pointless if too small */ \
	{ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	if (sizeof(_type_) == 1) shifts = 1; \
	else if (sizeof(_type_) == 2) shifts = 2; \
	else if (sizeof(_type_) == 4) shifts = 3; \
	else if (sizeof(_type_) == 8) shifts = 4; \
	offBeatMask = (1 << (shifts - 1)) - 1; \
	if ((ULONG_PTR) pDst & offBeatMask) \
	{ \
		/* Incrementing the pointer skips over 16-byte boundary. */ \
		return _fallback_(pSrc, val, pDst, len); \
	} \
	/* Get to the 16-byte boundary now. */ \
	while ((ULONG_PTR) dptr & 0x0f) \
	{ \
		_slowWay_; \
		if (--len == 0) return PRIMITIVES_SUCCESS; \
	} \
	/* Use 4 128-bit SSE registers. */ \
	count = len >> (7-shifts); \
	len -= count << (7-shifts); \
	xmm0 = _mm_set1_epi32(val); \
	if ((ULONG_PTR) sptr & 0x0f) \
	{ \
		while (count--) \
		{ \
			__m128i xmm1, xmm2, xmm3, xmm4; \
			xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			xmm2 = _op_(xmm2, xmm0); \
			xmm3 = _op_(xmm3, xmm0); \
			xmm4 = _op_(xmm4, xmm0); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	else \
	{ \
		while (count--) \
		{ \
			__m128i xmm1, xmm2, xmm3, xmm4; \
			xmm1 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm2 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm3 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm4 = _mm_load_si128((__m128i *) sptr); \
			sptr += (16/sizeof(_type_)); \
			xmm1 = _op_(xmm1, xmm0); \
			xmm2 = _op_(xmm2, xmm0); \
			xmm3 = _op_(xmm3, xmm0); \
			xmm4 = _op_(xmm4, xmm0); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm4); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	/* Use a single 128-bit SSE register. */ \
	count = len >> (5-shifts); \
	len -= count << (5-shifts); \
	while (count--) \
	{ \
		__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
		xmm1 = _op_(xmm1, xmm0); \
		_mm_store_si128((__m128i *) dptr, xmm1); \
		dptr += (16/sizeof(_type_)); \
	} \
	/* Finish off the remainder. */ \
	while (len--) { _slowWay_; } \
	return PRIMITIVES_SUCCESS; \
}
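
/* Usage sketch (hypothetical names, not part of the original header): the
 * PRE variant broadcasts the constant once with _mm_set1_epi32, so a 32-bit
 * element type matches the broadcast width naturally.
 *
 * SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
 *     _mm_and_si128, *dptr++ = *sptr++ & val)
 */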

/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
{ \
	int shifts; \
	UINT32 offBeatMask; \
	const _type_ *sptr1 = pSrc1; \
	const _type_ *sptr2 = pSrc2; \
	_type_ *dptr = pDst; \
	size_t count; \
	if (len < 16) /* pointless if too small */ \
	{ \
		return _fallback_(pSrc1, pSrc2, pDst, len); \
	} \
	if (sizeof(_type_) == 1) shifts = 1; \
	else if (sizeof(_type_) == 2) shifts = 2; \
	else if (sizeof(_type_) == 4) shifts = 3; \
	else if (sizeof(_type_) == 8) shifts = 4; \
	offBeatMask = (1 << (shifts - 1)) - 1; \
	if ((ULONG_PTR) pDst & offBeatMask) \
	{ \
		/* Incrementing the pointer skips over 16-byte boundary. */ \
		return _fallback_(pSrc1, pSrc2, pDst, len); \
	} \
	/* Get to the 16-byte boundary now. */ \
	while ((ULONG_PTR) dptr & 0x0f) \
	{ \
		_slowWay_; \
		if (--len == 0) return PRIMITIVES_SUCCESS; \
	} \
	/* Use 4 128-bit SSE registers. */ \
	count = len >> (7-shifts); \
	len -= count << (7-shifts); \
	if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
	{ \
		/* Unaligned loads */ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm4); \
			xmm1 = _op_(xmm1, xmm5); \
			xmm2 = _op_(xmm2, xmm6); \
			xmm3 = _op_(xmm3, xmm7); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	else \
	{ \
		/* Aligned loads */ \
		while (count--) \
		{ \
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
			xmm0 = _mm_load_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm1 = _mm_load_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm2 = _mm_load_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm3 = _mm_load_si128((__m128i *) sptr1); \
			sptr1 += (16/sizeof(_type_)); \
			xmm4 = _mm_load_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm5 = _mm_load_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm6 = _mm_load_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm7 = _mm_load_si128((__m128i *) sptr2); \
			sptr2 += (16/sizeof(_type_)); \
			xmm0 = _op_(xmm0, xmm4); \
			xmm1 = _op_(xmm1, xmm5); \
			xmm2 = _op_(xmm2, xmm6); \
			xmm3 = _op_(xmm3, xmm7); \
			_mm_store_si128((__m128i *) dptr, xmm0); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm1); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm2); \
			dptr += (16/sizeof(_type_)); \
			_mm_store_si128((__m128i *) dptr, xmm3); \
			dptr += (16/sizeof(_type_)); \
		} \
	} \
	/* Use a single 128-bit SSE register. */ \
	count = len >> (5-shifts); \
	len -= count << (5-shifts); \
	while (count--) \
	{ \
		__m128i xmm0, xmm1; \
		xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
		xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
		xmm0 = _op_(xmm0, xmm1); \
		_mm_store_si128((__m128i *) dptr, xmm0); \
		dptr += (16/sizeof(_type_)); \
	} \
	/* Finish off the remainder. */ \
	while (len--) { _slowWay_; } \
	return PRIMITIVES_SUCCESS; \
}
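
/* Usage sketch (hypothetical names, not part of the original header): an SSD
 * instantiation combines two sources element-wise; the scalar "slow way"
 * should match the SSE op's semantics (here both wrap on overflow).
 *
 * SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
 *     _mm_add_epi16, *dptr++ = (INT16) (*sptr1++ + *sptr2++))
 */
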
#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */