Updated primitives API and tests.
This commit is contained in:
parent
e860fde4bc
commit
99c418766c
@ -81,62 +81,62 @@ typedef pstatus_t (*__copy_8u_AC4r_t)(
|
|||||||
typedef pstatus_t (*__set_8u_t)(
|
typedef pstatus_t (*__set_8u_t)(
|
||||||
BYTE val,
|
BYTE val,
|
||||||
BYTE* pDst,
|
BYTE* pDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__set_32s_t)(
|
typedef pstatus_t (*__set_32s_t)(
|
||||||
INT32 val,
|
INT32 val,
|
||||||
INT32* pDst,
|
INT32* pDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__set_32u_t)(
|
typedef pstatus_t (*__set_32u_t)(
|
||||||
UINT32 val,
|
UINT32 val,
|
||||||
UINT32* pDst,
|
UINT32* pDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__zero_t)(
|
typedef pstatus_t (*__zero_t)(
|
||||||
void* pDst,
|
void* pDst,
|
||||||
size_t bytes);
|
size_t bytes);
|
||||||
typedef pstatus_t (*__alphaComp_argb_t)(
|
typedef pstatus_t (*__alphaComp_argb_t)(
|
||||||
const BYTE* pSrc1, INT32 src1Step,
|
const BYTE* pSrc1, UINT32 src1Step,
|
||||||
const BYTE* pSrc2, INT32 src2Step,
|
const BYTE* pSrc2, UINT32 src2Step,
|
||||||
BYTE* pDst, INT32 dstStep,
|
BYTE* pDst, UINT32 dstStep,
|
||||||
INT32 width, INT32 height);
|
UINT32 width, UINT32 height);
|
||||||
typedef pstatus_t (*__add_16s_t)(
|
typedef pstatus_t (*__add_16s_t)(
|
||||||
const INT16* pSrc1,
|
const INT16* pSrc1,
|
||||||
const INT16* pSrc2,
|
const INT16* pSrc2,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__lShiftC_16s_t)(
|
typedef pstatus_t (*__lShiftC_16s_t)(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
INT16* pSrcDst,
|
INT16* pSrcDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__lShiftC_16u_t)(
|
typedef pstatus_t (*__lShiftC_16u_t)(
|
||||||
const UINT16* pSrc,
|
const UINT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
UINT16* pSrcDst,
|
UINT16* pSrcDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__rShiftC_16s_t)(
|
typedef pstatus_t (*__rShiftC_16s_t)(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
INT16* pSrcDst,
|
INT16* pSrcDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__rShiftC_16u_t)(
|
typedef pstatus_t (*__rShiftC_16u_t)(
|
||||||
const UINT16* pSrc,
|
const UINT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
UINT16* pSrcDst,
|
UINT16* pSrcDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__shiftC_16s_t)(
|
typedef pstatus_t (*__shiftC_16s_t)(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT32 val,
|
INT32 val,
|
||||||
INT16* pSrcDst,
|
INT16* pSrcDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__shiftC_16u_t)(
|
typedef pstatus_t (*__shiftC_16u_t)(
|
||||||
const UINT16* pSrc,
|
const UINT16* pSrc,
|
||||||
INT32 val,
|
INT32 val,
|
||||||
UINT16* pSrcDst,
|
UINT16* pSrcDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__sign_16s_t)(
|
typedef pstatus_t (*__sign_16s_t)(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len);
|
UINT32 len);
|
||||||
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(
|
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(
|
||||||
const INT16* pSrc[3], INT32 srcStep,
|
const INT16* pSrc[3], INT32 srcStep,
|
||||||
BYTE* pDst, INT32 dstStep, UINT32 DstFormat,
|
BYTE* pDst, INT32 dstStep, UINT32 DstFormat,
|
||||||
@ -154,8 +154,8 @@ typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)(
|
|||||||
INT16* pDst[3], INT32 dstStep,
|
INT16* pDst[3], INT32 dstStep,
|
||||||
const prim_size_t* roi);
|
const prim_size_t* roi);
|
||||||
typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
|
typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
|
||||||
const INT16* pSrc[3], INT32 srcStep,
|
const INT16* const pSrc[3], UINT32 srcStep,
|
||||||
BYTE* pDst, INT32 dstStep, UINT32 DstFormat,
|
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
||||||
const prim_size_t* roi);
|
const prim_size_t* roi);
|
||||||
typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)(
|
typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)(
|
||||||
const BYTE* pSrc, INT32 srcStep,
|
const BYTE* pSrc, INT32 srcStep,
|
||||||
@ -177,11 +177,11 @@ typedef pstatus_t (*__YUV444ToRGB_8u_P3AC4R_t)(
|
|||||||
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
|
||||||
const prim_size_t* roi);
|
const prim_size_t* roi);
|
||||||
typedef pstatus_t (*__RGBToYUV420_8u_P3AC4R_t)(
|
typedef pstatus_t (*__RGBToYUV420_8u_P3AC4R_t)(
|
||||||
const BYTE* pSrc, UINT32 srcStep,
|
const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
|
||||||
BYTE* pDst[3], UINT32 dstStep[3],
|
BYTE* pDst[3], UINT32 dstStep[3],
|
||||||
const prim_size_t* roi);
|
const prim_size_t* roi);
|
||||||
typedef pstatus_t (*__RGBToYUV444_8u_P3AC4R_t)(
|
typedef pstatus_t (*__RGBToYUV444_8u_P3AC4R_t)(
|
||||||
const BYTE* pSrc, UINT32 srcStep,
|
const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
|
||||||
BYTE* pDst[3], UINT32 dstStep[3],
|
BYTE* pDst[3], UINT32 dstStep[3],
|
||||||
const prim_size_t* roi);
|
const prim_size_t* roi);
|
||||||
typedef pstatus_t (*__YUV420CombineToYUV444_t)(
|
typedef pstatus_t (*__YUV420CombineToYUV444_t)(
|
||||||
|
@ -1567,7 +1567,7 @@ INT32 avc420_compress(H264_CONTEXT* h264, BYTE* pSrcData, DWORD SrcFormat,
|
|||||||
roi.width = nSrcWidth;
|
roi.width = nSrcWidth;
|
||||||
roi.height = nSrcHeight;
|
roi.height = nSrcHeight;
|
||||||
|
|
||||||
prims->RGBToYUV420_8u_P3AC4R(pSrcData, nSrcStep, pYUVData, iStride, &roi);
|
prims->RGBToYUV420_8u_P3AC4R(pSrcData, SrcFormat, nSrcStep, pYUVData, iStride, &roi);
|
||||||
|
|
||||||
status = h264->subsystem->Compress(h264, ppDstData, pDstSize, 0);
|
status = h264->subsystem->Compress(h264, ppDstData, pDstSize, 0);
|
||||||
|
|
||||||
|
@ -33,11 +33,11 @@
|
|||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t general_YCoCgToRGB_8u_AC4R(
|
static pstatus_t general_YCoCgToRGB_8u_AC4R(
|
||||||
const BYTE* pSrc, INT32 srcStep,
|
const BYTE* pSrc, INT32 srcStep,
|
||||||
BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
|
BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
|
||||||
UINT32 width, UINT32 height,
|
UINT32 width, UINT32 height,
|
||||||
UINT8 shift,
|
UINT8 shift,
|
||||||
BOOL withAlpha)
|
BOOL withAlpha)
|
||||||
{
|
{
|
||||||
BYTE A;
|
BYTE A;
|
||||||
UINT32 x, y;
|
UINT32 x, y;
|
||||||
@ -66,8 +66,11 @@ static pstatus_t general_YCoCgToRGB_8u_AC4R(
|
|||||||
R = T + Co;
|
R = T + Co;
|
||||||
G = Y + Cg;
|
G = Y + Cg;
|
||||||
B = T - Co;
|
B = T - Co;
|
||||||
color = GetColor(DstFormat, MINMAX(R, 0, 255), MINMAX(G, 0, 255), MINMAX(B, 0,
|
|
||||||
255), A);
|
color = GetColor(DstFormat,
|
||||||
|
MINMAX(R, 0, 255), MINMAX(G, 0, 255),
|
||||||
|
MINMAX(B, 0, 255), A);
|
||||||
|
WriteColor(dptr, DstFormat, color);
|
||||||
dptr += GetBytesPerPixel(DstFormat);
|
dptr += GetBytesPerPixel(DstFormat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,8 +40,8 @@ static primitives_t* generic = NULL;
|
|||||||
#ifdef WITH_SSE2
|
#ifdef WITH_SSE2
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
|
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
|
||||||
const BYTE* pSrc, INT32 srcStep,
|
const BYTE* pSrc, UINT32 srcStep,
|
||||||
BYTE* pDst, INT32 dstStep,
|
BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
|
||||||
UINT32 width, UINT32 height,
|
UINT32 width, UINT32 height,
|
||||||
UINT8 shift,
|
UINT8 shift,
|
||||||
BOOL withAlpha)
|
BOOL withAlpha)
|
||||||
@ -70,8 +70,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
|
|||||||
{
|
{
|
||||||
/* Too small, or we'll never hit a 16-byte boundary. Punt. */
|
/* Too small, or we'll never hit a 16-byte boundary. Punt. */
|
||||||
return generic->YCoCgToRGB_8u_AC4R(
|
return generic->YCoCgToRGB_8u_AC4R(
|
||||||
pSrc, srcStep, pDst, dstStep,
|
pSrc, srcStep, pDst, DstFormat, dstStep,
|
||||||
width, height, shift, withAlpha, TRUE);
|
width, height, shift, withAlpha);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (h = 0; h < height; h++)
|
for (h = 0; h < height; h++)
|
||||||
@ -82,12 +82,16 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
|
|||||||
/* Get to a 16-byte destination boundary. */
|
/* Get to a 16-byte destination boundary. */
|
||||||
if ((ULONG_PTR) dptr & 0x0f)
|
if ((ULONG_PTR) dptr & 0x0f)
|
||||||
{
|
{
|
||||||
|
pstatus_t status;
|
||||||
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
|
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
|
||||||
|
|
||||||
if (startup > width) startup = width;
|
if (startup > width) startup = width;
|
||||||
|
|
||||||
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
|
status = generic->YCoCgToRGB_8u_AC4R(
|
||||||
startup, 1, shift, withAlpha, TRUE);
|
sptr, srcStep, dptr, DstFormat, dstStep,
|
||||||
|
startup, 1, shift, withAlpha);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return status;
|
||||||
sptr += startup * sizeof(UINT32);
|
sptr += startup * sizeof(UINT32);
|
||||||
dptr += startup * sizeof(UINT32);
|
dptr += startup * sizeof(UINT32);
|
||||||
w -= startup;
|
w -= startup;
|
||||||
@ -195,8 +199,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
|
|||||||
/* Handle any remainder pixels. */
|
/* Handle any remainder pixels. */
|
||||||
if (w > 0)
|
if (w > 0)
|
||||||
{
|
{
|
||||||
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
|
pstatus_t status;
|
||||||
w, 1, shift, withAlpha, TRUE);
|
status = generic->YCoCgToRGB_8u_AC4R(
|
||||||
|
sptr, srcStep, dptr, DstFormat, dstStep,
|
||||||
|
w, 1, shift, withAlpha);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return status;
|
||||||
|
|
||||||
sptr += w * sizeof(UINT32);
|
sptr += w * sizeof(UINT32);
|
||||||
dptr += w * sizeof(UINT32);
|
dptr += w * sizeof(UINT32);
|
||||||
}
|
}
|
||||||
@ -210,8 +219,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
|
|||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
|
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
|
||||||
const BYTE* pSrc, INT32 srcStep,
|
const BYTE* pSrc, UINT32 srcStep,
|
||||||
BYTE* pDst, INT32 dstStep,
|
BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
|
||||||
UINT32 width, UINT32 height,
|
UINT32 width, UINT32 height,
|
||||||
UINT8 shift,
|
UINT8 shift,
|
||||||
BOOL withAlpha)
|
BOOL withAlpha)
|
||||||
@ -240,9 +249,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
|
|||||||
{
|
{
|
||||||
/* Too small, or we'll never hit a 16-byte boundary. Punt. */
|
/* Too small, or we'll never hit a 16-byte boundary. Punt. */
|
||||||
return generic->YCoCgToRGB_8u_AC4R(
|
return generic->YCoCgToRGB_8u_AC4R(
|
||||||
pSrc, srcStep,
|
pSrc, srcStep, pDst, DstFormat, dstStep,
|
||||||
pDst, dstStep, width, height, shift,
|
width, height, shift, withAlpha);
|
||||||
withAlpha, FALSE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (h = 0; h < height; h++)
|
for (h = 0; h < height; h++)
|
||||||
@ -253,12 +261,17 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
|
|||||||
/* Get to a 16-byte destination boundary. */
|
/* Get to a 16-byte destination boundary. */
|
||||||
if ((ULONG_PTR) dptr & 0x0f)
|
if ((ULONG_PTR) dptr & 0x0f)
|
||||||
{
|
{
|
||||||
|
pstatus_t status;
|
||||||
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
|
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
|
||||||
|
|
||||||
if (startup > width) startup = width;
|
if (startup > width) startup = width;
|
||||||
|
|
||||||
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
|
status = generic->YCoCgToRGB_8u_AC4R(
|
||||||
startup, 1, shift, withAlpha, FALSE);
|
sptr, srcStep, dptr, DstFormat,
|
||||||
|
dstStep, startup, 1, shift, withAlpha);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return status;
|
||||||
|
|
||||||
sptr += startup * sizeof(UINT32);
|
sptr += startup * sizeof(UINT32);
|
||||||
dptr += startup * sizeof(UINT32);
|
dptr += startup * sizeof(UINT32);
|
||||||
w -= startup;
|
w -= startup;
|
||||||
@ -370,8 +383,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
|
|||||||
/* Handle any remainder pixels. */
|
/* Handle any remainder pixels. */
|
||||||
if (w > 0)
|
if (w > 0)
|
||||||
{
|
{
|
||||||
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
|
pstatus_t status;
|
||||||
w, 1, shift, withAlpha, FALSE);
|
status = generic->YCoCgToRGB_8u_AC4R(
|
||||||
|
sptr, srcStep, dptr, DstFormat, dstStep,
|
||||||
|
w, 1, shift, withAlpha);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return status;
|
||||||
|
|
||||||
sptr += w * sizeof(UINT32);
|
sptr += w * sizeof(UINT32);
|
||||||
dptr += w * sizeof(UINT32);
|
dptr += w * sizeof(UINT32);
|
||||||
}
|
}
|
||||||
@ -388,21 +406,29 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
|
|||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
|
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
|
||||||
const BYTE* pSrc, INT32 srcStep,
|
const BYTE* pSrc, INT32 srcStep,
|
||||||
BYTE* pDst, INT32 dstStep,
|
BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
|
||||||
UINT32 width, UINT32 height,
|
UINT32 width, UINT32 height,
|
||||||
UINT8 shift,
|
UINT8 shift,
|
||||||
BOOL withAlpha,
|
BOOL withAlpha)
|
||||||
BOOL invert)
|
|
||||||
{
|
{
|
||||||
if (invert)
|
// TODO: Need to implement proper color conversion!!!
|
||||||
|
return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat,
|
||||||
|
dstStep, width, height, shift, withAlpha);
|
||||||
|
|
||||||
|
switch(DstFormat)
|
||||||
{
|
{
|
||||||
return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, dstStep,
|
case PIXEL_FORMAT_BGRX32:
|
||||||
width, height, shift, withAlpha);
|
case PIXEL_FORMAT_BGRA32:
|
||||||
}
|
return ssse3_YCoCgRToRGB_8u_AC4R_invert(
|
||||||
else
|
pSrc, srcStep, pDst, DstFormat, dstStep,
|
||||||
{
|
width, height, shift, withAlpha);
|
||||||
return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, dstStep,
|
case PIXEL_FORMAT_RGBX32:
|
||||||
width, height, shift, withAlpha);
|
case PIXEL_FORMAT_RGBA32:
|
||||||
|
return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
|
||||||
|
pSrc, srcStep, pDst, DstFormat, dstStep,
|
||||||
|
width, height, shift, withAlpha);
|
||||||
|
default:
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* WITH_SSE2 */
|
#endif /* WITH_SSE2 */
|
||||||
|
@ -232,9 +232,9 @@ static pstatus_t general_YUV444SplitToYUV420(
|
|||||||
{
|
{
|
||||||
/* Filter */
|
/* Filter */
|
||||||
const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x]
|
const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x]
|
||||||
+ pSrcU1[2 * x + 1];
|
+ pSrcU1[2 * x + 1];
|
||||||
const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x]
|
const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x]
|
||||||
+ pSrcV1[2 * x + 1];
|
+ pSrcV1[2 * x + 1];
|
||||||
pU[x] = CLIP(u / 4L);
|
pU[x] = CLIP(u / 4L);
|
||||||
pV[x] = CLIP(v / 4L);
|
pV[x] = CLIP(v / 4L);
|
||||||
}
|
}
|
||||||
@ -331,7 +331,7 @@ static INLINE BYTE* writePixel(BYTE* dst, UINT32 format, BYTE Y, BYTE U, BYTE V)
|
|||||||
const BYTE r = YUV2R(Y, U, V);
|
const BYTE r = YUV2R(Y, U, V);
|
||||||
const BYTE g = YUV2G(Y, U, V);
|
const BYTE g = YUV2G(Y, U, V);
|
||||||
const BYTE b = YUV2B(Y, U, V);
|
const BYTE b = YUV2B(Y, U, V);
|
||||||
UINT32 color = GetColor(format, r, g, b, 0);
|
UINT32 color = GetColor(format, r, g, b, 0xFF);
|
||||||
WriteColor(dst, format, color);
|
WriteColor(dst, format, color);
|
||||||
return dst + GetBytesPerPixel(format);
|
return dst + GetBytesPerPixel(format);
|
||||||
}
|
}
|
||||||
@ -500,9 +500,10 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static pstatus_t general_RGBToYUV444_8u_P3AC4R(
|
static pstatus_t general_RGBToYUV444_8u_P3AC4R(
|
||||||
const BYTE* pSrc, const UINT32 srcStep,
|
const BYTE* pSrc, UINT32 SrcFormat, const UINT32 srcStep,
|
||||||
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
|
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
|
||||||
{
|
{
|
||||||
|
const UINT32 bpp = GetBytesPerPixel(SrcFormat);
|
||||||
UINT32 x, y;
|
UINT32 x, y;
|
||||||
UINT32 nWidth, nHeight;
|
UINT32 nWidth, nHeight;
|
||||||
nWidth = roi->width;
|
nWidth = roi->width;
|
||||||
@ -517,9 +518,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
|
|||||||
|
|
||||||
for (x = 0; x < nWidth; x++)
|
for (x = 0; x < nWidth; x++)
|
||||||
{
|
{
|
||||||
const BYTE B = pRGB[4 * x + 0];
|
BYTE B, G, R;
|
||||||
const BYTE G = pRGB[4 * x + 1];
|
const UINT32 color = ReadColor(&pRGB[x * bpp], SrcFormat);
|
||||||
const BYTE R = pRGB[4 * x + 2];
|
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
|
||||||
|
|
||||||
pY[x] = RGB2Y(R, G, B);
|
pY[x] = RGB2Y(R, G, B);
|
||||||
pU[x] = RGB2U(R, G, B);
|
pU[x] = RGB2U(R, G, B);
|
||||||
pV[x] = RGB2V(R, G, B);
|
pV[x] = RGB2V(R, G, B);
|
||||||
@ -530,9 +532,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
|
|||||||
}
|
}
|
||||||
|
|
||||||
static pstatus_t general_RGBToYUV420_8u_P3AC4R(
|
static pstatus_t general_RGBToYUV420_8u_P3AC4R(
|
||||||
const BYTE* pSrc, UINT32 srcStep,
|
const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
|
||||||
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
|
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
|
||||||
{
|
{
|
||||||
|
const UINT32 bpp = GetBytesPerPixel(SrcFormat);
|
||||||
UINT32 x, y;
|
UINT32 x, y;
|
||||||
UINT32 halfWidth;
|
UINT32 halfWidth;
|
||||||
UINT32 halfHeight;
|
UINT32 halfHeight;
|
||||||
@ -555,39 +558,50 @@ static pstatus_t general_RGBToYUV420_8u_P3AC4R(
|
|||||||
|
|
||||||
for (x = 0; x < halfWidth; x++)
|
for (x = 0; x < halfWidth; x++)
|
||||||
{
|
{
|
||||||
INT32 R, G, B;
|
UINT32 color;
|
||||||
INT32 Ra, Ga, Ba;
|
INT32 Ra, Ga, Ba;
|
||||||
const UINT32 val2x = (x * 2);
|
const UINT32 val2x = (x * 2);
|
||||||
const UINT32 val2x1 = val2x + 1;
|
const UINT32 val2x1 = val2x + 1;
|
||||||
|
BYTE B, G, R;
|
||||||
|
|
||||||
/* 1st pixel */
|
/* 1st pixel */
|
||||||
Ba = B = pRGB[val2x * 4 + 0];
|
color = ReadColor(&pRGB[val2x * bpp], SrcFormat);
|
||||||
Ga = G = pRGB[val2x * 4 + 1];
|
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
|
||||||
Ra = R = pRGB[val2x * 4 + 2];
|
|
||||||
|
Ba = B;
|
||||||
|
Ga = G;
|
||||||
|
Ra = R;
|
||||||
pY[val2x] = RGB2Y(R, G, B);
|
pY[val2x] = RGB2Y(R, G, B);
|
||||||
|
|
||||||
if (val2x1 < nWidth)
|
if (val2x1 < nWidth)
|
||||||
{
|
{
|
||||||
/* 2nd pixel */
|
/* 2nd pixel */
|
||||||
Ba += B = pRGB[val2x * 4 + 4];
|
color = ReadColor(&pRGB[val2x1 * bpp], SrcFormat);
|
||||||
Ga += G = pRGB[val2x * 4 + 5];
|
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
|
||||||
Ra += R = pRGB[val2x * 4 + 6];
|
Ba += B;
|
||||||
|
Ga += G;
|
||||||
|
Ra += R;
|
||||||
pY[val2x1] = RGB2Y(R, G, B);
|
pY[val2x1] = RGB2Y(R, G, B);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (val2y1 < nHeight)
|
if (val2y1 < nHeight)
|
||||||
{
|
{
|
||||||
/* 3rd pixel */
|
/* 3rd pixel */
|
||||||
Ba += B = pRGB1[val2x * 4 + 0];
|
color = ReadColor(&pRGB1[val2x * bpp], SrcFormat);
|
||||||
Ga += G = pRGB1[val2x * 4 + 1];
|
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
|
||||||
Ra += R = pRGB1[val2x * 4 + 2];
|
Ba += B;
|
||||||
|
Ga += G;
|
||||||
|
Ra += R;
|
||||||
pY1[val2x] = RGB2Y(R, G, B);
|
pY1[val2x] = RGB2Y(R, G, B);
|
||||||
|
|
||||||
if (val2x1 < nWidth)
|
if (val2x1 < nWidth)
|
||||||
{
|
{
|
||||||
/* 4th pixel */
|
/* 4th pixel */
|
||||||
Ba += B = pRGB1[val2x * 4 + 4];
|
color = ReadColor(&pRGB1[val2x1 * bpp], SrcFormat);
|
||||||
Ga += G = pRGB1[val2x * 4 + 5];
|
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
|
||||||
Ra += R = pRGB1[val2x * 4 + 6];
|
Ba += B;
|
||||||
|
Ga += G;
|
||||||
|
Ra += R;
|
||||||
pY1[val2x1] = RGB2Y(R, G, B);
|
pY1[val2x1] = RGB2Y(R, G, B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,11 @@ static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(
|
|||||||
UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV;
|
UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV;
|
||||||
__m128i r0, r1, r2, r3, r4, r5, r6, r7;
|
__m128i r0, r1, r2, r3, r4, r5, r6, r7;
|
||||||
__m128i* buffer;
|
__m128i* buffer;
|
||||||
|
|
||||||
|
// TODO: Need to implement proper color conversion!!!!!
|
||||||
|
return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep,
|
||||||
|
DstFormat, roi);
|
||||||
|
|
||||||
/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
|
/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
|
||||||
* last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */
|
* last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */
|
||||||
buffer = _aligned_malloc(4 * 16, 16);
|
buffer = _aligned_malloc(4 * 16, 16);
|
||||||
|
@ -30,7 +30,7 @@ static pstatus_t general_add_16s(
|
|||||||
const INT16* pSrc1,
|
const INT16* pSrc1,
|
||||||
const INT16* pSrc2,
|
const INT16* pSrc2,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
while (len--)
|
while (len--)
|
||||||
{
|
{
|
||||||
|
@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
|
|||||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s,
|
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s,
|
||||||
_mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
_mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1))
|
||||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -36,23 +36,19 @@
|
|||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t general_alphaComp_argb(
|
static pstatus_t general_alphaComp_argb(
|
||||||
const BYTE* pSrc1, INT32 src1Step,
|
const BYTE* pSrc1, UINT32 src1Step,
|
||||||
const BYTE* pSrc2, INT32 src2Step,
|
const BYTE* pSrc2, UINT32 src2Step,
|
||||||
BYTE* pDst, INT32 dstStep,
|
BYTE* pDst, UINT32 dstStep,
|
||||||
INT32 width, INT32 height)
|
UINT32 width, UINT32 height)
|
||||||
{
|
{
|
||||||
const UINT32* sptr1 = (const UINT32*) pSrc1;
|
UINT32 y;
|
||||||
const UINT32* sptr2 = (const UINT32*) pSrc2;
|
|
||||||
UINT32* dptr = (UINT32*) pDst;
|
|
||||||
int linebytes = width * sizeof(UINT32);
|
|
||||||
int src1Jump = (src1Step - linebytes) / sizeof(UINT32);
|
|
||||||
int src2Jump = (src2Step - linebytes) / sizeof(UINT32);
|
|
||||||
int dstJump = (dstStep - linebytes) / sizeof(UINT32);
|
|
||||||
int y;
|
|
||||||
|
|
||||||
for (y = 0; y < height; y++)
|
for (y = 0; y < height; y++)
|
||||||
{
|
{
|
||||||
int x;
|
const UINT32* sptr1 = (const UINT32*) (pSrc1 + y * src1Step);
|
||||||
|
const UINT32* sptr2 = (const UINT32*) (pSrc2 + y * src2Step);
|
||||||
|
UINT32* dptr = (UINT32*) (pDst + y * dstStep);
|
||||||
|
UINT32 x;
|
||||||
|
|
||||||
for (x = 0; x < width; x++)
|
for (x = 0; x < width; x++)
|
||||||
{
|
{
|
||||||
@ -92,10 +88,6 @@ static pstatus_t general_alphaComp_argb(
|
|||||||
*dptr++ = rb | ag;
|
*dptr++ = rb | ag;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sptr1 += src1Jump;
|
|
||||||
sptr2 += src2Jump;
|
|
||||||
dptr += dstJump;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return PRIMITIVES_SUCCESS;
|
return PRIMITIVES_SUCCESS;
|
||||||
|
@ -46,10 +46,10 @@ static primitives_t* generic = NULL;
|
|||||||
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||||
|
|
||||||
pstatus_t sse2_alphaComp_argb(
|
pstatus_t sse2_alphaComp_argb(
|
||||||
const BYTE* pSrc1, INT32 src1Step,
|
const BYTE* pSrc1, UINT32 src1Step,
|
||||||
const BYTE* pSrc2, INT32 src2Step,
|
const BYTE* pSrc2, UINT32 src2Step,
|
||||||
BYTE* pDst, INT32 dstStep,
|
BYTE* pDst, UINT32 dstStep,
|
||||||
INT32 width, INT32 height)
|
UINT32 width, UINT32 height)
|
||||||
{
|
{
|
||||||
const UINT32* sptr1 = (const UINT32*) pSrc1;
|
const UINT32* sptr1 = (const UINT32*) pSrc1;
|
||||||
const UINT32* sptr2 = (const UINT32*) pSrc2;
|
const UINT32* sptr2 = (const UINT32*) pSrc2;
|
||||||
@ -62,7 +62,7 @@ pstatus_t sse2_alphaComp_argb(
|
|||||||
if (width < 4) /* pointless if too small */
|
if (width < 4) /* pointless if too small */
|
||||||
{
|
{
|
||||||
return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
|
return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
|
||||||
pDst, dstStep, width, height);
|
pDst, dstStep, width, height);
|
||||||
}
|
}
|
||||||
|
|
||||||
dptr = (UINT32*) pDst;
|
dptr = (UINT32*) pDst;
|
||||||
@ -108,9 +108,13 @@ pstatus_t sse2_alphaComp_argb(
|
|||||||
|
|
||||||
if (leadIn)
|
if (leadIn)
|
||||||
{
|
{
|
||||||
generic->alphaComp_argb((const BYTE*) sptr1,
|
pstatus_t status;
|
||||||
src1Step, (const BYTE*) sptr2, src2Step,
|
status = generic->alphaComp_argb((const BYTE*) sptr1,
|
||||||
(BYTE*) dptr, dstStep, leadIn, 1);
|
src1Step, (const BYTE*) sptr2, src2Step,
|
||||||
|
(BYTE*) dptr, dstStep, leadIn, 1);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return status;
|
||||||
|
|
||||||
sptr1 += leadIn;
|
sptr1 += leadIn;
|
||||||
sptr2 += leadIn;
|
sptr2 += leadIn;
|
||||||
dptr += leadIn;
|
dptr += leadIn;
|
||||||
@ -181,9 +185,13 @@ pstatus_t sse2_alphaComp_argb(
|
|||||||
/* Finish off the remainder. */
|
/* Finish off the remainder. */
|
||||||
if (pixels)
|
if (pixels)
|
||||||
{
|
{
|
||||||
generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
|
pstatus_t status;
|
||||||
(const BYTE*) sptr2, src2Step,
|
status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
|
||||||
(BYTE*) dptr, dstStep, pixels, 1);
|
(const BYTE*) sptr2, src2Step,
|
||||||
|
(BYTE*) dptr, dstStep, pixels, 1);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return status;
|
||||||
|
|
||||||
sptr1 += pixels;
|
sptr1 += pixels;
|
||||||
sptr2 += pixels;
|
sptr2 += pixels;
|
||||||
dptr += pixels;
|
dptr += pixels;
|
||||||
@ -212,7 +220,7 @@ static pstatus_t ipp_alphaComp_argb(
|
|||||||
sz.width = width;
|
sz.width = width;
|
||||||
sz.height = height;
|
sz.height = height;
|
||||||
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
|
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
|
||||||
pDst, dstStep, sz, ippAlphaOver);
|
pDst, dstStep, sz, ippAlphaOver);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -262,7 +262,7 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
|
|||||||
|
|
||||||
for (y = 0; y < roi->height; y++)
|
for (y = 0; y < roi->height; y++)
|
||||||
{
|
{
|
||||||
int x;
|
UINT32 x;
|
||||||
|
|
||||||
for (x = 0; x < roi->width; ++x)
|
for (x = 0; x < roi->width; ++x)
|
||||||
{
|
{
|
||||||
@ -305,10 +305,10 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
|
|||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t general_RGBToRGB_16s8u_P3AC4R(
|
static pstatus_t general_RGBToRGB_16s8u_P3AC4R(
|
||||||
const INT16* pSrc[3], /* 16-bit R,G, and B arrays */
|
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
||||||
INT32 srcStep, /* bytes between rows in source data */
|
UINT32 srcStep, /* bytes between rows in source data */
|
||||||
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||||
INT32 dstStep, /* bytes between rows in dest data */
|
UINT32 dstStep, /* bytes between rows in dest data */
|
||||||
UINT32 DstFormat,
|
UINT32 DstFormat,
|
||||||
const prim_size_t* roi) /* region of interest */
|
const prim_size_t* roi) /* region of interest */
|
||||||
{
|
{
|
||||||
|
@ -91,7 +91,7 @@ static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
|
|||||||
{
|
{
|
||||||
/* We can't maintain 16-byte alignment. */
|
/* We can't maintain 16-byte alignment. */
|
||||||
return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
|
return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
|
||||||
pDst, dstStep, roi);
|
pDst, dstStep, roi);
|
||||||
}
|
}
|
||||||
|
|
||||||
zero = _mm_setzero_si128();
|
zero = _mm_setzero_si128();
|
||||||
@ -228,7 +228,7 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
|
|||||||
{
|
{
|
||||||
/* We can't maintain 16-byte alignment. */
|
/* We can't maintain 16-byte alignment. */
|
||||||
return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
|
return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
|
||||||
pDst, dstStep, roi);
|
pDst, dstStep, roi);
|
||||||
}
|
}
|
||||||
|
|
||||||
min = _mm_set1_epi16(-128 * 32);
|
min = _mm_set1_epi16(-128 * 32);
|
||||||
@ -357,10 +357,10 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
|
|||||||
_mm_set1_epi32(0xFFFFFFFFU)
|
_mm_set1_epi32(0xFFFFFFFFU)
|
||||||
|
|
||||||
pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
|
pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
|
||||||
const INT16* pSrc[3], /* 16-bit R,G, and B arrays */
|
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
|
||||||
INT32 srcStep, /* bytes between rows in source data */
|
UINT32 srcStep, /* bytes between rows in source data */
|
||||||
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
|
||||||
INT32 dstStep, /* bytes between rows in dest data */
|
UINT32 dstStep, /* bytes between rows in dest data */
|
||||||
UINT32 DstFormat,
|
UINT32 DstFormat,
|
||||||
const prim_size_t* roi) /* region of interest */
|
const prim_size_t* roi) /* region of interest */
|
||||||
{
|
{
|
||||||
@ -385,9 +385,13 @@ pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
|
|||||||
|| (dstStep & 0x0f))
|
|| (dstStep & 0x0f))
|
||||||
{
|
{
|
||||||
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
|
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
|
||||||
dstStep, DstFormat, roi);
|
dstStep, DstFormat, roi);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Need to update SSE code to allow color conversion!!!
|
||||||
|
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
|
||||||
|
dstStep, DstFormat, roi);
|
||||||
|
|
||||||
out = (BYTE*) pDst;
|
out = (BYTE*) pDst;
|
||||||
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
|
||||||
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
|
||||||
|
@ -29,7 +29,7 @@
|
|||||||
static pstatus_t general_set_8u(
|
static pstatus_t general_set_8u(
|
||||||
BYTE val,
|
BYTE val,
|
||||||
BYTE* pDst,
|
BYTE* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
memset((void*) pDst, (int) val, (size_t) len);
|
memset((void*) pDst, (int) val, (size_t) len);
|
||||||
return PRIMITIVES_SUCCESS;
|
return PRIMITIVES_SUCCESS;
|
||||||
@ -48,7 +48,7 @@ static pstatus_t general_zero(
|
|||||||
static pstatus_t general_set_32s(
|
static pstatus_t general_set_32s(
|
||||||
INT32 val,
|
INT32 val,
|
||||||
INT32* pDst,
|
INT32* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
INT32* dptr = (INT32*) pDst;
|
INT32* dptr = (INT32*) pDst;
|
||||||
size_t span, remaining;
|
size_t span, remaining;
|
||||||
@ -85,7 +85,7 @@ static pstatus_t general_set_32s(
|
|||||||
static pstatus_t general_set_32u(
|
static pstatus_t general_set_32u(
|
||||||
UINT32 val,
|
UINT32 val,
|
||||||
UINT32* pDst,
|
UINT32* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
UINT32* dptr = (UINT32*) pDst;
|
UINT32* dptr = (UINT32*) pDst;
|
||||||
size_t span, remaining;
|
size_t span, remaining;
|
||||||
|
@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
|
|||||||
static pstatus_t sse2_set_8u(
|
static pstatus_t sse2_set_8u(
|
||||||
BYTE val,
|
BYTE val,
|
||||||
BYTE* pDst,
|
BYTE* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
BYTE byte, *dptr;
|
BYTE byte, *dptr;
|
||||||
__m128i xmm0;
|
__m128i xmm0;
|
||||||
@ -126,7 +126,7 @@ static pstatus_t sse2_set_8u(
|
|||||||
static pstatus_t sse2_set_32u(
|
static pstatus_t sse2_set_32u(
|
||||||
UINT32 val,
|
UINT32 val,
|
||||||
UINT32* pDst,
|
UINT32* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
const primitives_t* prim = primitives_get_generic();
|
const primitives_t* prim = primitives_get_generic();
|
||||||
UINT32* dptr = (UINT32*) pDst;
|
UINT32* dptr = (UINT32*) pDst;
|
||||||
@ -218,7 +218,7 @@ static pstatus_t sse2_set_32u(
|
|||||||
static pstatus_t sse2_set_32s(
|
static pstatus_t sse2_set_32s(
|
||||||
INT32 val,
|
INT32 val,
|
||||||
INT32* pDst,
|
INT32* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
UINT32 uval = *((UINT32*) &val);
|
UINT32 uval = *((UINT32*) &val);
|
||||||
return sse2_set_32u(uval, (UINT32*) pDst, len);
|
return sse2_set_32u(uval, (UINT32*) pDst, len);
|
||||||
|
@ -24,9 +24,9 @@
|
|||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t general_lShiftC_16s(
|
static pstatus_t general_lShiftC_16s(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||||
|
|
||||||
@ -38,9 +38,9 @@ static pstatus_t general_lShiftC_16s(
|
|||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t general_rShiftC_16s(
|
static pstatus_t general_rShiftC_16s(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||||
|
|
||||||
@ -52,9 +52,9 @@ static pstatus_t general_rShiftC_16s(
|
|||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t general_lShiftC_16u(
|
static pstatus_t general_lShiftC_16u(
|
||||||
const UINT16* pSrc,
|
const UINT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
UINT16* pDst,
|
UINT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||||
|
|
||||||
@ -66,9 +66,9 @@ static pstatus_t general_lShiftC_16u(
|
|||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static pstatus_t general_rShiftC_16u(
|
static pstatus_t general_rShiftC_16u(
|
||||||
const UINT16* pSrc,
|
const UINT16* pSrc,
|
||||||
INT32 val,
|
UINT32 val,
|
||||||
UINT16* pDst,
|
UINT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||||
|
|
||||||
@ -82,7 +82,7 @@ static pstatus_t general_shiftC_16s(
|
|||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT32 val,
|
INT32 val,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||||
|
|
||||||
@ -95,7 +95,7 @@ static pstatus_t general_shiftC_16u(
|
|||||||
const UINT16* pSrc,
|
const UINT16* pSrc,
|
||||||
INT32 val,
|
INT32 val,
|
||||||
UINT16* pDst,
|
UINT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
if (val == 0) return PRIMITIVES_SUCCESS;
|
if (val == 0) return PRIMITIVES_SUCCESS;
|
||||||
|
|
||||||
|
@ -39,16 +39,16 @@ static primitives_t* generic = NULL;
|
|||||||
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
|
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
|
||||||
_mm_slli_epi16, *dptr++ = *sptr++ << val)
|
_mm_slli_epi16, *dptr++ = *sptr++ << val)
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s,
|
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s,
|
||||||
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
|
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u,
|
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u,
|
||||||
_mm_slli_epi16, *dptr++ = *sptr++ << val)
|
_mm_slli_epi16, *dptr++ = *sptr++ << val)
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u,
|
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u,
|
||||||
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
|
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
|
||||||
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -28,7 +28,7 @@
|
|||||||
static pstatus_t general_sign_16s(
|
static pstatus_t general_sign_16s(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
while (len--)
|
while (len--)
|
||||||
{
|
{
|
||||||
|
@ -35,7 +35,7 @@ static primitives_t* generic = NULL;
|
|||||||
static pstatus_t ssse3_sign_16s(
|
static pstatus_t ssse3_sign_16s(
|
||||||
const INT16* pSrc,
|
const INT16* pSrc,
|
||||||
INT16* pDst,
|
INT16* pDst,
|
||||||
INT32 len)
|
UINT32 len)
|
||||||
{
|
{
|
||||||
const INT16* sptr = (const INT16*) pSrc;
|
const INT16* sptr = (const INT16*) pSrc;
|
||||||
INT16* dptr = (INT16*) pDst;
|
INT16* dptr = (INT16*) pDst;
|
||||||
|
@ -44,143 +44,143 @@
|
|||||||
* SCD = Source, Constant, Destination
|
* SCD = Source, Constant, Destination
|
||||||
*/
|
*/
|
||||||
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
||||||
static pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
|
static pstatus_t _name_(const _type_ *pSrc, UINT32 val, _type_ *pDst, UINT32 len) \
|
||||||
{ \
|
{ \
|
||||||
INT32 shifts; \
|
INT32 shifts; \
|
||||||
UINT32 offBeatMask; \
|
UINT32 offBeatMask; \
|
||||||
const _type_ *sptr = pSrc; \
|
const _type_ *sptr = pSrc; \
|
||||||
_type_ *dptr = pDst; \
|
_type_ *dptr = pDst; \
|
||||||
size_t count; \
|
size_t count; \
|
||||||
if (len < 16) /* pointless if too small */ \
|
if (len < 16) /* pointless if too small */ \
|
||||||
{ \
|
{ \
|
||||||
return _fallback_(pSrc, val, pDst, len); \
|
return _fallback_(pSrc, val, pDst, len); \
|
||||||
} \
|
} \
|
||||||
if (sizeof(_type_) == 1) shifts = 1; \
|
if (sizeof(_type_) == 1) shifts = 1; \
|
||||||
else if (sizeof(_type_) == 2) shifts = 2; \
|
else if (sizeof(_type_) == 2) shifts = 2; \
|
||||||
else if (sizeof(_type_) == 4) shifts = 3; \
|
else if (sizeof(_type_) == 4) shifts = 3; \
|
||||||
else if (sizeof(_type_) == 8) shifts = 4; \
|
else if (sizeof(_type_) == 8) shifts = 4; \
|
||||||
offBeatMask = (1 << (shifts - 1)) - 1; \
|
offBeatMask = (1 << (shifts - 1)) - 1; \
|
||||||
if ((ULONG_PTR) pDst & offBeatMask) \
|
if ((ULONG_PTR) pDst & offBeatMask) \
|
||||||
{ \
|
{ \
|
||||||
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
||||||
return _fallback_(pSrc, val, pDst, len); \
|
return _fallback_(pSrc, val, pDst, len); \
|
||||||
} \
|
} \
|
||||||
/* Get to the 16-byte boundary now. */ \
|
/* Get to the 16-byte boundary now. */ \
|
||||||
while ((ULONG_PTR) dptr & 0x0f) \
|
while ((ULONG_PTR) dptr & 0x0f) \
|
||||||
{ \
|
{ \
|
||||||
_slowWay_; \
|
_slowWay_; \
|
||||||
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
||||||
} \
|
} \
|
||||||
/* Use 8 128-bit SSE registers. */ \
|
/* Use 8 128-bit SSE registers. */ \
|
||||||
count = len >> (8-shifts); \
|
count = len >> (8-shifts); \
|
||||||
len -= count << (8-shifts); \
|
len -= count << (8-shifts); \
|
||||||
if ((ULONG_PTR) sptr & 0x0f) \
|
if ((ULONG_PTR) sptr & 0x0f) \
|
||||||
{ \
|
{ \
|
||||||
while (count--) \
|
while (count--) \
|
||||||
{ \
|
{ \
|
||||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||||
xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm0 = _op_(xmm0, val); \
|
xmm0 = _op_(xmm0, val); \
|
||||||
xmm1 = _op_(xmm1, val); \
|
xmm1 = _op_(xmm1, val); \
|
||||||
xmm2 = _op_(xmm2, val); \
|
xmm2 = _op_(xmm2, val); \
|
||||||
xmm3 = _op_(xmm3, val); \
|
xmm3 = _op_(xmm3, val); \
|
||||||
xmm4 = _op_(xmm4, val); \
|
xmm4 = _op_(xmm4, val); \
|
||||||
xmm5 = _op_(xmm5, val); \
|
xmm5 = _op_(xmm5, val); \
|
||||||
xmm6 = _op_(xmm6, val); \
|
xmm6 = _op_(xmm6, val); \
|
||||||
xmm7 = _op_(xmm7, val); \
|
xmm7 = _op_(xmm7, val); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm5); \
|
_mm_store_si128((__m128i *) dptr, xmm5); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm6); \
|
_mm_store_si128((__m128i *) dptr, xmm6); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm7); \
|
_mm_store_si128((__m128i *) dptr, xmm7); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
else \
|
else \
|
||||||
{ \
|
{ \
|
||||||
while (count--) \
|
while (count--) \
|
||||||
{ \
|
{ \
|
||||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||||
xmm0 = _mm_load_si128((__m128i *) sptr); \
|
xmm0 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm1 = _mm_load_si128((__m128i *) sptr); \
|
xmm1 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm2 = _mm_load_si128((__m128i *) sptr); \
|
xmm2 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm3 = _mm_load_si128((__m128i *) sptr); \
|
xmm3 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm4 = _mm_load_si128((__m128i *) sptr); \
|
xmm4 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm5 = _mm_load_si128((__m128i *) sptr); \
|
xmm5 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm6 = _mm_load_si128((__m128i *) sptr); \
|
xmm6 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm7 = _mm_load_si128((__m128i *) sptr); \
|
xmm7 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm0 = _op_(xmm0, val); \
|
xmm0 = _op_(xmm0, val); \
|
||||||
xmm1 = _op_(xmm1, val); \
|
xmm1 = _op_(xmm1, val); \
|
||||||
xmm2 = _op_(xmm2, val); \
|
xmm2 = _op_(xmm2, val); \
|
||||||
xmm3 = _op_(xmm3, val); \
|
xmm3 = _op_(xmm3, val); \
|
||||||
xmm4 = _op_(xmm4, val); \
|
xmm4 = _op_(xmm4, val); \
|
||||||
xmm5 = _op_(xmm5, val); \
|
xmm5 = _op_(xmm5, val); \
|
||||||
xmm6 = _op_(xmm6, val); \
|
xmm6 = _op_(xmm6, val); \
|
||||||
xmm7 = _op_(xmm7, val); \
|
xmm7 = _op_(xmm7, val); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm5); \
|
_mm_store_si128((__m128i *) dptr, xmm5); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm6); \
|
_mm_store_si128((__m128i *) dptr, xmm6); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm7); \
|
_mm_store_si128((__m128i *) dptr, xmm7); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
/* Use a single 128-bit SSE register. */ \
|
/* Use a single 128-bit SSE register. */ \
|
||||||
count = len >> (5-shifts); \
|
count = len >> (5-shifts); \
|
||||||
len -= count << (5-shifts); \
|
len -= count << (5-shifts); \
|
||||||
while (count--) \
|
while (count--) \
|
||||||
{ \
|
{ \
|
||||||
__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
|
__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
|
||||||
xmm0 = _op_(xmm0, val); \
|
xmm0 = _op_(xmm0, val); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
} \
|
||||||
/* Finish off the remainder. */ \
|
/* Finish off the remainder. */ \
|
||||||
while (len--) { _slowWay_; } \
|
while (len--) { _slowWay_; } \
|
||||||
return PRIMITIVES_SUCCESS; \
|
return PRIMITIVES_SUCCESS; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------------
|
/* ----------------------------------------------------------------------------
|
||||||
@ -189,228 +189,230 @@
|
|||||||
*/
|
*/
|
||||||
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
||||||
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
|
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
|
||||||
{ \
|
{ \
|
||||||
int shifts; \
|
int shifts; \
|
||||||
UINT32 offBeatMask; \
|
UINT32 offBeatMask; \
|
||||||
const _type_ *sptr = pSrc; \
|
const _type_ *sptr = pSrc; \
|
||||||
_type_ *dptr = pDst; \
|
_type_ *dptr = pDst; \
|
||||||
size_t count; \
|
size_t count; \
|
||||||
__m128i xmm0; \
|
__m128i xmm0; \
|
||||||
if (len < 16) /* pointless if too small */ \
|
if (len < 16) /* pointless if too small */ \
|
||||||
{ \
|
{ \
|
||||||
return _fallback_(pSrc, val, pDst, len); \
|
return _fallback_(pSrc, val, pDst, len); \
|
||||||
} \
|
} \
|
||||||
if (sizeof(_type_) == 1) shifts = 1; \
|
if (sizeof(_type_) == 1) shifts = 1; \
|
||||||
else if (sizeof(_type_) == 2) shifts = 2; \
|
else if (sizeof(_type_) == 2) shifts = 2; \
|
||||||
else if (sizeof(_type_) == 4) shifts = 3; \
|
else if (sizeof(_type_) == 4) shifts = 3; \
|
||||||
else if (sizeof(_type_) == 8) shifts = 4; \
|
else if (sizeof(_type_) == 8) shifts = 4; \
|
||||||
offBeatMask = (1 << (shifts - 1)) - 1; \
|
offBeatMask = (1 << (shifts - 1)) - 1; \
|
||||||
if ((ULONG_PTR) pDst & offBeatMask) \
|
if ((ULONG_PTR) pDst & offBeatMask) \
|
||||||
{ \
|
{ \
|
||||||
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
||||||
return _fallback_(pSrc, val, pDst, len); \
|
return _fallback_(pSrc, val, pDst, len); \
|
||||||
} \
|
} \
|
||||||
/* Get to the 16-byte boundary now. */ \
|
/* Get to the 16-byte boundary now. */ \
|
||||||
while ((ULONG_PTR) dptr & 0x0f) \
|
while ((ULONG_PTR) dptr & 0x0f) \
|
||||||
{ \
|
{ \
|
||||||
_slowWay_; \
|
_slowWay_; \
|
||||||
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
||||||
} \
|
} \
|
||||||
/* Use 4 128-bit SSE registers. */ \
|
/* Use 4 128-bit SSE registers. */ \
|
||||||
count = len >> (7-shifts); \
|
count = len >> (7-shifts); \
|
||||||
len -= count << (7-shifts); \
|
len -= count << (7-shifts); \
|
||||||
xmm0 = _mm_set1_epi32(val); \
|
xmm0 = _mm_set1_epi32(val); \
|
||||||
if ((ULONG_PTR) sptr & 0x0f) \
|
if ((ULONG_PTR) sptr & 0x0f) \
|
||||||
{ \
|
{ \
|
||||||
while (count--) \
|
while (count--) \
|
||||||
{ \
|
{ \
|
||||||
__m128i xmm1, xmm2, xmm3, xmm4; \
|
__m128i xmm1, xmm2, xmm3, xmm4; \
|
||||||
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
|
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm1 = _op_(xmm1, xmm0); \
|
xmm1 = _op_(xmm1, xmm0); \
|
||||||
xmm2 = _op_(xmm2, xmm0); \
|
xmm2 = _op_(xmm2, xmm0); \
|
||||||
xmm3 = _op_(xmm3, xmm0); \
|
xmm3 = _op_(xmm3, xmm0); \
|
||||||
xmm4 = _op_(xmm4, xmm0); \
|
xmm4 = _op_(xmm4, xmm0); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
else \
|
else \
|
||||||
{ \
|
{ \
|
||||||
while (count--) \
|
while (count--) \
|
||||||
{ \
|
{ \
|
||||||
__m128i xmm1, xmm2, xmm3, xmm4; \
|
__m128i xmm1, xmm2, xmm3, xmm4; \
|
||||||
xmm1 = _mm_load_si128((__m128i *) sptr); \
|
xmm1 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm2 = _mm_load_si128((__m128i *) sptr); \
|
xmm2 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm3 = _mm_load_si128((__m128i *) sptr); \
|
xmm3 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm4 = _mm_load_si128((__m128i *) sptr); \
|
xmm4 = _mm_load_si128((__m128i *) sptr); \
|
||||||
sptr += (16/sizeof(_type_)); \
|
sptr += (16/sizeof(_type_)); \
|
||||||
xmm1 = _op_(xmm1, xmm0); \
|
xmm1 = _op_(xmm1, xmm0); \
|
||||||
xmm2 = _op_(xmm2, xmm0); \
|
xmm2 = _op_(xmm2, xmm0); \
|
||||||
xmm3 = _op_(xmm3, xmm0); \
|
xmm3 = _op_(xmm3, xmm0); \
|
||||||
xmm4 = _op_(xmm4, xmm0); \
|
xmm4 = _op_(xmm4, xmm0); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm4); \
|
_mm_store_si128((__m128i *) dptr, xmm4); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
/* Use a single 128-bit SSE register. */ \
|
/* Use a single 128-bit SSE register. */ \
|
||||||
count = len >> (5-shifts); \
|
count = len >> (5-shifts); \
|
||||||
len -= count << (5-shifts); \
|
len -= count << (5-shifts); \
|
||||||
while (count--) \
|
while (count--) \
|
||||||
{ \
|
{ \
|
||||||
__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
|
__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
|
||||||
xmm1 = _op_(xmm1, xmm0); \
|
xmm1 = _op_(xmm1, xmm0); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
} \
|
||||||
/* Finish off the remainder. */ \
|
/* Finish off the remainder. */ \
|
||||||
while (len--) { _slowWay_; } \
|
while (len--) { _slowWay_; } \
|
||||||
return PRIMITIVES_SUCCESS; \
|
return PRIMITIVES_SUCCESS; \
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ----------------------------------------------------------------------------
|
/* ----------------------------------------------------------------------------
|
||||||
* SSD = Source1, Source2, Destination
|
* SSD = Source1, Source2, Destination
|
||||||
*/
|
*/
|
||||||
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
|
||||||
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
|
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, UINT32 len) \
|
||||||
{ \
|
{ \
|
||||||
int shifts; \
|
int shifts; \
|
||||||
UINT32 offBeatMask; \
|
UINT32 offBeatMask; \
|
||||||
const _type_ *sptr1 = pSrc1; \
|
const _type_ *sptr1 = pSrc1; \
|
||||||
const _type_ *sptr2 = pSrc2; \
|
const _type_ *sptr2 = pSrc2; \
|
||||||
_type_ *dptr = pDst; \
|
_type_ *dptr = pDst; \
|
||||||
size_t count; \
|
size_t count; \
|
||||||
if (len < 16) /* pointless if too small */ \
|
if (len < 16) /* pointless if too small */ \
|
||||||
{ \
|
{ \
|
||||||
return _fallback_(pSrc1, pSrc2, pDst, len); \
|
return _fallback_(pSrc1, pSrc2, pDst, len); \
|
||||||
} \
|
} \
|
||||||
if (sizeof(_type_) == 1) shifts = 1; \
|
if (sizeof(_type_) == 1) shifts = 1; \
|
||||||
else if (sizeof(_type_) == 2) shifts = 2; \
|
else if (sizeof(_type_) == 2) shifts = 2; \
|
||||||
else if (sizeof(_type_) == 4) shifts = 3; \
|
else if (sizeof(_type_) == 4) shifts = 3; \
|
||||||
else if (sizeof(_type_) == 8) shifts = 4; \
|
else if (sizeof(_type_) == 8) shifts = 4; \
|
||||||
offBeatMask = (1 << (shifts - 1)) - 1; \
|
offBeatMask = (1 << (shifts - 1)) - 1; \
|
||||||
if ((ULONG_PTR) pDst & offBeatMask) \
|
if ((ULONG_PTR) pDst & offBeatMask) \
|
||||||
{ \
|
{ \
|
||||||
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
/* Incrementing the pointer skips over 16-byte boundary. */ \
|
||||||
return _fallback_(pSrc1, pSrc2, pDst, len); \
|
return _fallback_(pSrc1, pSrc2, pDst, len); \
|
||||||
} \
|
} \
|
||||||
/* Get to the 16-byte boundary now. */ \
|
/* Get to the 16-byte boundary now. */ \
|
||||||
while ((ULONG_PTR) dptr & 0x0f) \
|
while ((ULONG_PTR) dptr & 0x0f) \
|
||||||
{ \
|
{ \
|
||||||
_slowWay_; \
|
pstatus_t status; \
|
||||||
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
status = _slowWay_; \
|
||||||
} \
|
if (status != PRIMITIVES_SUCCESS) return status; \
|
||||||
/* Use 4 128-bit SSE registers. */ \
|
if (--len == 0) return PRIMITIVES_SUCCESS; \
|
||||||
count = len >> (7-shifts); \
|
} \
|
||||||
len -= count << (7-shifts); \
|
/* Use 4 128-bit SSE registers. */ \
|
||||||
if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
|
count = len >> (7-shifts); \
|
||||||
{ \
|
len -= count << (7-shifts); \
|
||||||
/* Unaligned loads */ \
|
if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
|
||||||
while (count--) \
|
{ \
|
||||||
{ \
|
/* Unaligned loads */ \
|
||||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
while (count--) \
|
||||||
xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
|
{ \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||||
xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
|
xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
|
xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
|
xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
|
xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
|
xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
|
xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
|
xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm0 = _op_(xmm0, xmm4); \
|
xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
|
||||||
xmm1 = _op_(xmm1, xmm5); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm2 = _op_(xmm2, xmm6); \
|
xmm0 = _op_(xmm0, xmm4); \
|
||||||
xmm3 = _op_(xmm3, xmm7); \
|
xmm1 = _op_(xmm1, xmm5); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
xmm2 = _op_(xmm2, xmm6); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
xmm3 = _op_(xmm3, xmm7); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||||
} \
|
dptr += (16/sizeof(_type_)); \
|
||||||
else \
|
} \
|
||||||
{ \
|
} \
|
||||||
/* Aligned loads */ \
|
else \
|
||||||
while (count--) \
|
{ \
|
||||||
{ \
|
/* Aligned loads */ \
|
||||||
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
while (count--) \
|
||||||
xmm0 = _mm_load_si128((__m128i *) sptr1); \
|
{ \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
|
||||||
xmm1 = _mm_load_si128((__m128i *) sptr1); \
|
xmm0 = _mm_load_si128((__m128i *) sptr1); \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm2 = _mm_load_si128((__m128i *) sptr1); \
|
xmm1 = _mm_load_si128((__m128i *) sptr1); \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm3 = _mm_load_si128((__m128i *) sptr1); \
|
xmm2 = _mm_load_si128((__m128i *) sptr1); \
|
||||||
sptr1 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm4 = _mm_load_si128((__m128i *) sptr2); \
|
xmm3 = _mm_load_si128((__m128i *) sptr1); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr1 += (16/sizeof(_type_)); \
|
||||||
xmm5 = _mm_load_si128((__m128i *) sptr2); \
|
xmm4 = _mm_load_si128((__m128i *) sptr2); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm6 = _mm_load_si128((__m128i *) sptr2); \
|
xmm5 = _mm_load_si128((__m128i *) sptr2); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm7 = _mm_load_si128((__m128i *) sptr2); \
|
xmm6 = _mm_load_si128((__m128i *) sptr2); \
|
||||||
sptr2 += (16/sizeof(_type_)); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm0 = _op_(xmm0, xmm4); \
|
xmm7 = _mm_load_si128((__m128i *) sptr2); \
|
||||||
xmm1 = _op_(xmm1, xmm5); \
|
sptr2 += (16/sizeof(_type_)); \
|
||||||
xmm2 = _op_(xmm2, xmm6); \
|
xmm0 = _op_(xmm0, xmm4); \
|
||||||
xmm3 = _op_(xmm3, xmm7); \
|
xmm1 = _op_(xmm1, xmm5); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
xmm2 = _op_(xmm2, xmm6); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
xmm3 = _op_(xmm3, xmm7); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm1); \
|
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm2); \
|
_mm_store_si128((__m128i *) dptr, xmm1); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm3); \
|
_mm_store_si128((__m128i *) dptr, xmm2); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
dptr += (16/sizeof(_type_)); \
|
||||||
} \
|
_mm_store_si128((__m128i *) dptr, xmm3); \
|
||||||
} \
|
dptr += (16/sizeof(_type_)); \
|
||||||
/* Use a single 128-bit SSE register. */ \
|
} \
|
||||||
count = len >> (5-shifts); \
|
} \
|
||||||
len -= count << (5-shifts); \
|
/* Use a single 128-bit SSE register. */ \
|
||||||
while (count--) \
|
count = len >> (5-shifts); \
|
||||||
{ \
|
len -= count << (5-shifts); \
|
||||||
__m128i xmm0, xmm1; \
|
while (count--) \
|
||||||
xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
|
{ \
|
||||||
xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
|
__m128i xmm0, xmm1; \
|
||||||
xmm0 = _op_(xmm0, xmm1); \
|
xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
|
||||||
_mm_store_si128((__m128i *) dptr, xmm0); \
|
xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
|
||||||
dptr += (16/sizeof(_type_)); \
|
xmm0 = _op_(xmm0, xmm1); \
|
||||||
} \
|
_mm_store_si128((__m128i *) dptr, xmm0); \
|
||||||
/* Finish off the remainder. */ \
|
dptr += (16/sizeof(_type_)); \
|
||||||
while (len--) { _slowWay_; } \
|
} \
|
||||||
return PRIMITIVES_SUCCESS; \
|
/* Finish off the remainder. */ \
|
||||||
|
while (len--) { _slowWay_; } \
|
||||||
|
return PRIMITIVES_SUCCESS; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */
|
#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */
|
||||||
|
@ -81,6 +81,6 @@ primitives_t* primitives_get_generic(void)
|
|||||||
if (!pPrimitivesGenericInitialized)
|
if (!pPrimitivesGenericInitialized)
|
||||||
primitives_init_generic();
|
primitives_init_generic();
|
||||||
|
|
||||||
return &pPrimitives;
|
return &pPrimitivesGeneric;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,7 +26,7 @@ static BOOL test_add16s_func(void)
|
|||||||
pstatus_t status;
|
pstatus_t status;
|
||||||
|
|
||||||
INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]), ALIGN(src2[FUNC_TEST_SIZE + 3]),
|
INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]), ALIGN(src2[FUNC_TEST_SIZE + 3]),
|
||||||
ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]);
|
ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]);
|
||||||
|
|
||||||
char testStr[256];
|
char testStr[256];
|
||||||
testStr[0] = '\0';
|
testStr[0] = '\0';
|
||||||
@ -50,7 +50,7 @@ static BOOL test_add16s_func(void)
|
|||||||
static BOOL test_add16s_speed(void)
|
static BOOL test_add16s_speed(void)
|
||||||
{
|
{
|
||||||
BYTE ALIGN(src1[MAX_TEST_SIZE + 3]), ALIGN(src2[MAX_TEST_SIZE + 3]),
|
BYTE ALIGN(src1[MAX_TEST_SIZE + 3]), ALIGN(src2[MAX_TEST_SIZE + 3]),
|
||||||
ALIGN(dst[MAX_TEST_SIZE + 3]);
|
ALIGN(dst[MAX_TEST_SIZE + 3]);
|
||||||
|
|
||||||
if (!g_TestPrimitivesPerformance)
|
if (!g_TestPrimitivesPerformance)
|
||||||
return TRUE;
|
return TRUE;
|
||||||
@ -59,7 +59,8 @@ static BOOL test_add16s_speed(void)
|
|||||||
winpr_RAND(src2, sizeof(src2));
|
winpr_RAND(src2, sizeof(src2));
|
||||||
|
|
||||||
if (!speed_test("add16s", "aligned", g_Iterations,
|
if (!speed_test("add16s", "aligned", g_Iterations,
|
||||||
generic->add_16s, optimized->add_16s,
|
(speed_test_fkt)generic->add_16s,
|
||||||
|
(speed_test_fkt)optimized->add_16s,
|
||||||
src1, src2, dst, FUNC_TEST_SIZE))
|
src1, src2, dst, FUNC_TEST_SIZE))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
@ -72,8 +73,11 @@ int TestPrimitivesAdd(int argc, char* argv[])
|
|||||||
if (!test_add16s_func())
|
if (!test_add16s_func())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (!test_add16s_speed())
|
if (g_TestPrimitivesPerformance)
|
||||||
return -1;
|
{
|
||||||
|
if (!test_add16s_speed())
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -33,8 +33,13 @@ static const int block_size[] = { 4, 64, 256 };
|
|||||||
#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
|
#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
|
||||||
#define BLU(_c_) ((_c_) & 0x000000FFU)
|
#define BLU(_c_) ((_c_) & 0x000000FFU)
|
||||||
#define TOLERANCE 1
|
#define TOLERANCE 1
|
||||||
#define PIXEL(_addr_, _bytes_, _x_, _y_) \
|
static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_)
|
||||||
((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_)))
|
{
|
||||||
|
const BYTE* addr = _addr_ + _x_ * sizeof(UINT32) + _y_ * _bytes_;
|
||||||
|
|
||||||
|
return (const UINT32*)addr;
|
||||||
|
}
|
||||||
|
|
||||||
#define SRC1_WIDTH 6
|
#define SRC1_WIDTH 6
|
||||||
#define SRC1_HEIGHT 6
|
#define SRC1_HEIGHT 6
|
||||||
#define SRC2_WIDTH 7
|
#define SRC2_WIDTH 7
|
||||||
@ -46,8 +51,8 @@ static const int block_size[] = { 4, 64, 256 };
|
|||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static UINT32 alpha_add(
|
static UINT32 alpha_add(
|
||||||
UINT32 c1,
|
UINT32 c1,
|
||||||
UINT32 c2)
|
UINT32 c2)
|
||||||
{
|
{
|
||||||
UINT32 a1 = ALF(c1);
|
UINT32 a1 = ALF(c1);
|
||||||
UINT32 r1 = RED(c1);
|
UINT32 r1 = RED(c1);
|
||||||
@ -66,8 +71,8 @@ static UINT32 alpha_add(
|
|||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static UINT32 colordist(
|
static UINT32 colordist(
|
||||||
UINT32 c1,
|
UINT32 c1,
|
||||||
UINT32 c2)
|
UINT32 c2)
|
||||||
{
|
{
|
||||||
int d, maxd = 0;
|
int d, maxd = 0;
|
||||||
d = ABS(ALF(c1) - ALF(c2));
|
d = ABS(ALF(c1) - ALF(c2));
|
||||||
@ -90,10 +95,10 @@ static UINT32 colordist(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static BOOL check(const BYTE* pSrc1, INT32 src1Step,
|
static BOOL check(const BYTE* pSrc1, UINT32 src1Step,
|
||||||
const BYTE* pSrc2, INT32 src2Step,
|
const BYTE* pSrc2, UINT32 src2Step,
|
||||||
BYTE* pDst, INT32 dstStep,
|
BYTE* pDst, UINT32 dstStep,
|
||||||
INT32 width, INT32 height)
|
UINT32 width, UINT32 height)
|
||||||
{
|
{
|
||||||
UINT32 x, y;
|
UINT32 x, y;
|
||||||
for (y = 0; y < height; ++y)
|
for (y = 0; y < height; ++y)
|
||||||
@ -120,14 +125,14 @@ static BOOL check(const BYTE* pSrc1, INT32 src1Step,
|
|||||||
static BOOL test_alphaComp_func(void)
|
static BOOL test_alphaComp_func(void)
|
||||||
{
|
{
|
||||||
pstatus_t status;
|
pstatus_t status;
|
||||||
BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT]);
|
BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]);
|
||||||
BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT]);
|
BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]);
|
||||||
BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT]);
|
BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]);
|
||||||
char testStr[256];
|
|
||||||
UINT32* ptr;
|
UINT32* ptr;
|
||||||
UINT32 i;
|
UINT32 i;
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND((BYTE*)src1, sizeof(src1));
|
winpr_RAND((BYTE*)src1, sizeof(src1));
|
||||||
|
|
||||||
/* Special-case the first two values */
|
/* Special-case the first two values */
|
||||||
src1[0] &= 0x00FFFFFFU;
|
src1[0] &= 0x00FFFFFFU;
|
||||||
src1[1] |= 0xFF000000U;
|
src1[1] |= 0xFF000000U;
|
||||||
@ -141,8 +146,8 @@ static BOOL test_alphaComp_func(void)
|
|||||||
memset(dst1, 0, sizeof(dst1));
|
memset(dst1, 0, sizeof(dst1));
|
||||||
|
|
||||||
status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH,
|
status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH,
|
||||||
src2, 4 * SRC2_WIDTH,
|
src2, 4 * SRC2_WIDTH,
|
||||||
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
||||||
if (status != PRIMITIVES_SUCCESS)
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
@ -152,8 +157,8 @@ static BOOL test_alphaComp_func(void)
|
|||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
status = optimized->alphaComp_argb((const BYTE*) src1, 4 * SRC1_WIDTH,
|
status = optimized->alphaComp_argb((const BYTE*) src1, 4 * SRC1_WIDTH,
|
||||||
(const BYTE*) src2, 4 * SRC2_WIDTH,
|
(const BYTE*) src2, 4 * SRC2_WIDTH,
|
||||||
(BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
(BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
|
||||||
if (status != PRIMITIVES_SUCCESS)
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
@ -188,7 +193,8 @@ static int test_alphaComp_speed(void)
|
|||||||
memset(dst1, 0, sizeof(dst1));
|
memset(dst1, 0, sizeof(dst1));
|
||||||
|
|
||||||
if (!speed_test("add16s", "aligned", g_Iterations,
|
if (!speed_test("add16s", "aligned", g_Iterations,
|
||||||
generic->alphaComp_argb, optimized->alphaComp_argb,
|
(speed_test_fkt)generic->alphaComp_argb,
|
||||||
|
(speed_test_fkt)optimized->alphaComp_argb,
|
||||||
src1, 4 * SRC1_WIDTH,
|
src1, 4 * SRC1_WIDTH,
|
||||||
src2, 4 * SRC2_WIDTH,
|
src2, 4 * SRC2_WIDTH,
|
||||||
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
|
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
|
||||||
@ -203,8 +209,11 @@ int TestPrimitivesAlphaComp(int argc, char* argv[])
|
|||||||
if (!test_alphaComp_func())
|
if (!test_alphaComp_func())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (!test_alphaComp_speed())
|
if (g_TestPrimitivesPerformance)
|
||||||
return -1;
|
{
|
||||||
|
if (!test_alphaComp_speed())
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -20,99 +20,87 @@
|
|||||||
#include "prim_test.h"
|
#include "prim_test.h"
|
||||||
|
|
||||||
#define FUNC_TEST_SIZE 65536
|
#define FUNC_TEST_SIZE 65536
|
||||||
static const int ANDOR_PRETEST_ITERATIONS = 100000;
|
|
||||||
static const int TEST_TIME = 2.0; // seconds
|
|
||||||
|
|
||||||
#define VALUE (0xA5A5A5A5U)
|
#define VALUE (0xA5A5A5A5U)
|
||||||
|
|
||||||
/* ========================================================================= */
|
/* ========================================================================= */
|
||||||
|
static BOOL test_and_32u_impl(const char* name, __andC_32u_t fkt,
|
||||||
|
const UINT32* src, const UINT32 val,
|
||||||
|
UINT32* dst, size_t size)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
pstatus_t status = fkt(src, val, dst, size);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
for (i = 0; i < size; ++i)
|
||||||
|
{
|
||||||
|
if (dst[i] != (src[i] & val))
|
||||||
|
{
|
||||||
|
printf("AND %s FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||||
|
name, i, src[i], val, src[i] & val, dst[i]);
|
||||||
|
return FALSE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
static BOOL test_and_32u_func(void)
|
static BOOL test_and_32u_func(void)
|
||||||
{
|
{
|
||||||
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
|
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
|
||||||
int failed = 0;
|
|
||||||
int i;
|
|
||||||
char testStr[256];
|
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND(src, sizeof(src));
|
|
||||||
generic->andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
|
|
||||||
strcat(testStr, " general");
|
|
||||||
|
|
||||||
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
{
|
|
||||||
if (dst[i] != (src[i] & VALUE))
|
|
||||||
{
|
|
||||||
printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
|
||||||
i, src[i], VALUE, src[i] & VALUE, dst[i]);
|
|
||||||
++failed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef WITH_SSE2
|
if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u,
|
||||||
|
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u,
|
||||||
|
src + 1, VALUE, dst + 2, FUNC_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u,
|
||||||
|
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u,
|
||||||
|
src + 1, VALUE, dst + 2, FUNC_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
return TRUE;
|
||||||
{
|
|
||||||
strcat(testStr, " SSE3");
|
|
||||||
/* Aligned */
|
|
||||||
memset(dst, 0, sizeof(dst));
|
|
||||||
sse3_andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
|
|
||||||
|
|
||||||
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
|
|
||||||
{
|
|
||||||
if (dst[i] != (src[i] & VALUE))
|
|
||||||
{
|
|
||||||
printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
|
||||||
i, src[i], VALUE, src[i] & VALUE, dst[i]);
|
|
||||||
++failed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Unaligned */
|
|
||||||
memset(dst, 0, sizeof(dst));
|
|
||||||
sse3_andC_32u(src + 1, VALUE, dst + 2, FUNC_TEST_SIZE);
|
|
||||||
|
|
||||||
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
|
|
||||||
{
|
|
||||||
if (dst[i + 1] != (src[i] & VALUE))
|
|
||||||
{
|
|
||||||
printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
|
||||||
i, src[i], VALUE, src[i] & VALUE, dst[i + 1]);
|
|
||||||
++failed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* i386 */
|
|
||||||
|
|
||||||
if (!failed) printf("All and_32u tests passed (%s).\n", testStr);
|
|
||||||
|
|
||||||
return (failed > 0) ? FAILURE : SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static BOOL test_and_32u_speed(void)
|
static BOOL test_and_32u_speed(void)
|
||||||
{
|
{
|
||||||
UINT32 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
|
UINT32 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
|
||||||
winpr_RAND(src, sizeof(src));
|
|
||||||
andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst,
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
|
|
||||||
andC_32u_speed_test("and32u", "unaligned", src + 1, NULL, VALUE, dst,
|
if (!speed_test("andC_32u", "aligned", g_Iterations,
|
||||||
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
|
(speed_test_fkt)generic->andC_32u,
|
||||||
return SUCCESS;
|
(speed_test_fkt)optimized->andC_32u,
|
||||||
|
src + 1, VALUE, dst + 1, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
if (!speed_test("andC_32u", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->andC_32u,
|
||||||
|
(speed_test_fkt)optimized->andC_32u,
|
||||||
|
src + 1, VALUE, dst + 2, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ========================================================================= */
|
/* ========================================================================= */
|
||||||
static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
|
static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
|
||||||
{
|
{
|
||||||
UINT32 i;
|
UINT32 i;
|
||||||
UINT32 failed = 0;
|
|
||||||
|
|
||||||
for (i = 1; i <= size; ++i)
|
for (i = 0; i < size; ++i)
|
||||||
{
|
{
|
||||||
if (dst[i] != (src[i] | value))
|
if (dst[i] != (src[i] | value))
|
||||||
{
|
{
|
||||||
printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
|
||||||
i, src[i], value, src[i] | value, dst[i]);
|
i, src[i], value, src[i] | value, dst[i]);
|
||||||
++failed;
|
return FALSE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -123,8 +111,7 @@ static BOOL test_or_32u_func(void)
|
|||||||
{
|
{
|
||||||
pstatus_t status;
|
pstatus_t status;
|
||||||
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
|
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
|
||||||
char testStr[256];
|
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND((BYTE*)src, sizeof(src));
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
|
status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
|
||||||
@ -153,7 +140,8 @@ static BOOL test_or_32u_speed(void)
|
|||||||
winpr_RAND((BYTE*)src, sizeof(src));
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
if (!speed_test("add16s", "aligned", g_Iterations,
|
if (!speed_test("add16s", "aligned", g_Iterations,
|
||||||
generic->orC_32u, optimized->orC_32u,
|
(speed_test_fkt)generic->orC_32u,
|
||||||
|
(speed_test_fkt)optimized->orC_32u,
|
||||||
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
|
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
@ -167,14 +155,16 @@ int TestPrimitivesAndOr(int argc, char* argv[])
|
|||||||
if (!test_and_32u_func())
|
if (!test_and_32u_func())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (!test_and_32u_speed())
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (!test_or_32u_func())
|
if (!test_or_32u_func())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (!test_or_32u_speed())
|
if (g_TestPrimitivesPerformance)
|
||||||
return -1;
|
{
|
||||||
|
if (!test_and_32u_speed())
|
||||||
|
return -1;
|
||||||
|
if (!test_or_32u_speed())
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -24,19 +24,16 @@ static const int YCBCR_TRIAL_ITERATIONS = 1000;
|
|||||||
static const float TEST_TIME = 4.0;
|
static const float TEST_TIME = 4.0;
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
int test_RGBToRGB_16s8u_P3AC4R_func(void)
|
static BOOL test_RGBToRGB_16s8u_P3AC4R_func(void)
|
||||||
{
|
{
|
||||||
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
||||||
UINT32 ALIGN(out1[4096]);
|
UINT32 ALIGN(out1[4096]);
|
||||||
#ifdef WITH_SSE2
|
|
||||||
UINT32 ALIGN(out2[4096]);
|
UINT32 ALIGN(out2[4096]);
|
||||||
#endif
|
|
||||||
int i;
|
int i;
|
||||||
int failed = 0;
|
BOOL failed = FALSE;
|
||||||
char testStr[256];
|
|
||||||
INT16* ptrs[3];
|
INT16* ptrs[3];
|
||||||
prim_size_t roi = { 64, 64 };
|
prim_size_t roi = { 64, 64 };
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND((BYTE*)r, sizeof(r));
|
winpr_RAND((BYTE*)r, sizeof(r));
|
||||||
winpr_RAND((BYTE*)g, sizeof(g));
|
winpr_RAND((BYTE*)g, sizeof(g));
|
||||||
winpr_RAND((BYTE*)b, sizeof(b));
|
winpr_RAND((BYTE*)b, sizeof(b));
|
||||||
@ -52,56 +49,38 @@ int test_RGBToRGB_16s8u_P3AC4R_func(void)
|
|||||||
ptrs[0] = r;
|
ptrs[0] = r;
|
||||||
ptrs[1] = g;
|
ptrs[1] = g;
|
||||||
ptrs[2] = b;
|
ptrs[2] = b;
|
||||||
generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
|
if (generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
|
||||||
(BYTE*) out1, 64 * 4, &roi);
|
(BYTE*) out1, 64 * 4, PIXEL_FORMAT_RGBA32,
|
||||||
#ifdef WITH_SSE2
|
&roi) != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
|
if (optimized->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
|
||||||
|
(BYTE*) out2, 64 * 4, PIXEL_FORMAT_RGBA32,
|
||||||
|
&roi) != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
for (i = 0; i < 4096; ++i)
|
||||||
{
|
{
|
||||||
strcat(testStr, " SSE2");
|
if (out1[i] != out2[i])
|
||||||
sse2_RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
|
|
||||||
(BYTE*) out2, 64 * 4, &roi);
|
|
||||||
|
|
||||||
for (i = 0; i < 4096; ++i)
|
|
||||||
{
|
{
|
||||||
if (out1[i] != out2[i])
|
printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
|
||||||
{
|
i, out1[i], i, out2[i]);
|
||||||
printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
|
failed = TRUE;
|
||||||
i, out1[i], i, out2[i]);
|
|
||||||
failed = 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* i386 */
|
return !failed;
|
||||||
|
|
||||||
if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr);
|
|
||||||
|
|
||||||
return (failed > 0) ? FAILURE : SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static const prim_size_t roi64x64 = { 64, 64 };
|
static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void)
|
||||||
STD_SPEED_TEST(
|
|
||||||
rgb_to_argb_speed, INT16*, UINT32, dst = dst,
|
|
||||||
TRUE, generic->RGBToRGB_16s8u_P3AC4R(
|
|
||||||
(const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, sse2_RGBToRGB_16s8u_P3AC4R(
|
|
||||||
(const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
|
|
||||||
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
FALSE, dst = dst);
|
|
||||||
|
|
||||||
int test_RGBToRGB_16s8u_P3AC4R_speed(void)
|
|
||||||
{
|
{
|
||||||
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
const prim_size_t roi64x64 = { 64, 64 };
|
||||||
UINT32 ALIGN(dst[4096]);
|
INT16 ALIGN(r[4096+1]), ALIGN(g[4096+1]), ALIGN(b[4096+1]);
|
||||||
|
UINT32 ALIGN(dst[4096+1]);
|
||||||
int i;
|
int i;
|
||||||
INT16* ptrs[3];
|
INT16* ptrs[3];
|
||||||
int size_array[] = { 64 };
|
|
||||||
winpr_RAND((BYTE*)r, sizeof(r));
|
winpr_RAND((BYTE*)r, sizeof(r));
|
||||||
winpr_RAND((BYTE*)g, sizeof(g));
|
winpr_RAND((BYTE*)g, sizeof(g));
|
||||||
winpr_RAND((BYTE*)b, sizeof(b));
|
winpr_RAND((BYTE*)b, sizeof(b));
|
||||||
@ -114,29 +93,38 @@ int test_RGBToRGB_16s8u_P3AC4R_speed(void)
|
|||||||
b[i] &= 0x00FFU;
|
b[i] &= 0x00FFU;
|
||||||
}
|
}
|
||||||
|
|
||||||
ptrs[0] = r;
|
ptrs[0] = r+1;
|
||||||
ptrs[1] = g;
|
ptrs[1] = g+1;
|
||||||
ptrs[2] = b;
|
ptrs[2] = b+1;
|
||||||
rgb_to_argb_speed("RGBToARGB", "aligned",
|
|
||||||
(const INT16**) ptrs, NULL, 0, dst,
|
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations,
|
||||||
size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
|
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
|
||||||
return SUCCESS;
|
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
|
||||||
|
(const INT16**) ptrs, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
|
||||||
|
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
|
||||||
|
(const INT16**) ptrs, 64 * 2, ((BYTE*) dst)+1, 64 * 4, &roi64x64))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ========================================================================= */
|
/* ========================================================================= */
|
||||||
int test_yCbCrToRGB_16s16s_P3P3_func(void)
|
static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
|
||||||
{
|
{
|
||||||
|
pstatus_t status;
|
||||||
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
|
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
|
||||||
INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
|
INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
|
||||||
INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
|
INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
|
||||||
int i;
|
int i;
|
||||||
int failed = 0;
|
|
||||||
char testStr[256];
|
|
||||||
const INT16* in[3];
|
const INT16* in[3];
|
||||||
INT16* out1[3];
|
INT16* out1[3];
|
||||||
INT16* out2[3];
|
INT16* out2[3];
|
||||||
prim_size_t roi = { 64, 64 };
|
prim_size_t roi = { 64, 64 };
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND((BYTE*)y, sizeof(y));
|
winpr_RAND((BYTE*)y, sizeof(y));
|
||||||
winpr_RAND((BYTE*)cb, sizeof(cb));
|
winpr_RAND((BYTE*)cb, sizeof(cb));
|
||||||
winpr_RAND((BYTE*)cr, sizeof(cr));
|
winpr_RAND((BYTE*)cr, sizeof(cr));
|
||||||
@ -164,57 +152,40 @@ int test_yCbCrToRGB_16s16s_P3P3_func(void)
|
|||||||
out2[0] = r2;
|
out2[0] = r2;
|
||||||
out2[1] = g2;
|
out2[1] = g2;
|
||||||
out2[2] = b2;
|
out2[2] = b2;
|
||||||
generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
|
|
||||||
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
|
status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
for (i = 0; i < 4096; ++i)
|
||||||
{
|
{
|
||||||
strcat(testStr, " SSE2");
|
if ((ABS(r1[i] - r2[i]) > 1)
|
||||||
sse2_yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
|
|| (ABS(g1[i] - g2[i]) > 1)
|
||||||
|
|| (ABS(b1[i] - b2[i]) > 1))
|
||||||
for (i = 0; i < 4096; ++i)
|
|
||||||
{
|
{
|
||||||
if ((ABS(r1[i] - r2[i]) > 1)
|
printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
|
||||||
|| (ABS(g1[i] - g2[i]) > 1)
|
r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
|
||||||
|| (ABS(b1[i] - b2[i]) > 1))
|
return FALSE;
|
||||||
{
|
|
||||||
printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
|
|
||||||
r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
|
|
||||||
failed = 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* i386 */
|
return TRUE;
|
||||||
|
|
||||||
if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr);
|
|
||||||
|
|
||||||
return (failed > 0) ? FAILURE : SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
STD_SPEED_TEST(
|
|
||||||
ycbcr_to_rgb_speed, INT16*, INT16*, dst = dst,
|
|
||||||
TRUE, generic->yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
|
|
||||||
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
|
|
||||||
#elif defined(WITH_NEON)
|
|
||||||
TRUE, neon_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
|
|
||||||
PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, FALSE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
FALSE, dst = dst);
|
|
||||||
|
|
||||||
static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
|
static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
|
||||||
{
|
{
|
||||||
|
prim_size_t roi = { 64, 64 };
|
||||||
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
|
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
|
||||||
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
|
||||||
int i;
|
int i;
|
||||||
const INT16* input[3];
|
const INT16* input[3];
|
||||||
INT16* output[3];
|
INT16* output[3];
|
||||||
int size_array[] = { 64 };
|
|
||||||
winpr_RAND((BYTE*)y, sizeof(y));
|
winpr_RAND((BYTE*)y, sizeof(y));
|
||||||
winpr_RAND((BYTE*)cb, sizeof(cb));
|
winpr_RAND((BYTE*)cb, sizeof(cb));
|
||||||
winpr_RAND((BYTE*)cr, sizeof(cr));
|
winpr_RAND((BYTE*)cr, sizeof(cr));
|
||||||
@ -233,37 +204,35 @@ static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
|
|||||||
output[0] = r;
|
output[0] = r;
|
||||||
output[1] = g;
|
output[1] = g;
|
||||||
output[2] = b;
|
output[2] = b;
|
||||||
ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output,
|
|
||||||
size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME);
|
if (!speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
|
||||||
return SUCCESS;
|
(speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
|
||||||
|
(speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3,
|
||||||
|
input, 64 * 2, output, 64 * 2, &roi))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
int TestPrimitivesColors(int argc, char* argv[])
|
int TestPrimitivesColors(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
int status;
|
prim_test_setup(FALSE);
|
||||||
status = test_RGBToRGB_16s8u_P3AC4R_func();
|
|
||||||
|
|
||||||
if (status != SUCCESS)
|
if (!test_RGBToRGB_16s8u_P3AC4R_func())
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_RGBToRGB_16s8u_P3AC4R_speed();
|
if (!test_RGBToRGB_16s8u_P3AC4R_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
status = test_yCbCrToRGB_16s16s_P3P3_func();
|
if (!test_yCbCrToRGB_16s16s_P3P3_func())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_yCbCrToRGB_16s16s_P3P3_speed();
|
if (!test_yCbCrToRGB_16s16s_P3P3_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,22 +19,17 @@
|
|||||||
#include <winpr/sysinfo.h>
|
#include <winpr/sysinfo.h>
|
||||||
#include "prim_test.h"
|
#include "prim_test.h"
|
||||||
|
|
||||||
static const int MEMCPY_PRETEST_ITERATIONS = 1000000;
|
|
||||||
static const int TEST_TIME = 1.0; // seconds
|
|
||||||
#define COPY_TESTSIZE (256*2+16*2+15+15)
|
#define COPY_TESTSIZE (256*2+16*2+15+15)
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static int test_copy8u_func(void)
|
static BOOL test_copy8u_func(void)
|
||||||
{
|
{
|
||||||
primitives_t* prims = primitives_get();
|
primitives_t* prims = primitives_get();
|
||||||
BYTE ALIGN(data[COPY_TESTSIZE + 15]);
|
BYTE ALIGN(data[COPY_TESTSIZE + 15]);
|
||||||
int i, soff;
|
int i, soff;
|
||||||
int failed = 0;
|
|
||||||
char testStr[256];
|
|
||||||
BYTE ALIGN(dest[COPY_TESTSIZE + 15]);
|
BYTE ALIGN(dest[COPY_TESTSIZE + 15]);
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND(data, sizeof(data));
|
winpr_RAND(data, sizeof(data));
|
||||||
strcat(testStr, " ptr");
|
|
||||||
|
|
||||||
for (soff = 0; soff < 16; ++soff)
|
for (soff = 0; soff < 16; ++soff)
|
||||||
{
|
{
|
||||||
@ -47,7 +42,8 @@ static int test_copy8u_func(void)
|
|||||||
for (length = 1; length <= COPY_TESTSIZE - doff; ++length)
|
for (length = 1; length <= COPY_TESTSIZE - doff; ++length)
|
||||||
{
|
{
|
||||||
memset(dest, 0, sizeof(dest));
|
memset(dest, 0, sizeof(dest));
|
||||||
prims->copy_8u(data + soff, dest + doff, length);
|
if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
for (i = 0; i < length; ++i)
|
for (i = 0; i < length; ++i)
|
||||||
{
|
{
|
||||||
@ -57,48 +53,47 @@ static int test_copy8u_func(void)
|
|||||||
"data[%d]=0x%02x\n",
|
"data[%d]=0x%02x\n",
|
||||||
doff, length, i + doff, dest[i + doff],
|
doff, length, i + doff, dest[i + doff],
|
||||||
i + soff, data[i + soff]);
|
i + soff, data[i + soff]);
|
||||||
failed = 1;
|
return FALSE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!failed) printf("All copy8 tests passed (%s).\n", testStr);
|
return TRUE;
|
||||||
|
|
||||||
return (failed > 0) ? FAILURE : SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst = dst,
|
static BOOL test_copy8u_speed(void)
|
||||||
TRUE, memcpy(dst, src1, size),
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
TRUE, ippsCopy_8u(src1, dst, size));
|
|
||||||
|
|
||||||
int test_copy8u_speed(void)
|
|
||||||
{
|
{
|
||||||
BYTE ALIGN(src[MAX_TEST_SIZE + 4]);
|
BYTE ALIGN(src[MAX_TEST_SIZE + 4]);
|
||||||
BYTE ALIGN(dst[MAX_TEST_SIZE + 4]);
|
BYTE ALIGN(dst[MAX_TEST_SIZE + 4]);
|
||||||
copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst,
|
|
||||||
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
|
if (!speed_test("copy_8u", "aligned", g_Iterations,
|
||||||
copy8u_speed_test("copy8u", "unaligned", src + 1, NULL, 0, dst,
|
(speed_test_fkt)generic->copy_8u,
|
||||||
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
|
(speed_test_fkt)optimized->copy_8u,
|
||||||
return SUCCESS;
|
src, dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
if (!speed_test("copy_8u", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->copy_8u,
|
||||||
|
(speed_test_fkt)optimized->copy_8u,
|
||||||
|
src+1, dst+1, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
int TestPrimitivesCopy(int argc, char* argv[])
|
int TestPrimitivesCopy(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
int status;
|
prim_test_setup(FALSE);
|
||||||
status = test_copy8u_func();
|
|
||||||
|
|
||||||
if (status != SUCCESS)
|
if (!test_copy8u_func())
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_copy8u_speed();
|
if (!test_copy8u_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -53,14 +53,14 @@ static BOOL test_set8u_func(void)
|
|||||||
{
|
{
|
||||||
UINT32 len;
|
UINT32 len;
|
||||||
|
|
||||||
memset(dest, 0, sizeof(dest));
|
memset(dest, 3, sizeof(dest));
|
||||||
for (len = 1; len < 48 - off; ++len)
|
for (len = 1; len < 48 - off; ++len)
|
||||||
{
|
{
|
||||||
status = generic->set_8u(0xa5, dest + off, len);
|
status = generic->set_8u(0xa5, dest + off, len);
|
||||||
if (status != PRIMITIVES_SUCCESS)
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
if (!check8(dest, len, off, 0xa8))
|
if (!check8(dest, len, off, 0xa5))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -69,14 +69,14 @@ static BOOL test_set8u_func(void)
|
|||||||
{
|
{
|
||||||
UINT32 len;
|
UINT32 len;
|
||||||
|
|
||||||
memset(dest, 0, sizeof(dest));
|
memset(dest, 3, sizeof(dest));
|
||||||
for (len = 1; len < 48 - off; ++len)
|
for (len = 1; len < 48 - off; ++len)
|
||||||
{
|
{
|
||||||
status = optimized->set_8u(0xa5, dest + off, len);
|
status = optimized->set_8u(0xa5, dest + off, len);
|
||||||
if (status != PRIMITIVES_SUCCESS)
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
|
|
||||||
if (!check8(dest, len, off, 0xa8))
|
if (!check8(dest, len, off, 0xa5))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -95,8 +95,9 @@ static BOOL test_set8u_speed(void)
|
|||||||
{
|
{
|
||||||
winpr_RAND(&value, sizeof(value));
|
winpr_RAND(&value, sizeof(value));
|
||||||
if (!speed_test("set_8u", "", g_Iterations,
|
if (!speed_test("set_8u", "", g_Iterations,
|
||||||
generic->set_8u, optimized->set_8u,
|
(speed_test_fkt)generic->set_8u,
|
||||||
value, dest + x, len))
|
(speed_test_fkt)optimized->set_8u,
|
||||||
|
value, dest + x, x))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -232,8 +233,9 @@ static BOOL test_set32u_speed(void)
|
|||||||
{
|
{
|
||||||
winpr_RAND(&value, sizeof(value));
|
winpr_RAND(&value, sizeof(value));
|
||||||
if (!speed_test("set_32u", "", g_Iterations,
|
if (!speed_test("set_32u", "", g_Iterations,
|
||||||
generic->set_32u, optimized->set_32u,
|
(speed_test_fkt)generic->set_32u,
|
||||||
value, dest + x, len))
|
(speed_test_fkt)optimized->set_32u,
|
||||||
|
value, dest + x, x))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -251,8 +253,9 @@ static BOOL test_set32s_speed(void)
|
|||||||
{
|
{
|
||||||
winpr_RAND(&value, sizeof(value));
|
winpr_RAND(&value, sizeof(value));
|
||||||
if (!speed_test("set_32s", "", g_Iterations,
|
if (!speed_test("set_32s", "", g_Iterations,
|
||||||
generic->set_32s, optimized->set_32s,
|
(speed_test_fkt)generic->set_32s,
|
||||||
value, dest + x, len))
|
(speed_test_fkt)optimized->set_32s,
|
||||||
|
value, dest + x, x))
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -265,21 +268,20 @@ int TestPrimitivesSet(int argc, char* argv[])
|
|||||||
|
|
||||||
if (!test_set8u_func())
|
if (!test_set8u_func())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (!test_set8u_speed())
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (!test_set32s_func())
|
if (!test_set32s_func())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (!test_set32s_speed())
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (!test_set32u_func())
|
if (!test_set32u_func())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (!test_set32u_speed())
|
if (g_TestPrimitivesPerformance)
|
||||||
return -1;
|
{
|
||||||
|
if (!test_set8u_speed())
|
||||||
|
return -1;
|
||||||
|
if (!test_set32s_speed())
|
||||||
|
return -1;
|
||||||
|
if (!test_set32u_speed())
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -20,207 +20,361 @@
|
|||||||
#include "prim_test.h"
|
#include "prim_test.h"
|
||||||
|
|
||||||
#define FUNC_TEST_SIZE 65536
|
#define FUNC_TEST_SIZE 65536
|
||||||
static const int SHIFT_PRETEST_ITERATIONS = 50000;
|
|
||||||
static const float TEST_TIME = 1.0;
|
|
||||||
|
|
||||||
#ifdef WITH_SSE2
|
static BOOL test_lShift_16s_func(void)
|
||||||
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
|
|
||||||
int _name_(void) \
|
|
||||||
{ \
|
|
||||||
_type_ ALIGN(src[FUNC_TEST_SIZE+3]), \
|
|
||||||
ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \
|
|
||||||
int failed = 0; \
|
|
||||||
int i; \
|
|
||||||
char testStr[256]; \
|
|
||||||
testStr[0] = '\0'; \
|
|
||||||
get_random_data(src, sizeof(src)); \
|
|
||||||
_f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \
|
|
||||||
if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) \
|
|
||||||
{ \
|
|
||||||
strcat(testStr, " SSE3"); \
|
|
||||||
/* Aligned */ \
|
|
||||||
_f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \
|
|
||||||
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
|
|
||||||
{ \
|
|
||||||
if (d1[i] != d2[i]) \
|
|
||||||
{ \
|
|
||||||
printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
|
|
||||||
_str_, i, src[i], d1[i], d2[i]); \
|
|
||||||
++failed; \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
/* Unaligned */ \
|
|
||||||
_f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \
|
|
||||||
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
|
|
||||||
{ \
|
|
||||||
if (d1[i] != d2[i+1]) \
|
|
||||||
{ \
|
|
||||||
printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
|
|
||||||
_str_, i, src[i], d1[i], d2[i+1]); \
|
|
||||||
++failed; \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \
|
|
||||||
return (failed > 0) ? FAILURE : SUCCESS; \
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
|
|
||||||
int _name_(void) \
|
|
||||||
{ \
|
|
||||||
return SUCCESS; \
|
|
||||||
}
|
|
||||||
#endif /* i386 */
|
|
||||||
|
|
||||||
SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s,
|
|
||||||
sse2_lShiftC_16s)
|
|
||||||
SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u,
|
|
||||||
sse2_lShiftC_16u)
|
|
||||||
SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s,
|
|
||||||
sse2_rShiftC_16s)
|
|
||||||
SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u,
|
|
||||||
sse2_rShiftC_16u)
|
|
||||||
|
|
||||||
/* ========================================================================= */
|
|
||||||
STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst = dst,
|
|
||||||
TRUE, general_lShiftC_16s(src1, constant, dst, size),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, sse2_lShiftC_16s(src1, constant, dst, size),
|
|
||||||
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
TRUE, ippsLShiftC_16s(src1, constant, dst, size));
|
|
||||||
STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst = dst,
|
|
||||||
TRUE, general_lShiftC_16u(src1, constant, dst, size),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, sse2_lShiftC_16u(src1, constant, dst, size),
|
|
||||||
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
TRUE, ippsLShiftC_16u(src1, constant, dst, size));
|
|
||||||
STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst = dst,
|
|
||||||
TRUE, general_rShiftC_16s(src1, constant, dst, size),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, sse2_rShiftC_16s(src1, constant, dst, size),
|
|
||||||
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
TRUE, ippsRShiftC_16s(src1, constant, dst, size));
|
|
||||||
STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst = dst,
|
|
||||||
TRUE, general_rShiftC_16u(src1, constant, dst, size),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, sse2_rShiftC_16u(src1, constant, dst, size),
|
|
||||||
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
TRUE, ippsRShiftC_16u(src1, constant, dst, size));
|
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
|
||||||
int test_lShift_16s_speed(void)
|
|
||||||
{
|
{
|
||||||
INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
|
pstatus_t status;
|
||||||
winpr_RAND(src, sizeof(src));
|
INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
|
||||||
speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst,
|
INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
UINT32 val;
|
||||||
speed_lShift_16s("lShift_16s", "unaligned", src + 1, NULL, 3, dst,
|
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
return SUCCESS;
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
|
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
|
||||||
|
|
||||||
|
/* Aligned */
|
||||||
|
status = generic->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
/* Unaligned */
|
||||||
|
status = generic->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static BOOL test_lShift_16u_func(void)
|
||||||
|
{
|
||||||
|
pstatus_t status;
|
||||||
|
UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT32 val;
|
||||||
|
|
||||||
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
|
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
|
||||||
|
|
||||||
|
/* Aligned */
|
||||||
|
status = generic->lShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->lShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
/* Unaligned */
|
||||||
|
status = generic->lShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->lShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static BOOL test_rShift_16s_func(void)
|
||||||
|
{
|
||||||
|
pstatus_t status;
|
||||||
|
INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
|
||||||
|
INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT32 val;
|
||||||
|
|
||||||
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
|
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
|
||||||
|
|
||||||
|
/* Aligned */
|
||||||
|
status = generic->rShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->rShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
/* Unaligned */
|
||||||
|
status = generic->rShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->rShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static BOOL test_rShift_16u_func(void)
|
||||||
|
{
|
||||||
|
pstatus_t status;
|
||||||
|
UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT32 val;
|
||||||
|
|
||||||
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
|
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
|
||||||
|
|
||||||
|
/* Aligned */
|
||||||
|
status = generic->rShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->rShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
/* Unaligned */
|
||||||
|
status = generic->rShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->rShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static BOOL test_ShiftWrapper_16s_func(void)
|
||||||
|
{
|
||||||
|
pstatus_t status;
|
||||||
|
INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
|
||||||
|
INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT32 tmp;
|
||||||
|
INT32 val;
|
||||||
|
|
||||||
|
winpr_RAND((BYTE*)&tmp, sizeof(tmp));
|
||||||
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
|
val = (tmp % (FUNC_TEST_SIZE - 1)) + 1;
|
||||||
|
|
||||||
|
/* Aligned */
|
||||||
|
status = generic->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
status = generic->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
/* Unaligned */
|
||||||
|
status = generic->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = generic->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static BOOL test_ShiftWrapper_16u_func(void)
|
||||||
|
{
|
||||||
|
pstatus_t status;
|
||||||
|
UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
|
||||||
|
UINT32 tmp;
|
||||||
|
INT32 val;
|
||||||
|
|
||||||
|
winpr_RAND((BYTE*)&tmp, sizeof(tmp));
|
||||||
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
|
||||||
|
val = (tmp % (FUNC_TEST_SIZE - 1)) + 1;
|
||||||
|
|
||||||
|
/* Aligned */
|
||||||
|
status = generic->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
status = generic->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
/* Unaligned */
|
||||||
|
status = generic->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = generic->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
int test_lShift_16u_speed(void)
|
static BOOL test_lShift_16s_speed(void)
|
||||||
{
|
{
|
||||||
|
UINT32 val;
|
||||||
|
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||||
|
|
||||||
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
|
|
||||||
|
if (!speed_test("lShift_16s", "aligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->lShiftC_16s,
|
||||||
|
(speed_test_fkt)optimized->lShiftC_16s, src, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
if (!speed_test("lShift_16s", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->lShiftC_16s,
|
||||||
|
(speed_test_fkt)optimized->lShiftC_16s, src + 1, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ------------------------------------------------------------------------- */
|
||||||
|
static BOOL test_lShift_16u_speed(void)
|
||||||
|
{
|
||||||
|
UINT32 val;
|
||||||
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
|
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
|
||||||
winpr_RAND(src, sizeof(src));
|
|
||||||
speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst,
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
speed_lShift_16u("lShift_16u", "unaligned", src + 1, NULL, 3, dst,
|
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
if (!speed_test("lShift_16u", "aligned", g_Iterations,
|
||||||
return SUCCESS;
|
(speed_test_fkt)generic->lShiftC_16u,
|
||||||
|
(speed_test_fkt)optimized->lShiftC_16u, src, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
if (!speed_test("lShift_16u", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->lShiftC_16u,
|
||||||
|
(speed_test_fkt)optimized->lShiftC_16u, src + 1, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
int test_rShift_16s_speed(void)
|
static BOOL test_rShift_16s_speed(void)
|
||||||
{
|
{
|
||||||
INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
|
UINT32 val;
|
||||||
winpr_RAND(src, sizeof(src));
|
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
|
||||||
speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst,
|
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
speed_rShift_16s("rShift_16s", "unaligned", src + 1, NULL, 3, dst,
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
if (!speed_test("rShift_16s", "aligned", g_Iterations,
|
||||||
return SUCCESS;
|
(speed_test_fkt)generic->rShiftC_16s,
|
||||||
|
(speed_test_fkt)optimized->rShiftC_16s, src, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
if (!speed_test("rShift_16s", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->rShiftC_16s,
|
||||||
|
(speed_test_fkt)optimized->rShiftC_16s, src + 1, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
int test_rShift_16u_speed(void)
|
static BOOL test_rShift_16u_speed(void)
|
||||||
{
|
{
|
||||||
|
UINT32 val;
|
||||||
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
|
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
|
||||||
winpr_RAND(src, sizeof(src));
|
|
||||||
speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst,
|
winpr_RAND((BYTE*)&val, sizeof(val));
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
speed_rShift_16u("rShift_16u", "unaligned", src + 1, NULL, 3, dst,
|
|
||||||
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
|
if (!speed_test("rShift_16u", "aligned", g_Iterations,
|
||||||
return SUCCESS;
|
(speed_test_fkt)generic->rShiftC_16u,
|
||||||
|
(speed_test_fkt)optimized->rShiftC_16u, src, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
if (!speed_test("rShift_16u", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->rShiftC_16u,
|
||||||
|
(speed_test_fkt)optimized->rShiftC_16u, src + 1, val,
|
||||||
|
dst, MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
int TestPrimitivesShift(int argc, char* argv[])
|
int TestPrimitivesShift(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
int status;
|
prim_test_setup(FALSE);
|
||||||
status = test_lShift_16s_func();
|
|
||||||
|
|
||||||
if (status != SUCCESS)
|
if (!test_lShift_16s_func())
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_lShift_16s_speed();
|
if (!test_lShift_16s_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
status = test_lShift_16u_func();
|
if (!test_lShift_16u_func())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_lShift_16u_speed();
|
if (!test_lShift_16u_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
status = test_rShift_16s_func();
|
if (!test_rShift_16s_func())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_rShift_16s_speed();
|
if (!test_rShift_16s_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
status = test_rShift_16u_func();
|
if (!test_rShift_16u_func())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_rShift_16u_speed();
|
if (!test_rShift_16u_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!test_ShiftWrapper_16s_func())
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (!test_ShiftWrapper_16u_func())
|
||||||
|
return 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -19,103 +19,71 @@
|
|||||||
#include <winpr/sysinfo.h>
|
#include <winpr/sysinfo.h>
|
||||||
#include "prim_test.h"
|
#include "prim_test.h"
|
||||||
|
|
||||||
static const int SIGN_PRETEST_ITERATIONS = 100000;
|
#define TEST_BUFFER_SIZE 65535
|
||||||
static const float TEST_TIME = 1.0;
|
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
static int test_sign16s_func(void)
|
static BOOL test_sign16s_func(void)
|
||||||
{
|
{
|
||||||
INT16 ALIGN(src[65535]), ALIGN(d1[65535]);
|
pstatus_t status;
|
||||||
#ifdef WITH_SSE2
|
INT16 ALIGN(src[TEST_BUFFER_SIZE]);
|
||||||
INT16 ALIGN(d2[65535]);
|
INT16 ALIGN(d1[TEST_BUFFER_SIZE]);
|
||||||
int i;
|
INT16 ALIGN(d2[TEST_BUFFER_SIZE]);
|
||||||
#endif
|
|
||||||
int failed = 0;
|
|
||||||
char testStr[256];
|
|
||||||
/* Test when we can reach 16-byte alignment */
|
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND(src, sizeof(src));
|
|
||||||
general_sign_16s(src + 1, d1 + 1, 65535);
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
|
|
||||||
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
{
|
|
||||||
strcat(testStr, " SSSE3");
|
|
||||||
ssse3_sign_16s(src + 1, d2 + 1, 65535);
|
|
||||||
|
|
||||||
for (i = 1; i < 65535; ++i)
|
status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE);
|
||||||
{
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
if (d1[i] != d2[i])
|
return FALSE;
|
||||||
{
|
status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE);
|
||||||
printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n",
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
i, src[i], d1[i], d2[i]);
|
return FALSE;
|
||||||
++failed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* i386 */
|
if (memcmp(d1, d2, sizeof(d1)) != 0)
|
||||||
/* Test when we cannot reach 16-byte alignment */
|
return FALSE;
|
||||||
winpr_RAND(src, sizeof(src));
|
|
||||||
general_sign_16s(src + 1, d1 + 2, 65535);
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
|
|
||||||
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
|
status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE);
|
||||||
{
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
ssse3_sign_16s(src + 1, d2 + 2, 65535);
|
return FALSE;
|
||||||
|
status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
for (i = 2; i < 65535; ++i)
|
if (memcmp(d1, d2, sizeof(d1)) != 0)
|
||||||
{
|
return FALSE;
|
||||||
if (d1[i] != d2[i])
|
|
||||||
{
|
|
||||||
printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n",
|
|
||||||
i, src[i - 1], d1[i], d2[i]);
|
|
||||||
++failed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* i386 */
|
return TRUE;
|
||||||
|
|
||||||
if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
|
|
||||||
|
|
||||||
return (failed > 0) ? FAILURE : SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
|
||||||
STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst = dst,
|
|
||||||
TRUE, general_sign_16s(src1, dst, size),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, ssse3_sign_16s(src1, dst, size), PF_EX_SSSE3, TRUE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
FALSE, dst = dst);
|
|
||||||
|
|
||||||
static int test_sign16s_speed(void)
|
static int test_sign16s_speed(void)
|
||||||
{
|
{
|
||||||
INT16 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
|
INT16 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
|
||||||
winpr_RAND(src, sizeof(src));
|
winpr_RAND((BYTE*)src, sizeof(src));
|
||||||
sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
|
|
||||||
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
|
if (!speed_test("sign16s", "aligned", g_Iterations,
|
||||||
sign16s_speed_test("sign16s", "unaligned", src + 1, NULL, 0, dst,
|
(speed_test_fkt)generic->sign_16s,
|
||||||
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
|
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 1,
|
||||||
return SUCCESS;
|
MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
if (!speed_test("sign16s", "unaligned", g_Iterations,
|
||||||
|
(speed_test_fkt)generic->sign_16s,
|
||||||
|
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 2,
|
||||||
|
MAX_TEST_SIZE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
int TestPrimitivesSign(int argc, char* argv[])
|
int TestPrimitivesSign(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
int status;
|
prim_test_setup(FALSE);
|
||||||
status = test_sign16s_func();
|
|
||||||
|
|
||||||
if (status != SUCCESS)
|
if (!test_sign16s_func())
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_sign16s_speed();
|
if (!test_sign16s_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,105 +23,103 @@
|
|||||||
#include <winpr/sysinfo.h>
|
#include <winpr/sysinfo.h>
|
||||||
#include "prim_test.h"
|
#include "prim_test.h"
|
||||||
|
|
||||||
static const int YCOCG_TRIAL_ITERATIONS = 20000;
|
|
||||||
static const float TEST_TIME = 4.0;
|
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------- */
|
||||||
int test_YCoCgRToRGB_8u_AC4R_func(void)
|
static BOOL test_YCoCgRToRGB_8u_AC4R_func(void)
|
||||||
{
|
{
|
||||||
#ifdef WITH_SSE2
|
BOOL result = TRUE;
|
||||||
int i;
|
pstatus_t status;
|
||||||
INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
|
INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
|
||||||
#endif
|
|
||||||
INT32 ALIGN(in[4098]);
|
INT32 ALIGN(in[4098]);
|
||||||
INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
|
INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
|
||||||
char testStr[256];
|
|
||||||
BOOL failed = FALSE;
|
|
||||||
testStr[0] = '\0';
|
|
||||||
winpr_RAND(in, sizeof(in));
|
|
||||||
general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
|
|
||||||
(BYTE*) out_c, 63 * 4, 63, 61, 2, TRUE, FALSE);
|
|
||||||
general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
|
|
||||||
(BYTE*) out_c_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
|
|
||||||
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
|
UINT32 i, x;
|
||||||
|
const UINT32 formats[] = {
|
||||||
|
PIXEL_FORMAT_ARGB32,
|
||||||
|
PIXEL_FORMAT_ABGR32,
|
||||||
|
PIXEL_FORMAT_RGBA32,
|
||||||
|
PIXEL_FORMAT_RGBX32,
|
||||||
|
PIXEL_FORMAT_BGRA32,
|
||||||
|
PIXEL_FORMAT_BGRX32
|
||||||
|
};
|
||||||
|
|
||||||
|
winpr_RAND((BYTE*)in, sizeof(in));
|
||||||
|
|
||||||
|
for (x=0; x<sizeof(formats)/sizeof(formats[0]); x++)
|
||||||
{
|
{
|
||||||
strcat(testStr, " SSSE3");
|
UINT32 format = formats[x];
|
||||||
ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
|
|
||||||
(BYTE*) out_sse, 63 * 4, 63, 61, 2, TRUE, FALSE);
|
status = generic->YCoCgToRGB_8u_AC4R(
|
||||||
|
(const BYTE*)(in + 1), 63 * 4,
|
||||||
|
(BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = generic->YCoCgToRGB_8u_AC4R(
|
||||||
|
(const BYTE*)(in + 1), 63 * 4,
|
||||||
|
(BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
status = optimized->YCoCgToRGB_8u_AC4R(
|
||||||
|
(const BYTE*)(in + 1), 63 * 4,
|
||||||
|
(BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
status = optimized->YCoCgToRGB_8u_AC4R(
|
||||||
|
(const BYTE*)(in + 1), 63 * 4,
|
||||||
|
(BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE);
|
||||||
|
if (status != PRIMITIVES_SUCCESS)
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
for (i = 0; i < 63 * 61; ++i)
|
for (i = 0; i < 63 * 61; ++i)
|
||||||
{
|
{
|
||||||
if (out_c[i] != out_sse[i])
|
if (out_c[i] != out_sse[i])
|
||||||
{
|
{
|
||||||
printf("YCoCgRToRGB-SSE FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i,
|
printf("optimized->YCoCgRToRGB FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n", i,
|
||||||
in[i + 1], out_c[i], out_sse[i]);
|
in[i + 1], out_c[i], out_sse[i]);
|
||||||
failed = TRUE;
|
result = FALSE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
|
|
||||||
(BYTE*) out_sse_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
|
|
||||||
|
|
||||||
for (i = 0; i < 63 * 61; ++i)
|
for (i = 0; i < 63 * 61; ++i)
|
||||||
{
|
{
|
||||||
if (out_c_inv[i] != out_sse_inv[i])
|
if (out_c_inv[i] != out_sse_inv[i])
|
||||||
{
|
{
|
||||||
printf("YCoCgRToRGB-SSE inverted FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n",
|
printf("optimized->YCoCgRToRGB inverted FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n",
|
||||||
i,
|
i,
|
||||||
in[i + 1], out_c_inv[i], out_sse_inv[i]);
|
in[i + 1], out_c_inv[i], out_sse_inv[i]);
|
||||||
failed = TRUE;
|
result = FALSE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return result;
|
||||||
#endif /* i386 */
|
|
||||||
|
|
||||||
if (!failed) printf("All YCoCgRToRGB_8u_AC4R tests passed (%s).\n", testStr);
|
|
||||||
|
|
||||||
return (failed > 0) ? FAILURE : SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------------- */
|
|
||||||
STD_SPEED_TEST(
|
|
||||||
ycocg_to_rgb_speed, BYTE, BYTE, PRIM_NOP,
|
|
||||||
TRUE, general_YCoCgToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
|
|
||||||
FALSE),
|
|
||||||
#ifdef WITH_SSE2
|
|
||||||
TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
|
|
||||||
FALSE),
|
|
||||||
PF_EX_SSSE3, TRUE,
|
|
||||||
#else
|
|
||||||
FALSE, PRIM_NOP, 0, FALSE,
|
|
||||||
#endif
|
|
||||||
FALSE, PRIM_NOP);
|
|
||||||
|
|
||||||
static int test_YCoCgRToRGB_8u_AC4R_speed(void)
|
static int test_YCoCgRToRGB_8u_AC4R_speed(void)
|
||||||
{
|
{
|
||||||
INT32 ALIGN(in[4096]);
|
INT32 ALIGN(in[4096]);
|
||||||
INT32 ALIGN(out[4096]);
|
INT32 ALIGN(out[4096]);
|
||||||
int size_array[] = { 64 };
|
|
||||||
winpr_RAND(in, sizeof(in));
|
winpr_RAND((BYTE*)in, sizeof(in));
|
||||||
ycocg_to_rgb_speed("YCoCgToRGB", "aligned", (const BYTE*) in,
|
|
||||||
0, 0, (BYTE*) out,
|
if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations,
|
||||||
size_array, 1, YCOCG_TRIAL_ITERATIONS, TEST_TIME);
|
(speed_test_fkt)generic->YCoCgToRGB_8u_AC4R,
|
||||||
return SUCCESS;
|
(speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R,
|
||||||
|
in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE))
|
||||||
|
return FALSE;
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
int TestPrimitivesYCoCg(int argc, char* argv[])
|
int TestPrimitivesYCoCg(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
int status;
|
prim_test_setup(FALSE);
|
||||||
status = test_YCoCgRToRGB_8u_AC4R_func();
|
|
||||||
|
|
||||||
if (status != SUCCESS)
|
if (!test_YCoCgRToRGB_8u_AC4R_func())
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
if (g_TestPrimitivesPerformance)
|
if (g_TestPrimitivesPerformance)
|
||||||
{
|
{
|
||||||
status = test_YCoCgRToRGB_8u_AC4R_speed();
|
if (!test_YCoCgRToRGB_8u_AC4R_speed())
|
||||||
|
|
||||||
if (status != SUCCESS)
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,8 +38,8 @@ static void get_size(UINT32* width, UINT32* height)
|
|||||||
winpr_RAND((BYTE*)width, sizeof(*width));
|
winpr_RAND((BYTE*)width, sizeof(*width));
|
||||||
winpr_RAND((BYTE*)height, sizeof(*height));
|
winpr_RAND((BYTE*)height, sizeof(*height));
|
||||||
// TODO: Algorithm only works on even resolutions...
|
// TODO: Algorithm only works on even resolutions...
|
||||||
*width = (*width % 4000) << 1;
|
*width = (*width % 64) << 1;
|
||||||
*height = (*height % 4000 << 1);
|
*height = (*height % 64 << 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding,
|
static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding,
|
||||||
@ -370,11 +370,13 @@ static BOOL TestPrimitiveYUV(BOOL use444)
|
|||||||
|
|
||||||
if (use444)
|
if (use444)
|
||||||
{
|
{
|
||||||
if (prims->RGBToYUV444_8u_P3AC4R(rgb, stride, yuv, yuv_step,
|
if (prims->RGBToYUV444_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
|
||||||
|
stride, yuv, yuv_step,
|
||||||
&roi) != PRIMITIVES_SUCCESS)
|
&roi) != PRIMITIVES_SUCCESS)
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
else if (prims->RGBToYUV420_8u_P3AC4R(rgb, stride, yuv, yuv_step,
|
else if (prims->RGBToYUV420_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
|
||||||
|
stride, yuv, yuv_step,
|
||||||
&roi) != PRIMITIVES_SUCCESS)
|
&roi) != PRIMITIVES_SUCCESS)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
@ -429,16 +431,16 @@ int TestPrimitivesYUV(int argc, char* argv[])
|
|||||||
UINT32 x;
|
UINT32 x;
|
||||||
int rc = -1;
|
int rc = -1;
|
||||||
|
|
||||||
|
prim_test_setup(FALSE);
|
||||||
|
|
||||||
for (x = 0; x < 10; x++)
|
for (x = 0; x < 10; x++)
|
||||||
{
|
{
|
||||||
/* TODO: This test fails on value comparison,
|
|
||||||
* there seems to be some issue left with encoder / decoder pass.
|
|
||||||
if (!TestPrimitiveYUV(FALSE))
|
|
||||||
goto end;
|
|
||||||
*/
|
|
||||||
if (!TestPrimitiveYUV(TRUE))
|
if (!TestPrimitiveYUV(TRUE))
|
||||||
goto end;
|
goto end;
|
||||||
|
|
||||||
|
if (!TestPrimitiveYUV(FALSE))
|
||||||
|
goto end;
|
||||||
|
|
||||||
if (!TestPrimitiveYUVCombine())
|
if (!TestPrimitiveYUVCombine())
|
||||||
goto end;
|
goto end;
|
||||||
}
|
}
|
||||||
|
@ -43,13 +43,6 @@
|
|||||||
extern int test_sizes[];
|
extern int test_sizes[];
|
||||||
#define NUM_TEST_SIZES 10
|
#define NUM_TEST_SIZES 10
|
||||||
|
|
||||||
#ifndef SUCCESS
|
|
||||||
#define SUCCESS 0
|
|
||||||
#endif
|
|
||||||
#ifndef FAILURE
|
|
||||||
#define FAILURE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
extern BOOL g_TestPrimitivesPerformance;
|
extern BOOL g_TestPrimitivesPerformance;
|
||||||
extern UINT32 g_Iterations;
|
extern UINT32 g_Iterations;
|
||||||
|
|
||||||
@ -58,8 +51,10 @@ extern primitives_t* optimized;
|
|||||||
|
|
||||||
void prim_test_setup(BOOL performance);
|
void prim_test_setup(BOOL performance);
|
||||||
|
|
||||||
|
typedef pstatus_t (*speed_test_fkt)();
|
||||||
|
|
||||||
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations,
|
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations,
|
||||||
pstatus_t (*generic)(), pstatus_t (*optimised)(),
|
speed_test_fkt generic, speed_test_fkt optimized,
|
||||||
...);
|
...);
|
||||||
|
|
||||||
#endif // !__PRIMTEST_H_INCLUDED__
|
#endif // !__PRIMTEST_H_INCLUDED__
|
||||||
|
Loading…
Reference in New Issue
Block a user