Updated primitives API and tests.

This commit is contained in:
Armin Novak 2016-07-13 14:04:48 +02:00
parent e860fde4bc
commit 99c418766c
31 changed files with 1194 additions and 1054 deletions

View File

@ -81,62 +81,62 @@ typedef pstatus_t (*__copy_8u_AC4r_t)(
typedef pstatus_t (*__set_8u_t)(
BYTE val,
BYTE* pDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__set_32s_t)(
INT32 val,
INT32* pDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__set_32u_t)(
UINT32 val,
UINT32* pDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__zero_t)(
void* pDst,
size_t bytes);
typedef pstatus_t (*__alphaComp_argb_t)(
const BYTE* pSrc1, INT32 src1Step,
const BYTE* pSrc2, INT32 src2Step,
BYTE* pDst, INT32 dstStep,
INT32 width, INT32 height);
const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, UINT32 dstStep,
UINT32 width, UINT32 height);
typedef pstatus_t (*__add_16s_t)(
const INT16* pSrc1,
const INT16* pSrc2,
INT16* pDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__lShiftC_16s_t)(
const INT16* pSrc,
INT32 val,
UINT32 val,
INT16* pSrcDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__lShiftC_16u_t)(
const UINT16* pSrc,
INT32 val,
UINT32 val,
UINT16* pSrcDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__rShiftC_16s_t)(
const INT16* pSrc,
INT32 val,
UINT32 val,
INT16* pSrcDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__rShiftC_16u_t)(
const UINT16* pSrc,
INT32 val,
UINT32 val,
UINT16* pSrcDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__shiftC_16s_t)(
const INT16* pSrc,
INT32 val,
INT16* pSrcDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__shiftC_16u_t)(
const UINT16* pSrc,
INT32 val,
UINT16* pSrcDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__sign_16s_t)(
const INT16* pSrc,
INT16* pDst,
INT32 len);
UINT32 len);
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(
const INT16* pSrc[3], INT32 srcStep,
BYTE* pDst, INT32 dstStep, UINT32 DstFormat,
@ -154,8 +154,8 @@ typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)(
INT16* pDst[3], INT32 dstStep,
const prim_size_t* roi);
typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
const INT16* pSrc[3], INT32 srcStep,
BYTE* pDst, INT32 dstStep, UINT32 DstFormat,
const INT16* const pSrc[3], UINT32 srcStep,
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* roi);
typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)(
const BYTE* pSrc, INT32 srcStep,
@ -177,11 +177,11 @@ typedef pstatus_t (*__YUV444ToRGB_8u_P3AC4R_t)(
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* roi);
typedef pstatus_t (*__RGBToYUV420_8u_P3AC4R_t)(
const BYTE* pSrc, UINT32 srcStep,
const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3],
const prim_size_t* roi);
typedef pstatus_t (*__RGBToYUV444_8u_P3AC4R_t)(
const BYTE* pSrc, UINT32 srcStep,
const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3],
const prim_size_t* roi);
typedef pstatus_t (*__YUV420CombineToYUV444_t)(

View File

@ -1567,7 +1567,7 @@ INT32 avc420_compress(H264_CONTEXT* h264, BYTE* pSrcData, DWORD SrcFormat,
roi.width = nSrcWidth;
roi.height = nSrcHeight;
prims->RGBToYUV420_8u_P3AC4R(pSrcData, nSrcStep, pYUVData, iStride, &roi);
prims->RGBToYUV420_8u_P3AC4R(pSrcData, SrcFormat, nSrcStep, pYUVData, iStride, &roi);
status = h264->subsystem->Compress(h264, ppDstData, pDstSize, 0);

View File

@ -33,11 +33,11 @@
/* ------------------------------------------------------------------------- */
static pstatus_t general_YCoCgToRGB_8u_AC4R(
const BYTE* pSrc, INT32 srcStep,
BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height,
UINT8 shift,
BOOL withAlpha)
const BYTE* pSrc, INT32 srcStep,
BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height,
UINT8 shift,
BOOL withAlpha)
{
BYTE A;
UINT32 x, y;
@ -66,8 +66,11 @@ static pstatus_t general_YCoCgToRGB_8u_AC4R(
R = T + Co;
G = Y + Cg;
B = T - Co;
color = GetColor(DstFormat, MINMAX(R, 0, 255), MINMAX(G, 0, 255), MINMAX(B, 0,
255), A);
color = GetColor(DstFormat,
MINMAX(R, 0, 255), MINMAX(G, 0, 255),
MINMAX(B, 0, 255), A);
WriteColor(dptr, DstFormat, color);
dptr += GetBytesPerPixel(DstFormat);
}

View File

@ -40,8 +40,8 @@ static primitives_t* generic = NULL;
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
const BYTE* pSrc, INT32 srcStep,
BYTE* pDst, INT32 dstStep,
const BYTE* pSrc, UINT32 srcStep,
BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
UINT32 width, UINT32 height,
UINT8 shift,
BOOL withAlpha)
@ -70,8 +70,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
{
/* Too small, or we'll never hit a 16-byte boundary. Punt. */
return generic->YCoCgToRGB_8u_AC4R(
pSrc, srcStep, pDst, dstStep,
width, height, shift, withAlpha, TRUE);
pSrc, srcStep, pDst, DstFormat, dstStep,
width, height, shift, withAlpha);
}
for (h = 0; h < height; h++)
@ -82,12 +82,16 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
/* Get to a 16-byte destination boundary. */
if ((ULONG_PTR) dptr & 0x0f)
{
pstatus_t status;
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
if (startup > width) startup = width;
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
startup, 1, shift, withAlpha, TRUE);
status = generic->YCoCgToRGB_8u_AC4R(
sptr, srcStep, dptr, DstFormat, dstStep,
startup, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += startup * sizeof(UINT32);
dptr += startup * sizeof(UINT32);
w -= startup;
@ -195,8 +199,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
/* Handle any remainder pixels. */
if (w > 0)
{
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
w, 1, shift, withAlpha, TRUE);
pstatus_t status;
status = generic->YCoCgToRGB_8u_AC4R(
sptr, srcStep, dptr, DstFormat, dstStep,
w, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += w * sizeof(UINT32);
dptr += w * sizeof(UINT32);
}
@ -210,8 +219,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
const BYTE* pSrc, INT32 srcStep,
BYTE* pDst, INT32 dstStep,
const BYTE* pSrc, UINT32 srcStep,
BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
UINT32 width, UINT32 height,
UINT8 shift,
BOOL withAlpha)
@ -240,9 +249,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
{
/* Too small, or we'll never hit a 16-byte boundary. Punt. */
return generic->YCoCgToRGB_8u_AC4R(
pSrc, srcStep,
pDst, dstStep, width, height, shift,
withAlpha, FALSE);
pSrc, srcStep, pDst, DstFormat, dstStep,
width, height, shift, withAlpha);
}
for (h = 0; h < height; h++)
@ -253,12 +261,17 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
/* Get to a 16-byte destination boundary. */
if ((ULONG_PTR) dptr & 0x0f)
{
pstatus_t status;
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
if (startup > width) startup = width;
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
startup, 1, shift, withAlpha, FALSE);
status = generic->YCoCgToRGB_8u_AC4R(
sptr, srcStep, dptr, DstFormat,
dstStep, startup, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += startup * sizeof(UINT32);
dptr += startup * sizeof(UINT32);
w -= startup;
@ -370,8 +383,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
/* Handle any remainder pixels. */
if (w > 0)
{
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
w, 1, shift, withAlpha, FALSE);
pstatus_t status;
status = generic->YCoCgToRGB_8u_AC4R(
sptr, srcStep, dptr, DstFormat, dstStep,
w, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += w * sizeof(UINT32);
dptr += w * sizeof(UINT32);
}
@ -388,21 +406,29 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
const BYTE* pSrc, INT32 srcStep,
BYTE* pDst, INT32 dstStep,
BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height,
UINT8 shift,
BOOL withAlpha,
BOOL invert)
BOOL withAlpha)
{
if (invert)
// TODO: Implement proper color conversion in the SSSE3 code paths; until then always fall back to the generic implementation.
return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat,
dstStep, width, height, shift, withAlpha);
switch(DstFormat)
{
return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, dstStep,
width, height, shift, withAlpha);
}
else
{
return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, dstStep,
width, height, shift, withAlpha);
case PIXEL_FORMAT_BGRX32:
case PIXEL_FORMAT_BGRA32:
return ssse3_YCoCgRToRGB_8u_AC4R_invert(
pSrc, srcStep, pDst, DstFormat, dstStep,
width, height, shift, withAlpha);
case PIXEL_FORMAT_RGBX32:
case PIXEL_FORMAT_RGBA32:
return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
pSrc, srcStep, pDst, DstFormat, dstStep,
width, height, shift, withAlpha);
default:
return -1;
}
}
#endif /* WITH_SSE2 */

View File

@ -232,9 +232,9 @@ static pstatus_t general_YUV444SplitToYUV420(
{
/* Filter */
const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x]
+ pSrcU1[2 * x + 1];
+ pSrcU1[2 * x + 1];
const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x]
+ pSrcV1[2 * x + 1];
+ pSrcV1[2 * x + 1];
pU[x] = CLIP(u / 4L);
pV[x] = CLIP(v / 4L);
}
@ -331,7 +331,7 @@ static INLINE BYTE* writePixel(BYTE* dst, UINT32 format, BYTE Y, BYTE U, BYTE V)
const BYTE r = YUV2R(Y, U, V);
const BYTE g = YUV2G(Y, U, V);
const BYTE b = YUV2B(Y, U, V);
UINT32 color = GetColor(format, r, g, b, 0);
UINT32 color = GetColor(format, r, g, b, 0xFF);
WriteColor(dst, format, color);
return dst + GetBytesPerPixel(format);
}
@ -500,9 +500,10 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
}
static pstatus_t general_RGBToYUV444_8u_P3AC4R(
const BYTE* pSrc, const UINT32 srcStep,
const BYTE* pSrc, UINT32 SrcFormat, const UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
{
const UINT32 bpp = GetBytesPerPixel(SrcFormat);
UINT32 x, y;
UINT32 nWidth, nHeight;
nWidth = roi->width;
@ -517,9 +518,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
for (x = 0; x < nWidth; x++)
{
const BYTE B = pRGB[4 * x + 0];
const BYTE G = pRGB[4 * x + 1];
const BYTE R = pRGB[4 * x + 2];
BYTE B, G, R;
const UINT32 color = ReadColor(&pRGB[x * bpp], SrcFormat);
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
pY[x] = RGB2Y(R, G, B);
pU[x] = RGB2U(R, G, B);
pV[x] = RGB2V(R, G, B);
@ -530,9 +532,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
}
static pstatus_t general_RGBToYUV420_8u_P3AC4R(
const BYTE* pSrc, UINT32 srcStep,
const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
{
const UINT32 bpp = GetBytesPerPixel(SrcFormat);
UINT32 x, y;
UINT32 halfWidth;
UINT32 halfHeight;
@ -555,39 +558,50 @@ static pstatus_t general_RGBToYUV420_8u_P3AC4R(
for (x = 0; x < halfWidth; x++)
{
INT32 R, G, B;
UINT32 color;
INT32 Ra, Ga, Ba;
const UINT32 val2x = (x * 2);
const UINT32 val2x1 = val2x + 1;
BYTE B, G, R;
/* 1st pixel */
Ba = B = pRGB[val2x * 4 + 0];
Ga = G = pRGB[val2x * 4 + 1];
Ra = R = pRGB[val2x * 4 + 2];
color = ReadColor(&pRGB[val2x * bpp], SrcFormat);
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ba = B;
Ga = G;
Ra = R;
pY[val2x] = RGB2Y(R, G, B);
if (val2x1 < nWidth)
{
/* 2nd pixel */
Ba += B = pRGB[val2x * 4 + 4];
Ga += G = pRGB[val2x * 4 + 5];
Ra += R = pRGB[val2x * 4 + 6];
color = ReadColor(&pRGB[val2x1 * bpp], SrcFormat);
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ba += B;
Ga += G;
Ra += R;
pY[val2x1] = RGB2Y(R, G, B);
}
if (val2y1 < nHeight)
{
/* 3rd pixel */
Ba += B = pRGB1[val2x * 4 + 0];
Ga += G = pRGB1[val2x * 4 + 1];
Ra += R = pRGB1[val2x * 4 + 2];
color = ReadColor(&pRGB1[val2x * bpp], SrcFormat);
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ba += B;
Ga += G;
Ra += R;
pY1[val2x] = RGB2Y(R, G, B);
if (val2x1 < nWidth)
{
/* 4th pixel */
Ba += B = pRGB1[val2x * 4 + 4];
Ga += G = pRGB1[val2x * 4 + 5];
Ra += R = pRGB1[val2x * 4 + 6];
color = ReadColor(&pRGB1[val2x1 * bpp], SrcFormat);
SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ba += B;
Ga += G;
Ra += R;
pY1[val2x1] = RGB2Y(R, G, B);
}
}

View File

@ -35,6 +35,11 @@ static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(
UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV;
__m128i r0, r1, r2, r3, r4, r5, r6, r7;
__m128i* buffer;
// TODO: Implement proper color conversion in this SSSE3 routine; until then always fall back to the generic implementation.
return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep,
DstFormat, roi);
/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
 * last_column: if it's the last column in a line, set to 10B (for handling line endings that are not a multiple of four) */
buffer = _aligned_malloc(4 * 16, 16);

View File

@ -30,7 +30,7 @@ static pstatus_t general_add_16s(
const INT16* pSrc1,
const INT16* pSrc2,
INT16* pDst,
INT32 len)
UINT32 len)
{
while (len--)
{

View File

@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s,
_mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1))
_mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1))
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif

View File

@ -36,23 +36,19 @@
/* ------------------------------------------------------------------------- */
static pstatus_t general_alphaComp_argb(
const BYTE* pSrc1, INT32 src1Step,
const BYTE* pSrc2, INT32 src2Step,
BYTE* pDst, INT32 dstStep,
INT32 width, INT32 height)
const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, UINT32 dstStep,
UINT32 width, UINT32 height)
{
const UINT32* sptr1 = (const UINT32*) pSrc1;
const UINT32* sptr2 = (const UINT32*) pSrc2;
UINT32* dptr = (UINT32*) pDst;
int linebytes = width * sizeof(UINT32);
int src1Jump = (src1Step - linebytes) / sizeof(UINT32);
int src2Jump = (src2Step - linebytes) / sizeof(UINT32);
int dstJump = (dstStep - linebytes) / sizeof(UINT32);
int y;
UINT32 y;
for (y = 0; y < height; y++)
{
int x;
const UINT32* sptr1 = (const UINT32*) (pSrc1 + y * src1Step);
const UINT32* sptr2 = (const UINT32*) (pSrc2 + y * src2Step);
UINT32* dptr = (UINT32*) (pDst + y * dstStep);
UINT32 x;
for (x = 0; x < width; x++)
{
@ -92,10 +88,6 @@ static pstatus_t general_alphaComp_argb(
*dptr++ = rb | ag;
}
}
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;

View File

@ -46,10 +46,10 @@ static primitives_t* generic = NULL;
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
pstatus_t sse2_alphaComp_argb(
const BYTE* pSrc1, INT32 src1Step,
const BYTE* pSrc2, INT32 src2Step,
BYTE* pDst, INT32 dstStep,
INT32 width, INT32 height)
const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, UINT32 dstStep,
UINT32 width, UINT32 height)
{
const UINT32* sptr1 = (const UINT32*) pSrc1;
const UINT32* sptr2 = (const UINT32*) pSrc2;
@ -62,7 +62,7 @@ pstatus_t sse2_alphaComp_argb(
if (width < 4) /* pointless if too small */
{
return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, width, height);
pDst, dstStep, width, height);
}
dptr = (UINT32*) pDst;
@ -108,9 +108,13 @@ pstatus_t sse2_alphaComp_argb(
if (leadIn)
{
generic->alphaComp_argb((const BYTE*) sptr1,
src1Step, (const BYTE*) sptr2, src2Step,
(BYTE*) dptr, dstStep, leadIn, 1);
pstatus_t status;
status = generic->alphaComp_argb((const BYTE*) sptr1,
src1Step, (const BYTE*) sptr2, src2Step,
(BYTE*) dptr, dstStep, leadIn, 1);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr1 += leadIn;
sptr2 += leadIn;
dptr += leadIn;
@ -181,9 +185,13 @@ pstatus_t sse2_alphaComp_argb(
/* Finish off the remainder. */
if (pixels)
{
generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
(const BYTE*) sptr2, src2Step,
(BYTE*) dptr, dstStep, pixels, 1);
pstatus_t status;
status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
(const BYTE*) sptr2, src2Step,
(BYTE*) dptr, dstStep, pixels, 1);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr1 += pixels;
sptr2 += pixels;
dptr += pixels;
@ -212,7 +220,7 @@ static pstatus_t ipp_alphaComp_argb(
sz.width = width;
sz.height = height;
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, sz, ippAlphaOver);
pDst, dstStep, sz, ippAlphaOver);
}
#endif

View File

@ -262,7 +262,7 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
for (y = 0; y < roi->height; y++)
{
int x;
UINT32 x;
for (x = 0; x < roi->width; ++x)
{
@ -305,10 +305,10 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
/* ------------------------------------------------------------------------- */
static pstatus_t general_RGBToRGB_16s8u_P3AC4R(
const INT16* pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
UINT32 srcStep, /* bytes between rows in source data */
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */
UINT32 dstStep, /* bytes between rows in dest data */
UINT32 DstFormat,
const prim_size_t* roi) /* region of interest */
{

View File

@ -91,7 +91,7 @@ static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
{
/* We can't maintain 16-byte alignment. */
return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
pDst, dstStep, roi);
}
zero = _mm_setzero_si128();
@ -228,7 +228,7 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
{
/* We can't maintain 16-byte alignment. */
return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
pDst, dstStep, roi);
}
min = _mm_set1_epi16(-128 * 32);
@ -357,10 +357,10 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
_mm_set1_epi32(0xFFFFFFFFU)
pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
const INT16* pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */
const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
UINT32 srcStep, /* bytes between rows in source data */
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */
UINT32 dstStep, /* bytes between rows in dest data */
UINT32 DstFormat,
const prim_size_t* roi) /* region of interest */
{
@ -385,9 +385,13 @@ pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
|| (dstStep & 0x0f))
{
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
dstStep, DstFormat, roi);
dstStep, DstFormat, roi);
}
// TODO: Update the SSE2 code below to support color conversion; until then always fall back to the generic implementation.
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
dstStep, DstFormat, roi);
out = (BYTE*) pDst;
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
dstbump = (dstStep - (roi->width * sizeof(UINT32)));

View File

@ -29,7 +29,7 @@
static pstatus_t general_set_8u(
BYTE val,
BYTE* pDst,
INT32 len)
UINT32 len)
{
memset((void*) pDst, (int) val, (size_t) len);
return PRIMITIVES_SUCCESS;
@ -48,7 +48,7 @@ static pstatus_t general_zero(
static pstatus_t general_set_32s(
INT32 val,
INT32* pDst,
INT32 len)
UINT32 len)
{
INT32* dptr = (INT32*) pDst;
size_t span, remaining;
@ -85,7 +85,7 @@ static pstatus_t general_set_32s(
static pstatus_t general_set_32u(
UINT32 val,
UINT32* pDst,
INT32 len)
UINT32 len)
{
UINT32* dptr = (UINT32*) pDst;
size_t span, remaining;

View File

@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
static pstatus_t sse2_set_8u(
BYTE val,
BYTE* pDst,
INT32 len)
UINT32 len)
{
BYTE byte, *dptr;
__m128i xmm0;
@ -126,7 +126,7 @@ static pstatus_t sse2_set_8u(
static pstatus_t sse2_set_32u(
UINT32 val,
UINT32* pDst,
INT32 len)
UINT32 len)
{
const primitives_t* prim = primitives_get_generic();
UINT32* dptr = (UINT32*) pDst;
@ -218,7 +218,7 @@ static pstatus_t sse2_set_32u(
static pstatus_t sse2_set_32s(
INT32 val,
INT32* pDst,
INT32 len)
UINT32 len)
{
UINT32 uval = *((UINT32*) &val);
return sse2_set_32u(uval, (UINT32*) pDst, len);

View File

@ -24,9 +24,9 @@
/* ------------------------------------------------------------------------- */
static pstatus_t general_lShiftC_16s(
const INT16* pSrc,
INT32 val,
UINT32 val,
INT16* pDst,
INT32 len)
UINT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
@ -38,9 +38,9 @@ static pstatus_t general_lShiftC_16s(
/* ------------------------------------------------------------------------- */
static pstatus_t general_rShiftC_16s(
const INT16* pSrc,
INT32 val,
UINT32 val,
INT16* pDst,
INT32 len)
UINT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
@ -52,9 +52,9 @@ static pstatus_t general_rShiftC_16s(
/* ------------------------------------------------------------------------- */
static pstatus_t general_lShiftC_16u(
const UINT16* pSrc,
INT32 val,
UINT32 val,
UINT16* pDst,
INT32 len)
UINT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
@ -66,9 +66,9 @@ static pstatus_t general_lShiftC_16u(
/* ------------------------------------------------------------------------- */
static pstatus_t general_rShiftC_16u(
const UINT16* pSrc,
INT32 val,
UINT32 val,
UINT16* pDst,
INT32 len)
UINT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
@ -82,7 +82,7 @@ static pstatus_t general_shiftC_16s(
const INT16* pSrc,
INT32 val,
INT16* pDst,
INT32 len)
UINT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;
@ -95,7 +95,7 @@ static pstatus_t general_shiftC_16u(
const UINT16* pSrc,
INT32 val,
UINT16* pDst,
INT32 len)
UINT32 len)
{
if (val == 0) return PRIMITIVES_SUCCESS;

View File

@ -39,16 +39,16 @@ static primitives_t* generic = NULL;
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s,
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u,
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif

View File

@ -28,7 +28,7 @@
static pstatus_t general_sign_16s(
const INT16* pSrc,
INT16* pDst,
INT32 len)
UINT32 len)
{
while (len--)
{

View File

@ -35,7 +35,7 @@ static primitives_t* generic = NULL;
static pstatus_t ssse3_sign_16s(
const INT16* pSrc,
INT16* pDst,
INT32 len)
UINT32 len)
{
const INT16* sptr = (const INT16*) pSrc;
INT16* dptr = (INT16*) pDst;

View File

@ -44,143 +44,143 @@
* SCD = Source, Constant, Destination
*/
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
static pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
{ \
INT32 shifts; \
UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \
size_t count; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc, val, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 8 128-bit SSE registers. */ \
count = len >> (8-shifts); \
len -= count << (8-shifts); \
if ((ULONG_PTR) sptr & 0x0f) \
{ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
static pstatus_t _name_(const _type_ *pSrc, UINT32 val, _type_ *pDst, UINT32 len) \
{ \
INT32 shifts; \
UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \
size_t count; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc, val, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 8 128-bit SSE registers. */ \
count = len >> (8-shifts); \
len -= count << (8-shifts); \
if ((ULONG_PTR) sptr & 0x0f) \
{ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
}
/* ----------------------------------------------------------------------------
@ -189,228 +189,230 @@
*/
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \
size_t count; \
__m128i xmm0; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc, val, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7-shifts); \
len -= count << (7-shifts); \
xmm0 = _mm_set1_epi32(val); \
if ((ULONG_PTR) sptr & 0x0f) \
{ \
while (count--) \
{ \
__m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
while (count--) \
{ \
__m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
{ \
int shifts; \
UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \
size_t count; \
__m128i xmm0; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc, val, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7-shifts); \
len -= count << (7-shifts); \
xmm0 = _mm_set1_epi32(val); \
if ((ULONG_PTR) sptr & 0x0f) \
{ \
while (count--) \
{ \
__m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
while (count--) \
{ \
__m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
}
/* ----------------------------------------------------------------------------
* SSD = Source1, Source2, Destination
*/
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
const _type_ *sptr1 = pSrc1; \
const _type_ *sptr2 = pSrc2; \
_type_ *dptr = pDst; \
size_t count; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc1, pSrc2, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc1, pSrc2, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
_slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7-shifts); \
len -= count << (7-shifts); \
if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
{ \
/* Unaligned loads */ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
/* Aligned loads */ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm1 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm0, xmm1; \
xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm1); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, UINT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
const _type_ *sptr1 = pSrc1; \
const _type_ *sptr2 = pSrc2; \
_type_ *dptr = pDst; \
size_t count; \
if (len < 16) /* pointless if too small */ \
{ \
return _fallback_(pSrc1, pSrc2, pDst, len); \
} \
if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \
{ \
/* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc1, pSrc2, pDst, len); \
} \
/* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \
{ \
pstatus_t status; \
status = _slowWay_; \
if (status != PRIMITIVES_SUCCESS) return status; \
if (--len == 0) return PRIMITIVES_SUCCESS; \
} \
/* Use 4 128-bit SSE registers. */ \
count = len >> (7-shifts); \
len -= count << (7-shifts); \
if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
{ \
/* Unaligned loads */ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
} \
} \
else \
{ \
/* Aligned loads */ \
while (count--) \
{ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm1 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \
xmm1 = _op_(xmm1, xmm5); \
xmm2 = _op_(xmm2, xmm6); \
xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \
} \
} \
/* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \
len -= count << (5-shifts); \
while (count--) \
{ \
__m128i xmm0, xmm1; \
xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm1); \
_mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \
} \
/* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
}
#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */

View File

@ -81,6 +81,6 @@ primitives_t* primitives_get_generic(void)
if (!pPrimitivesGenericInitialized)
primitives_init_generic();
return &pPrimitives;
return &pPrimitivesGeneric;
}

View File

@ -26,7 +26,7 @@ static BOOL test_add16s_func(void)
pstatus_t status;
INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]), ALIGN(src2[FUNC_TEST_SIZE + 3]),
ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]);
ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]);
char testStr[256];
testStr[0] = '\0';
@ -50,7 +50,7 @@ static BOOL test_add16s_func(void)
static BOOL test_add16s_speed(void)
{
BYTE ALIGN(src1[MAX_TEST_SIZE + 3]), ALIGN(src2[MAX_TEST_SIZE + 3]),
ALIGN(dst[MAX_TEST_SIZE + 3]);
ALIGN(dst[MAX_TEST_SIZE + 3]);
if (!g_TestPrimitivesPerformance)
return TRUE;
@ -59,7 +59,8 @@ static BOOL test_add16s_speed(void)
winpr_RAND(src2, sizeof(src2));
if (!speed_test("add16s", "aligned", g_Iterations,
generic->add_16s, optimized->add_16s,
(speed_test_fkt)generic->add_16s,
(speed_test_fkt)optimized->add_16s,
src1, src2, dst, FUNC_TEST_SIZE))
return FALSE;
@ -72,8 +73,11 @@ int TestPrimitivesAdd(int argc, char* argv[])
if (!test_add16s_func())
return -1;
if (!test_add16s_speed())
return -1;
if (g_TestPrimitivesPerformance)
{
if (!test_add16s_speed())
return -1;
}
return 0;
}

View File

@ -33,8 +33,13 @@ static const int block_size[] = { 4, 64, 256 };
#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
#define BLU(_c_) ((_c_) & 0x000000FFU)
#define TOLERANCE 1
#define PIXEL(_addr_, _bytes_, _x_, _y_) \
((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_)))
/*
 * Return a read-only pointer to the 32-bit pixel at column _x_, row _y_ of
 * the buffer starting at _addr_. Consecutive rows are _bytes_ bytes apart
 * and every pixel occupies sizeof(UINT32) bytes.
 */
static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_)
{
	/* Step to the start of the requested row first ... */
	const BYTE* row = _addr_ + _y_ * _bytes_;
	/* ... then to the requested pixel inside that row. */
	return (const UINT32*)(row + _x_ * sizeof(UINT32));
}
#define SRC1_WIDTH 6
#define SRC1_HEIGHT 6
#define SRC2_WIDTH 7
@ -46,8 +51,8 @@ static const int block_size[] = { 4, 64, 256 };
/* ------------------------------------------------------------------------- */
static UINT32 alpha_add(
UINT32 c1,
UINT32 c2)
UINT32 c1,
UINT32 c2)
{
UINT32 a1 = ALF(c1);
UINT32 r1 = RED(c1);
@ -66,8 +71,8 @@ static UINT32 alpha_add(
/* ------------------------------------------------------------------------- */
static UINT32 colordist(
UINT32 c1,
UINT32 c2)
UINT32 c1,
UINT32 c2)
{
int d, maxd = 0;
d = ABS(ALF(c1) - ALF(c2));
@ -90,10 +95,10 @@ static UINT32 colordist(
}
/* ------------------------------------------------------------------------- */
static BOOL check(const BYTE* pSrc1, INT32 src1Step,
const BYTE* pSrc2, INT32 src2Step,
BYTE* pDst, INT32 dstStep,
INT32 width, INT32 height)
static BOOL check(const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, UINT32 dstStep,
UINT32 width, UINT32 height)
{
UINT32 x, y;
for (y = 0; y < height; ++y)
@ -120,14 +125,14 @@ static BOOL check(const BYTE* pSrc1, INT32 src1Step,
static BOOL test_alphaComp_func(void)
{
pstatus_t status;
BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT]);
BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT]);
BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT]);
char testStr[256];
BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]);
BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]);
BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]);
UINT32* ptr;
UINT32 i;
testStr[0] = '\0';
winpr_RAND((BYTE*)src1, sizeof(src1));
/* Special-case the first two values */
src1[0] &= 0x00FFFFFFU;
src1[1] |= 0xFF000000U;
@ -141,8 +146,8 @@ static BOOL test_alphaComp_func(void)
memset(dst1, 0, sizeof(dst1));
status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH,
src2, 4 * SRC2_WIDTH,
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
src2, 4 * SRC2_WIDTH,
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
@ -152,8 +157,8 @@ static BOOL test_alphaComp_func(void)
return FALSE;
status = optimized->alphaComp_argb((const BYTE*) src1, 4 * SRC1_WIDTH,
(const BYTE*) src2, 4 * SRC2_WIDTH,
(BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
(const BYTE*) src2, 4 * SRC2_WIDTH,
(BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
@ -188,7 +193,8 @@ static int test_alphaComp_speed(void)
memset(dst1, 0, sizeof(dst1));
if (!speed_test("add16s", "aligned", g_Iterations,
generic->alphaComp_argb, optimized->alphaComp_argb,
(speed_test_fkt)generic->alphaComp_argb,
(speed_test_fkt)optimized->alphaComp_argb,
src1, 4 * SRC1_WIDTH,
src2, 4 * SRC2_WIDTH,
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
@ -203,8 +209,11 @@ int TestPrimitivesAlphaComp(int argc, char* argv[])
if (!test_alphaComp_func())
return -1;
if (!test_alphaComp_speed())
return -1;
if (g_TestPrimitivesPerformance)
{
if (!test_alphaComp_speed())
return -1;
}
return 0;
}

View File

@ -20,99 +20,87 @@
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
static const int ANDOR_PRETEST_ITERATIONS = 100000;
static const int TEST_TIME = 2.0; // seconds
#define VALUE (0xA5A5A5A5U)
/* ========================================================================= */
/*
 * Run one andC_32u implementation and verify its output.
 *
 * name  label printed in the failure message
 * fkt   implementation under test
 * src   input values
 * val   constant that is ANDed with every input value
 * dst   output buffer, must hold at least 'size' elements
 * size  number of elements to process
 *
 * Returns TRUE when fkt reports PRIMITIVES_SUCCESS and every dst[i] equals
 * (src[i] & val). Returns FALSE if fkt fails or on the first mismatch, which
 * is printed to stdout.
 */
static BOOL test_and_32u_impl(const char* name, __andC_32u_t fkt,
                              const UINT32* src, const UINT32 val,
                              UINT32* dst, size_t size)
{
	size_t i;
	pstatus_t status = fkt(src, val, dst, size);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	for (i = 0; i < size; ++i)
	{
		if (dst[i] != (src[i] & val))
		{
			/* 'i' is a size_t: passing it to %d is undefined behaviour,
			 * so print it through an explicit unsigned long cast. */
			printf("AND %s FAIL[%lu] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
			       name, (unsigned long)i, src[i], val, src[i] & val, dst[i]);
			return FALSE;
		}
	}

	return TRUE;
}
static BOOL test_and_32u_func(void)
{
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
int failed = 0;
int i;
char testStr[256];
testStr[0] = '\0';
winpr_RAND(src, sizeof(src));
generic->andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
strcat(testStr, " general");
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
{
if (dst[i] != (src[i] & VALUE))
{
printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i]);
++failed;
}
}
winpr_RAND((BYTE*)src, sizeof(src));
#ifdef WITH_SSE2
if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u,
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u,
src + 1, VALUE, dst + 2, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u,
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u,
src + 1, VALUE, dst + 2, FUNC_TEST_SIZE))
return FALSE;
if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
strcat(testStr, " SSE3");
/* Aligned */
memset(dst, 0, sizeof(dst));
sse3_andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
{
if (dst[i] != (src[i] & VALUE))
{
printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i]);
++failed;
}
}
/* Unaligned */
memset(dst, 0, sizeof(dst));
sse3_andC_32u(src + 1, VALUE, dst + 2, FUNC_TEST_SIZE);
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
{
if (dst[i + 1] != (src[i] & VALUE))
{
printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i + 1]);
++failed;
}
}
}
#endif /* i386 */
if (!failed) printf("All and_32u tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
return TRUE;
}
/* ------------------------------------------------------------------------- */
static BOOL test_and_32u_speed(void)
{
UINT32 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
winpr_RAND(src, sizeof(src));
andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst,
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
andC_32u_speed_test("and32u", "unaligned", src + 1, NULL, VALUE, dst,
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
winpr_RAND((BYTE*)src, sizeof(src));
if (!speed_test("andC_32u", "aligned", g_Iterations,
(speed_test_fkt)generic->andC_32u,
(speed_test_fkt)optimized->andC_32u,
src + 1, VALUE, dst + 1, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("andC_32u", "unaligned", g_Iterations,
(speed_test_fkt)generic->andC_32u,
(speed_test_fkt)optimized->andC_32u,
src + 1, VALUE, dst + 2, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
/* ========================================================================= */
static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
{
UINT32 i;
UINT32 failed = 0;
for (i = 1; i <= size; ++i)
for (i = 0; i < size; ++i)
{
if (dst[i] != (src[i] | value))
{
printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], value, src[i] | value, dst[i]);
++failed;
return FALSE;
}
}
@ -123,8 +111,7 @@ static BOOL test_or_32u_func(void)
{
pstatus_t status;
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
char testStr[256];
testStr[0] = '\0';
winpr_RAND((BYTE*)src, sizeof(src));
status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
@ -153,7 +140,8 @@ static BOOL test_or_32u_speed(void)
winpr_RAND((BYTE*)src, sizeof(src));
if (!speed_test("add16s", "aligned", g_Iterations,
generic->orC_32u, optimized->orC_32u,
(speed_test_fkt)generic->orC_32u,
(speed_test_fkt)optimized->orC_32u,
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
return FALSE;
@ -167,14 +155,16 @@ int TestPrimitivesAndOr(int argc, char* argv[])
if (!test_and_32u_func())
return -1;
if (!test_and_32u_speed())
return -1;
if (!test_or_32u_func())
return -1;
if (!test_or_32u_speed())
return -1;
if (g_TestPrimitivesPerformance)
{
if (!test_and_32u_speed())
return -1;
if (!test_or_32u_speed())
return -1;
}
return 0;
}

View File

@ -24,19 +24,16 @@ static const int YCBCR_TRIAL_ITERATIONS = 1000;
static const float TEST_TIME = 4.0;
/* ------------------------------------------------------------------------- */
int test_RGBToRGB_16s8u_P3AC4R_func(void)
static BOOL test_RGBToRGB_16s8u_P3AC4R_func(void)
{
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
UINT32 ALIGN(out1[4096]);
#ifdef WITH_SSE2
UINT32 ALIGN(out2[4096]);
#endif
int i;
int failed = 0;
char testStr[256];
BOOL failed = FALSE;
INT16* ptrs[3];
prim_size_t roi = { 64, 64 };
testStr[0] = '\0';
winpr_RAND((BYTE*)r, sizeof(r));
winpr_RAND((BYTE*)g, sizeof(g));
winpr_RAND((BYTE*)b, sizeof(b));
@ -52,56 +49,38 @@ int test_RGBToRGB_16s8u_P3AC4R_func(void)
ptrs[0] = r;
ptrs[1] = g;
ptrs[2] = b;
generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
(BYTE*) out1, 64 * 4, &roi);
#ifdef WITH_SSE2
if (generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
(BYTE*) out1, 64 * 4, PIXEL_FORMAT_RGBA32,
&roi) != PRIMITIVES_SUCCESS)
return FALSE;
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
if (optimized->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
(BYTE*) out2, 64 * 4, PIXEL_FORMAT_RGBA32,
&roi) != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < 4096; ++i)
{
strcat(testStr, " SSE2");
sse2_RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
(BYTE*) out2, 64 * 4, &roi);
for (i = 0; i < 4096; ++i)
if (out1[i] != out2[i])
{
if (out1[i] != out2[i])
{
printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
i, out1[i], i, out2[i]);
failed = 1;
}
printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
i, out1[i], i, out2[i]);
failed = TRUE;
}
}
#endif /* i386 */
if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
return !failed;
}
/* ------------------------------------------------------------------------- */
static const prim_size_t roi64x64 = { 64, 64 };
STD_SPEED_TEST(
rgb_to_argb_speed, INT16*, UINT32, dst = dst,
TRUE, generic->RGBToRGB_16s8u_P3AC4R(
(const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
#ifdef WITH_SSE2
TRUE, sse2_RGBToRGB_16s8u_P3AC4R(
(const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, dst = dst);
int test_RGBToRGB_16s8u_P3AC4R_speed(void)
static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void)
{
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
UINT32 ALIGN(dst[4096]);
const prim_size_t roi64x64 = { 64, 64 };
INT16 ALIGN(r[4096+1]), ALIGN(g[4096+1]), ALIGN(b[4096+1]);
UINT32 ALIGN(dst[4096+1]);
int i;
INT16* ptrs[3];
int size_array[] = { 64 };
winpr_RAND((BYTE*)r, sizeof(r));
winpr_RAND((BYTE*)g, sizeof(g));
winpr_RAND((BYTE*)b, sizeof(b));
@ -114,29 +93,38 @@ int test_RGBToRGB_16s8u_P3AC4R_speed(void)
b[i] &= 0x00FFU;
}
ptrs[0] = r;
ptrs[1] = g;
ptrs[2] = b;
rgb_to_argb_speed("RGBToARGB", "aligned",
(const INT16**) ptrs, NULL, 0, dst,
size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
return SUCCESS;
ptrs[0] = r+1;
ptrs[1] = g+1;
ptrs[2] = b+1;
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations,
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
(const INT16**) ptrs, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64))
return FALSE;
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
(const INT16**) ptrs, 64 * 2, ((BYTE*) dst)+1, 64 * 4, &roi64x64))
return FALSE;
return TRUE;
}
/* ========================================================================= */
int test_yCbCrToRGB_16s16s_P3P3_func(void)
static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
{
pstatus_t status;
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
int i;
int failed = 0;
char testStr[256];
const INT16* in[3];
INT16* out1[3];
INT16* out2[3];
prim_size_t roi = { 64, 64 };
testStr[0] = '\0';
winpr_RAND((BYTE*)y, sizeof(y));
winpr_RAND((BYTE*)cb, sizeof(cb));
winpr_RAND((BYTE*)cr, sizeof(cr));
@ -164,57 +152,40 @@ int test_yCbCrToRGB_16s16s_P3P3_func(void)
out2[0] = r2;
out2[1] = g2;
out2[2] = b2;
generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
#ifdef WITH_SSE2
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < 4096; ++i)
{
strcat(testStr, " SSE2");
sse2_yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
for (i = 0; i < 4096; ++i)
if ((ABS(r1[i] - r2[i]) > 1)
|| (ABS(g1[i] - g2[i]) > 1)
|| (ABS(b1[i] - b2[i]) > 1))
{
if ((ABS(r1[i] - r2[i]) > 1)
|| (ABS(g1[i] - g2[i]) > 1)
|| (ABS(b1[i] - b2[i]) > 1))
{
printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
failed = 1;
}
printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
return FALSE;
}
}
#endif /* i386 */
if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
return TRUE;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(
ycbcr_to_rgb_speed, INT16*, INT16*, dst = dst,
TRUE, generic->yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
#ifdef WITH_SSE2
TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#elif defined(WITH_NEON)
TRUE, neon_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, dst = dst);
static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
{
prim_size_t roi = { 64, 64 };
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
int i;
const INT16* input[3];
INT16* output[3];
int size_array[] = { 64 };
winpr_RAND((BYTE*)y, sizeof(y));
winpr_RAND((BYTE*)cb, sizeof(cb));
winpr_RAND((BYTE*)cr, sizeof(cr));
@ -233,37 +204,35 @@ static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
output[0] = r;
output[1] = g;
output[2] = b;
ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output,
size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME);
return SUCCESS;
if (!speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
(speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
(speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3,
input, 64 * 2, output, 64 * 2, &roi))
return FALSE;
return TRUE;
}
int TestPrimitivesColors(int argc, char* argv[])
{
int status;
status = test_RGBToRGB_16s8u_P3AC4R_func();
prim_test_setup(FALSE);
if (status != SUCCESS)
if (!test_RGBToRGB_16s8u_P3AC4R_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_RGBToRGB_16s8u_P3AC4R_speed();
if (status != SUCCESS)
if (!test_RGBToRGB_16s8u_P3AC4R_speed())
return 1;
}
status = test_yCbCrToRGB_16s16s_P3P3_func();
if (status != SUCCESS)
if (!test_yCbCrToRGB_16s16s_P3P3_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_yCbCrToRGB_16s16s_P3P3_speed();
if (status != SUCCESS)
if (!test_yCbCrToRGB_16s16s_P3P3_speed())
return 1;
}

View File

@ -19,22 +19,17 @@
#include <winpr/sysinfo.h>
#include "prim_test.h"
static const int MEMCPY_PRETEST_ITERATIONS = 1000000;
static const int TEST_TIME = 1.0; // seconds
#define COPY_TESTSIZE (256*2+16*2+15+15)
/* ------------------------------------------------------------------------- */
static int test_copy8u_func(void)
static BOOL test_copy8u_func(void)
{
primitives_t* prims = primitives_get();
BYTE ALIGN(data[COPY_TESTSIZE + 15]);
int i, soff;
int failed = 0;
char testStr[256];
BYTE ALIGN(dest[COPY_TESTSIZE + 15]);
testStr[0] = '\0';
winpr_RAND(data, sizeof(data));
strcat(testStr, " ptr");
for (soff = 0; soff < 16; ++soff)
{
@ -47,7 +42,8 @@ static int test_copy8u_func(void)
for (length = 1; length <= COPY_TESTSIZE - doff; ++length)
{
memset(dest, 0, sizeof(dest));
prims->copy_8u(data + soff, dest + doff, length);
if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < length; ++i)
{
@ -57,48 +53,47 @@ static int test_copy8u_func(void)
"data[%d]=0x%02x\n",
doff, length, i + doff, dest[i + doff],
i + soff, data[i + soff]);
failed = 1;
return FALSE;
}
}
}
}
}
if (!failed) printf("All copy8 tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
return TRUE;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst = dst,
TRUE, memcpy(dst, src1, size),
FALSE, PRIM_NOP, 0, FALSE,
TRUE, ippsCopy_8u(src1, dst, size));
int test_copy8u_speed(void)
static BOOL test_copy8u_speed(void)
{
BYTE ALIGN(src[MAX_TEST_SIZE + 4]);
BYTE ALIGN(dst[MAX_TEST_SIZE + 4]);
copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
copy8u_speed_test("copy8u", "unaligned", src + 1, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
if (!speed_test("copy_8u", "aligned", g_Iterations,
(speed_test_fkt)generic->copy_8u,
(speed_test_fkt)optimized->copy_8u,
src, dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("copy_8u", "unaligned", g_Iterations,
(speed_test_fkt)generic->copy_8u,
(speed_test_fkt)optimized->copy_8u,
src+1, dst+1, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
int TestPrimitivesCopy(int argc, char* argv[])
{
int status;
status = test_copy8u_func();
prim_test_setup(FALSE);
if (status != SUCCESS)
if (!test_copy8u_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_copy8u_speed();
if (status != SUCCESS)
if (!test_copy8u_speed())
return 1;
}

View File

@ -53,14 +53,14 @@ static BOOL test_set8u_func(void)
{
UINT32 len;
memset(dest, 0, sizeof(dest));
memset(dest, 3, sizeof(dest));
for (len = 1; len < 48 - off; ++len)
{
status = generic->set_8u(0xa5, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check8(dest, len, off, 0xa8))
if (!check8(dest, len, off, 0xa5))
return FALSE;
}
}
@ -69,14 +69,14 @@ static BOOL test_set8u_func(void)
{
UINT32 len;
memset(dest, 0, sizeof(dest));
memset(dest, 3, sizeof(dest));
for (len = 1; len < 48 - off; ++len)
{
status = optimized->set_8u(0xa5, dest + off, len);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
if (!check8(dest, len, off, 0xa8))
if (!check8(dest, len, off, 0xa5))
return FALSE;
}
}
@ -95,8 +95,9 @@ static BOOL test_set8u_speed(void)
{
winpr_RAND(&value, sizeof(value));
if (!speed_test("set_8u", "", g_Iterations,
generic->set_8u, optimized->set_8u,
value, dest + x, len))
(speed_test_fkt)generic->set_8u,
(speed_test_fkt)optimized->set_8u,
value, dest + x, x))
return FALSE;
}
@ -232,8 +233,9 @@ static BOOL test_set32u_speed(void)
{
winpr_RAND(&value, sizeof(value));
if (!speed_test("set_32u", "", g_Iterations,
generic->set_32u, optimized->set_32u,
value, dest + x, len))
(speed_test_fkt)generic->set_32u,
(speed_test_fkt)optimized->set_32u,
value, dest + x, x))
return FALSE;
}
@ -251,8 +253,9 @@ static BOOL test_set32s_speed(void)
{
winpr_RAND(&value, sizeof(value));
if (!speed_test("set_32s", "", g_Iterations,
generic->set_32s, optimized->set_32s,
value, dest + x, len))
(speed_test_fkt)generic->set_32s,
(speed_test_fkt)optimized->set_32s,
value, dest + x, x))
return FALSE;
}
@ -265,21 +268,20 @@ int TestPrimitivesSet(int argc, char* argv[])
if (!test_set8u_func())
return -1;
if (!test_set8u_speed())
return -1;
if (!test_set32s_func())
return -1;
if (!test_set32s_speed())
return -1;
if (!test_set32u_func())
return -1;
if (!test_set32u_speed())
return -1;
if (g_TestPrimitivesPerformance)
{
if (!test_set8u_speed())
return -1;
if (!test_set32s_speed())
return -1;
if (!test_set32u_speed())
return -1;
}
return 0;
}

View File

@ -20,207 +20,361 @@
#include "prim_test.h"
#define FUNC_TEST_SIZE 65536
static const int SHIFT_PRETEST_ITERATIONS = 50000;
static const float TEST_TIME = 1.0;
#ifdef WITH_SSE2
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
int _name_(void) \
{ \
_type_ ALIGN(src[FUNC_TEST_SIZE+3]), \
ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \
int failed = 0; \
int i; \
char testStr[256]; \
testStr[0] = '\0'; \
get_random_data(src, sizeof(src)); \
_f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \
if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) \
{ \
strcat(testStr, " SSE3"); \
/* Aligned */ \
_f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
{ \
if (d1[i] != d2[i]) \
{ \
printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
_str_, i, src[i], d1[i], d2[i]); \
++failed; \
} \
} \
/* Unaligned */ \
_f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
{ \
if (d1[i] != d2[i+1]) \
{ \
printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
_str_, i, src[i], d1[i], d2[i+1]); \
++failed; \
} \
} \
} \
if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \
return (failed > 0) ? FAILURE : SUCCESS; \
}
#else
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
int _name_(void) \
{ \
return SUCCESS; \
}
#endif /* i386 */
SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s,
sse2_lShiftC_16s)
SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u,
sse2_lShiftC_16u)
SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s,
sse2_rShiftC_16s)
SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u,
sse2_rShiftC_16u)
/* ========================================================================= */
STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst = dst,
TRUE, general_lShiftC_16s(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_lShiftC_16s(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsLShiftC_16s(src1, constant, dst, size));
STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst = dst,
TRUE, general_lShiftC_16u(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_lShiftC_16u(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsLShiftC_16u(src1, constant, dst, size));
STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst = dst,
TRUE, general_rShiftC_16s(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_rShiftC_16s(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsRShiftC_16s(src1, constant, dst, size));
STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst = dst,
TRUE, general_rShiftC_16u(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_rShiftC_16u(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsRShiftC_16u(src1, constant, dst, size));
/* ------------------------------------------------------------------------- */
int test_lShift_16s_speed(void)
static BOOL test_lShift_16s_func(void)
{
INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
winpr_RAND(src, sizeof(src));
speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_lShift_16s("lShift_16s", "unaligned", src + 1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
pstatus_t status;
INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 val;
winpr_RAND((BYTE*)&val, sizeof(val));
winpr_RAND((BYTE*)src, sizeof(src));
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
static BOOL test_lShift_16u_func(void)
{
pstatus_t status;
UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 val;
winpr_RAND((BYTE*)&val, sizeof(val));
winpr_RAND((BYTE*)src, sizeof(src));
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->lShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->lShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
static BOOL test_rShift_16s_func(void)
{
pstatus_t status;
INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 val;
winpr_RAND((BYTE*)&val, sizeof(val));
winpr_RAND((BYTE*)src, sizeof(src));
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->rShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->rShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
static BOOL test_rShift_16u_func(void)
{
pstatus_t status;
UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 val;
winpr_RAND((BYTE*)&val, sizeof(val));
winpr_RAND((BYTE*)src, sizeof(src));
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->rShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->rShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->rShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
static BOOL test_ShiftWrapper_16s_func(void)
{
pstatus_t status;
INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 tmp;
INT32 val;
winpr_RAND((BYTE*)&tmp, sizeof(tmp));
winpr_RAND((BYTE*)src, sizeof(src));
val = (tmp % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
static BOOL test_ShiftWrapper_16u_func(void)
{
pstatus_t status;
UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 tmp;
INT32 val;
winpr_RAND((BYTE*)&tmp, sizeof(tmp));
winpr_RAND((BYTE*)src, sizeof(src));
val = (tmp % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
/* ------------------------------------------------------------------------- */
int test_lShift_16u_speed(void)
static BOOL test_lShift_16s_speed(void)
{
UINT32 val;
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
winpr_RAND((BYTE*)src, sizeof(src));
winpr_RAND((BYTE*)&val, sizeof(val));
if (!speed_test("lShift_16s", "aligned", g_Iterations,
(speed_test_fkt)generic->lShiftC_16s,
(speed_test_fkt)optimized->lShiftC_16s, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("lShift_16s", "unaligned", g_Iterations,
(speed_test_fkt)generic->lShiftC_16s,
(speed_test_fkt)optimized->lShiftC_16s, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
/* ------------------------------------------------------------------------- */
static BOOL test_lShift_16u_speed(void)
{
UINT32 val;
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
winpr_RAND(src, sizeof(src));
speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_lShift_16u("lShift_16u", "unaligned", src + 1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
winpr_RAND((BYTE*)&val, sizeof(val));
winpr_RAND((BYTE*)src, sizeof(src));
if (!speed_test("lShift_16u", "aligned", g_Iterations,
(speed_test_fkt)generic->lShiftC_16u,
(speed_test_fkt)optimized->lShiftC_16u, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("lShift_16u", "unaligned", g_Iterations,
(speed_test_fkt)generic->lShiftC_16u,
(speed_test_fkt)optimized->lShiftC_16u, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
/* ------------------------------------------------------------------------- */
int test_rShift_16s_speed(void)
static BOOL test_rShift_16s_speed(void)
{
INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
winpr_RAND(src, sizeof(src));
speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_rShift_16s("rShift_16s", "unaligned", src + 1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
UINT32 val;
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
winpr_RAND((BYTE*)src, sizeof(src));
winpr_RAND((BYTE*)&val, sizeof(val));
if (!speed_test("rShift_16s", "aligned", g_Iterations,
(speed_test_fkt)generic->rShiftC_16s,
(speed_test_fkt)optimized->rShiftC_16s, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("rShift_16s", "unaligned", g_Iterations,
(speed_test_fkt)generic->rShiftC_16s,
(speed_test_fkt)optimized->rShiftC_16s, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
/* ------------------------------------------------------------------------- */
int test_rShift_16u_speed(void)
static BOOL test_rShift_16u_speed(void)
{
UINT32 val;
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
winpr_RAND(src, sizeof(src));
speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
speed_rShift_16u("rShift_16u", "unaligned", src + 1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
winpr_RAND((BYTE*)&val, sizeof(val));
winpr_RAND((BYTE*)src, sizeof(src));
if (!speed_test("rShift_16u", "aligned", g_Iterations,
(speed_test_fkt)generic->rShiftC_16u,
(speed_test_fkt)optimized->rShiftC_16u, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("rShift_16u", "unaligned", g_Iterations,
(speed_test_fkt)generic->rShiftC_16u,
(speed_test_fkt)optimized->rShiftC_16u, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
int TestPrimitivesShift(int argc, char* argv[])
{
int status;
status = test_lShift_16s_func();
prim_test_setup(FALSE);
if (status != SUCCESS)
if (!test_lShift_16s_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_lShift_16s_speed();
if (status != SUCCESS)
if (!test_lShift_16s_speed())
return 1;
}
status = test_lShift_16u_func();
if (status != SUCCESS)
if (!test_lShift_16u_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_lShift_16u_speed();
if (status != SUCCESS)
if (!test_lShift_16u_speed())
return 1;
}
status = test_rShift_16s_func();
if (status != SUCCESS)
if (!test_rShift_16s_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_rShift_16s_speed();
if (status != SUCCESS)
if (!test_rShift_16s_speed())
return 1;
}
status = test_rShift_16u_func();
if (status != SUCCESS)
if (!test_rShift_16u_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_rShift_16u_speed();
if (status != SUCCESS)
if (!test_rShift_16u_speed())
return 1;
}
if (!test_ShiftWrapper_16s_func())
return 1;
if (!test_ShiftWrapper_16u_func())
return 1;
return 0;
}

View File

@ -19,103 +19,71 @@
#include <winpr/sysinfo.h>
#include "prim_test.h"
static const int SIGN_PRETEST_ITERATIONS = 100000;
static const float TEST_TIME = 1.0;
#define TEST_BUFFER_SIZE 65535
/* ------------------------------------------------------------------------- */
static int test_sign16s_func(void)
static BOOL test_sign16s_func(void)
{
INT16 ALIGN(src[65535]), ALIGN(d1[65535]);
#ifdef WITH_SSE2
INT16 ALIGN(d2[65535]);
int i;
#endif
int failed = 0;
char testStr[256];
/* Test when we can reach 16-byte alignment */
testStr[0] = '\0';
winpr_RAND(src, sizeof(src));
general_sign_16s(src + 1, d1 + 1, 65535);
#ifdef WITH_SSE2
pstatus_t status;
INT16 ALIGN(src[TEST_BUFFER_SIZE]);
INT16 ALIGN(d1[TEST_BUFFER_SIZE]);
INT16 ALIGN(d2[TEST_BUFFER_SIZE]);
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
{
strcat(testStr, " SSSE3");
ssse3_sign_16s(src + 1, d2 + 1, 65535);
winpr_RAND((BYTE*)src, sizeof(src));
for (i = 1; i < 65535; ++i)
{
if (d1[i] != d2[i])
{
printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n",
i, src[i], d1[i], d2[i]);
++failed;
}
}
}
status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
#endif /* i386 */
/* Test when we cannot reach 16-byte alignment */
winpr_RAND(src, sizeof(src));
general_sign_16s(src + 1, d1 + 2, 65535);
#ifdef WITH_SSE2
if (memcmp(d1, d2, sizeof(d1)) != 0)
return FALSE;
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
{
ssse3_sign_16s(src + 1, d2 + 2, 65535);
status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 2; i < 65535; ++i)
{
if (d1[i] != d2[i])
{
printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n",
i, src[i - 1], d1[i], d2[i]);
++failed;
}
}
}
if (memcmp(d1, d2, sizeof(d1)) != 0)
return FALSE;
#endif /* i386 */
if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
return TRUE;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst = dst,
TRUE, general_sign_16s(src1, dst, size),
#ifdef WITH_SSE2
TRUE, ssse3_sign_16s(src1, dst, size), PF_EX_SSSE3, TRUE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, dst = dst);
static int test_sign16s_speed(void)
{
INT16 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
winpr_RAND(src, sizeof(src));
sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
sign16s_speed_test("sign16s", "unaligned", src + 1, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
return SUCCESS;
winpr_RAND((BYTE*)src, sizeof(src));
if (!speed_test("sign16s", "aligned", g_Iterations,
(speed_test_fkt)generic->sign_16s,
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 1,
MAX_TEST_SIZE))
return FALSE;
if (!speed_test("sign16s", "unaligned", g_Iterations,
(speed_test_fkt)generic->sign_16s,
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 2,
MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
int TestPrimitivesSign(int argc, char* argv[])
{
int status;
status = test_sign16s_func();
prim_test_setup(FALSE);
if (status != SUCCESS)
if (!test_sign16s_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_sign16s_speed();
if (status != SUCCESS)
if (!test_sign16s_speed())
return 1;
}

View File

@ -23,105 +23,103 @@
#include <winpr/sysinfo.h>
#include "prim_test.h"
static const int YCOCG_TRIAL_ITERATIONS = 20000;
static const float TEST_TIME = 4.0;
/* ------------------------------------------------------------------------- */
int test_YCoCgRToRGB_8u_AC4R_func(void)
static BOOL test_YCoCgRToRGB_8u_AC4R_func(void)
{
#ifdef WITH_SSE2
int i;
BOOL result = TRUE;
pstatus_t status;
INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
#endif
INT32 ALIGN(in[4098]);
INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
char testStr[256];
BOOL failed = FALSE;
testStr[0] = '\0';
winpr_RAND(in, sizeof(in));
general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c, 63 * 4, 63, 61, 2, TRUE, FALSE);
general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
#ifdef WITH_SSE2
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
UINT32 i, x;
const UINT32 formats[] = {
PIXEL_FORMAT_ARGB32,
PIXEL_FORMAT_ABGR32,
PIXEL_FORMAT_RGBA32,
PIXEL_FORMAT_RGBX32,
PIXEL_FORMAT_BGRA32,
PIXEL_FORMAT_BGRX32
};
winpr_RAND((BYTE*)in, sizeof(in));
for (x=0; x<sizeof(formats)/sizeof(formats[0]); x++)
{
strcat(testStr, " SSSE3");
ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse, 63 * 4, 63, 61, 2, TRUE, FALSE);
UINT32 format = formats[x];
status = generic->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < 63 * 61; ++i)
{
if (out_c[i] != out_sse[i])
{
printf("YCoCgRToRGB-SSE FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i,
printf("optimized->YCoCgRToRGB FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n", i,
in[i + 1], out_c[i], out_sse[i]);
failed = TRUE;
result = FALSE;
}
}
ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
for (i = 0; i < 63 * 61; ++i)
{
if (out_c_inv[i] != out_sse_inv[i])
{
printf("YCoCgRToRGB-SSE inverted FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n",
printf("optimized->YCoCgRToRGB inverted FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n",
i,
in[i + 1], out_c_inv[i], out_sse_inv[i]);
failed = TRUE;
result = FALSE;
}
}
}
#endif /* i386 */
if (!failed) printf("All YCoCgRToRGB_8u_AC4R tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
return result;
}
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(
ycocg_to_rgb_speed, BYTE, BYTE, PRIM_NOP,
TRUE, general_YCoCgToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
FALSE),
#ifdef WITH_SSE2
TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
FALSE),
PF_EX_SSSE3, TRUE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, PRIM_NOP);
static int test_YCoCgRToRGB_8u_AC4R_speed(void)
{
INT32 ALIGN(in[4096]);
INT32 ALIGN(out[4096]);
int size_array[] = { 64 };
winpr_RAND(in, sizeof(in));
ycocg_to_rgb_speed("YCoCgToRGB", "aligned", (const BYTE*) in,
0, 0, (BYTE*) out,
size_array, 1, YCOCG_TRIAL_ITERATIONS, TEST_TIME);
return SUCCESS;
winpr_RAND((BYTE*)in, sizeof(in));
if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations,
(speed_test_fkt)generic->YCoCgToRGB_8u_AC4R,
(speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R,
in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE))
return FALSE;
return TRUE;
}
int TestPrimitivesYCoCg(int argc, char* argv[])
{
int status;
status = test_YCoCgRToRGB_8u_AC4R_func();
prim_test_setup(FALSE);
if (status != SUCCESS)
if (!test_YCoCgRToRGB_8u_AC4R_func())
return 1;
if (g_TestPrimitivesPerformance)
{
status = test_YCoCgRToRGB_8u_AC4R_speed();
if (status != SUCCESS)
if (!test_YCoCgRToRGB_8u_AC4R_speed())
return 1;
}

View File

@ -38,8 +38,8 @@ static void get_size(UINT32* width, UINT32* height)
winpr_RAND((BYTE*)width, sizeof(*width));
winpr_RAND((BYTE*)height, sizeof(*height));
// TODO: Algorithm only works on even resolutions...
*width = (*width % 4000) << 1;
*height = (*height % 4000 << 1);
*width = (*width % 64) << 1;
*height = (*height % 64 << 1);
}
static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding,
@ -370,11 +370,13 @@ static BOOL TestPrimitiveYUV(BOOL use444)
if (use444)
{
if (prims->RGBToYUV444_8u_P3AC4R(rgb, stride, yuv, yuv_step,
if (prims->RGBToYUV444_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
stride, yuv, yuv_step,
&roi) != PRIMITIVES_SUCCESS)
goto fail;
}
else if (prims->RGBToYUV420_8u_P3AC4R(rgb, stride, yuv, yuv_step,
else if (prims->RGBToYUV420_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
stride, yuv, yuv_step,
&roi) != PRIMITIVES_SUCCESS)
goto fail;
@ -429,16 +431,16 @@ int TestPrimitivesYUV(int argc, char* argv[])
UINT32 x;
int rc = -1;
prim_test_setup(FALSE);
for (x = 0; x < 10; x++)
{
/* TODO: This test fails on value comparison,
* there seems to be some issue left with encoder / decoder pass.
if (!TestPrimitiveYUV(FALSE))
goto end;
*/
if (!TestPrimitiveYUV(TRUE))
goto end;
if (!TestPrimitiveYUV(FALSE))
goto end;
if (!TestPrimitiveYUVCombine())
goto end;
}

View File

@ -43,13 +43,6 @@
extern int test_sizes[];
#define NUM_TEST_SIZES 10
#ifndef SUCCESS
#define SUCCESS 0
#endif
#ifndef FAILURE
#define FAILURE 1
#endif
extern BOOL g_TestPrimitivesPerformance;
extern UINT32 g_Iterations;
@ -58,8 +51,10 @@ extern primitives_t* optimized;
void prim_test_setup(BOOL performance);
typedef pstatus_t (*speed_test_fkt)();
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations,
pstatus_t (*generic)(), pstatus_t (*optimised)(),
speed_test_fkt generic, speed_test_fkt optimized,
...);
#endif // !__PRIMTEST_H_INCLUDED__