Updated primitives API and tests.

This commit is contained in:
Armin Novak 2016-07-13 14:04:48 +02:00
parent e860fde4bc
commit 99c418766c
31 changed files with 1194 additions and 1054 deletions

View File

@ -81,62 +81,62 @@ typedef pstatus_t (*__copy_8u_AC4r_t)(
typedef pstatus_t (*__set_8u_t)( typedef pstatus_t (*__set_8u_t)(
BYTE val, BYTE val,
BYTE* pDst, BYTE* pDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__set_32s_t)( typedef pstatus_t (*__set_32s_t)(
INT32 val, INT32 val,
INT32* pDst, INT32* pDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__set_32u_t)( typedef pstatus_t (*__set_32u_t)(
UINT32 val, UINT32 val,
UINT32* pDst, UINT32* pDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__zero_t)( typedef pstatus_t (*__zero_t)(
void* pDst, void* pDst,
size_t bytes); size_t bytes);
typedef pstatus_t (*__alphaComp_argb_t)( typedef pstatus_t (*__alphaComp_argb_t)(
const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, INT32 src2Step, const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, INT32 dstStep, BYTE* pDst, UINT32 dstStep,
INT32 width, INT32 height); UINT32 width, UINT32 height);
typedef pstatus_t (*__add_16s_t)( typedef pstatus_t (*__add_16s_t)(
const INT16* pSrc1, const INT16* pSrc1,
const INT16* pSrc2, const INT16* pSrc2,
INT16* pDst, INT16* pDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__lShiftC_16s_t)( typedef pstatus_t (*__lShiftC_16s_t)(
const INT16* pSrc, const INT16* pSrc,
INT32 val, UINT32 val,
INT16* pSrcDst, INT16* pSrcDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__lShiftC_16u_t)( typedef pstatus_t (*__lShiftC_16u_t)(
const UINT16* pSrc, const UINT16* pSrc,
INT32 val, UINT32 val,
UINT16* pSrcDst, UINT16* pSrcDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__rShiftC_16s_t)( typedef pstatus_t (*__rShiftC_16s_t)(
const INT16* pSrc, const INT16* pSrc,
INT32 val, UINT32 val,
INT16* pSrcDst, INT16* pSrcDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__rShiftC_16u_t)( typedef pstatus_t (*__rShiftC_16u_t)(
const UINT16* pSrc, const UINT16* pSrc,
INT32 val, UINT32 val,
UINT16* pSrcDst, UINT16* pSrcDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__shiftC_16s_t)( typedef pstatus_t (*__shiftC_16s_t)(
const INT16* pSrc, const INT16* pSrc,
INT32 val, INT32 val,
INT16* pSrcDst, INT16* pSrcDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__shiftC_16u_t)( typedef pstatus_t (*__shiftC_16u_t)(
const UINT16* pSrc, const UINT16* pSrc,
INT32 val, INT32 val,
UINT16* pSrcDst, UINT16* pSrcDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__sign_16s_t)( typedef pstatus_t (*__sign_16s_t)(
const INT16* pSrc, const INT16* pSrc,
INT16* pDst, INT16* pDst,
INT32 len); UINT32 len);
typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)( typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(
const INT16* pSrc[3], INT32 srcStep, const INT16* pSrc[3], INT32 srcStep,
BYTE* pDst, INT32 dstStep, UINT32 DstFormat, BYTE* pDst, INT32 dstStep, UINT32 DstFormat,
@ -154,8 +154,8 @@ typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)(
INT16* pDst[3], INT32 dstStep, INT16* pDst[3], INT32 dstStep,
const prim_size_t* roi); const prim_size_t* roi);
typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)( typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
const INT16* pSrc[3], INT32 srcStep, const INT16* const pSrc[3], UINT32 srcStep,
BYTE* pDst, INT32 dstStep, UINT32 DstFormat, BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* roi); const prim_size_t* roi);
typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)( typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)(
const BYTE* pSrc, INT32 srcStep, const BYTE* pSrc, INT32 srcStep,
@ -177,11 +177,11 @@ typedef pstatus_t (*__YUV444ToRGB_8u_P3AC4R_t)(
BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* roi); const prim_size_t* roi);
typedef pstatus_t (*__RGBToYUV420_8u_P3AC4R_t)( typedef pstatus_t (*__RGBToYUV420_8u_P3AC4R_t)(
const BYTE* pSrc, UINT32 srcStep, const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3], BYTE* pDst[3], UINT32 dstStep[3],
const prim_size_t* roi); const prim_size_t* roi);
typedef pstatus_t (*__RGBToYUV444_8u_P3AC4R_t)( typedef pstatus_t (*__RGBToYUV444_8u_P3AC4R_t)(
const BYTE* pSrc, UINT32 srcStep, const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3], BYTE* pDst[3], UINT32 dstStep[3],
const prim_size_t* roi); const prim_size_t* roi);
typedef pstatus_t (*__YUV420CombineToYUV444_t)( typedef pstatus_t (*__YUV420CombineToYUV444_t)(

View File

@ -1567,7 +1567,7 @@ INT32 avc420_compress(H264_CONTEXT* h264, BYTE* pSrcData, DWORD SrcFormat,
roi.width = nSrcWidth; roi.width = nSrcWidth;
roi.height = nSrcHeight; roi.height = nSrcHeight;
prims->RGBToYUV420_8u_P3AC4R(pSrcData, nSrcStep, pYUVData, iStride, &roi); prims->RGBToYUV420_8u_P3AC4R(pSrcData, SrcFormat, nSrcStep, pYUVData, iStride, &roi);
status = h264->subsystem->Compress(h264, ppDstData, pDstSize, 0); status = h264->subsystem->Compress(h264, ppDstData, pDstSize, 0);

View File

@ -33,11 +33,11 @@
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t general_YCoCgToRGB_8u_AC4R( static pstatus_t general_YCoCgToRGB_8u_AC4R(
const BYTE* pSrc, INT32 srcStep, const BYTE* pSrc, INT32 srcStep,
BYTE* pDst, UINT32 DstFormat, INT32 dstStep, BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height, UINT32 width, UINT32 height,
UINT8 shift, UINT8 shift,
BOOL withAlpha) BOOL withAlpha)
{ {
BYTE A; BYTE A;
UINT32 x, y; UINT32 x, y;
@ -66,8 +66,11 @@ static pstatus_t general_YCoCgToRGB_8u_AC4R(
R = T + Co; R = T + Co;
G = Y + Cg; G = Y + Cg;
B = T - Co; B = T - Co;
color = GetColor(DstFormat, MINMAX(R, 0, 255), MINMAX(G, 0, 255), MINMAX(B, 0,
255), A); color = GetColor(DstFormat,
MINMAX(R, 0, 255), MINMAX(G, 0, 255),
MINMAX(B, 0, 255), A);
WriteColor(dptr, DstFormat, color);
dptr += GetBytesPerPixel(DstFormat); dptr += GetBytesPerPixel(DstFormat);
} }

View File

@ -40,8 +40,8 @@ static primitives_t* generic = NULL;
#ifdef WITH_SSE2 #ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
const BYTE* pSrc, INT32 srcStep, const BYTE* pSrc, UINT32 srcStep,
BYTE* pDst, INT32 dstStep, BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
UINT32 width, UINT32 height, UINT32 width, UINT32 height,
UINT8 shift, UINT8 shift,
BOOL withAlpha) BOOL withAlpha)
@ -70,8 +70,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
{ {
/* Too small, or we'll never hit a 16-byte boundary. Punt. */ /* Too small, or we'll never hit a 16-byte boundary. Punt. */
return generic->YCoCgToRGB_8u_AC4R( return generic->YCoCgToRGB_8u_AC4R(
pSrc, srcStep, pDst, dstStep, pSrc, srcStep, pDst, DstFormat, dstStep,
width, height, shift, withAlpha, TRUE); width, height, shift, withAlpha);
} }
for (h = 0; h < height; h++) for (h = 0; h < height; h++)
@ -82,12 +82,16 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
/* Get to a 16-byte destination boundary. */ /* Get to a 16-byte destination boundary. */
if ((ULONG_PTR) dptr & 0x0f) if ((ULONG_PTR) dptr & 0x0f)
{ {
pstatus_t status;
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4; int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
if (startup > width) startup = width; if (startup > width) startup = width;
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, status = generic->YCoCgToRGB_8u_AC4R(
startup, 1, shift, withAlpha, TRUE); sptr, srcStep, dptr, DstFormat, dstStep,
startup, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += startup * sizeof(UINT32); sptr += startup * sizeof(UINT32);
dptr += startup * sizeof(UINT32); dptr += startup * sizeof(UINT32);
w -= startup; w -= startup;
@ -195,8 +199,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
/* Handle any remainder pixels. */ /* Handle any remainder pixels. */
if (w > 0) if (w > 0)
{ {
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, pstatus_t status;
w, 1, shift, withAlpha, TRUE); status = generic->YCoCgToRGB_8u_AC4R(
sptr, srcStep, dptr, DstFormat, dstStep,
w, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += w * sizeof(UINT32); sptr += w * sizeof(UINT32);
dptr += w * sizeof(UINT32); dptr += w * sizeof(UINT32);
} }
@ -210,8 +219,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
const BYTE* pSrc, INT32 srcStep, const BYTE* pSrc, UINT32 srcStep,
BYTE* pDst, INT32 dstStep, BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
UINT32 width, UINT32 height, UINT32 width, UINT32 height,
UINT8 shift, UINT8 shift,
BOOL withAlpha) BOOL withAlpha)
@ -240,9 +249,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
{ {
/* Too small, or we'll never hit a 16-byte boundary. Punt. */ /* Too small, or we'll never hit a 16-byte boundary. Punt. */
return generic->YCoCgToRGB_8u_AC4R( return generic->YCoCgToRGB_8u_AC4R(
pSrc, srcStep, pSrc, srcStep, pDst, DstFormat, dstStep,
pDst, dstStep, width, height, shift, width, height, shift, withAlpha);
withAlpha, FALSE);
} }
for (h = 0; h < height; h++) for (h = 0; h < height; h++)
@ -253,12 +261,17 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
/* Get to a 16-byte destination boundary. */ /* Get to a 16-byte destination boundary. */
if ((ULONG_PTR) dptr & 0x0f) if ((ULONG_PTR) dptr & 0x0f)
{ {
pstatus_t status;
int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4; int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
if (startup > width) startup = width; if (startup > width) startup = width;
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, status = generic->YCoCgToRGB_8u_AC4R(
startup, 1, shift, withAlpha, FALSE); sptr, srcStep, dptr, DstFormat,
dstStep, startup, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += startup * sizeof(UINT32); sptr += startup * sizeof(UINT32);
dptr += startup * sizeof(UINT32); dptr += startup * sizeof(UINT32);
w -= startup; w -= startup;
@ -370,8 +383,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
/* Handle any remainder pixels. */ /* Handle any remainder pixels. */
if (w > 0) if (w > 0)
{ {
generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, pstatus_t status;
w, 1, shift, withAlpha, FALSE); status = generic->YCoCgToRGB_8u_AC4R(
sptr, srcStep, dptr, DstFormat, dstStep,
w, 1, shift, withAlpha);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr += w * sizeof(UINT32); sptr += w * sizeof(UINT32);
dptr += w * sizeof(UINT32); dptr += w * sizeof(UINT32);
} }
@ -388,21 +406,29 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R( static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
const BYTE* pSrc, INT32 srcStep, const BYTE* pSrc, INT32 srcStep,
BYTE* pDst, INT32 dstStep, BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
UINT32 width, UINT32 height, UINT32 width, UINT32 height,
UINT8 shift, UINT8 shift,
BOOL withAlpha, BOOL withAlpha)
BOOL invert)
{ {
if (invert) // TODO: Need to implement proper color conversion!!!
return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat,
dstStep, width, height, shift, withAlpha);
switch(DstFormat)
{ {
return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, dstStep, case PIXEL_FORMAT_BGRX32:
width, height, shift, withAlpha); case PIXEL_FORMAT_BGRA32:
} return ssse3_YCoCgRToRGB_8u_AC4R_invert(
else pSrc, srcStep, pDst, DstFormat, dstStep,
{ width, height, shift, withAlpha);
return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, dstStep, case PIXEL_FORMAT_RGBX32:
width, height, shift, withAlpha); case PIXEL_FORMAT_RGBA32:
return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
pSrc, srcStep, pDst, DstFormat, dstStep,
width, height, shift, withAlpha);
default:
return -1;
} }
} }
#endif /* WITH_SSE2 */ #endif /* WITH_SSE2 */

View File

@ -232,9 +232,9 @@ static pstatus_t general_YUV444SplitToYUV420(
{ {
/* Filter */ /* Filter */
const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x] const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x]
+ pSrcU1[2 * x + 1]; + pSrcU1[2 * x + 1];
const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x] const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x]
+ pSrcV1[2 * x + 1]; + pSrcV1[2 * x + 1];
pU[x] = CLIP(u / 4L); pU[x] = CLIP(u / 4L);
pV[x] = CLIP(v / 4L); pV[x] = CLIP(v / 4L);
} }
@ -331,7 +331,7 @@ static INLINE BYTE* writePixel(BYTE* dst, UINT32 format, BYTE Y, BYTE U, BYTE V)
const BYTE r = YUV2R(Y, U, V); const BYTE r = YUV2R(Y, U, V);
const BYTE g = YUV2G(Y, U, V); const BYTE g = YUV2G(Y, U, V);
const BYTE b = YUV2B(Y, U, V); const BYTE b = YUV2B(Y, U, V);
UINT32 color = GetColor(format, r, g, b, 0); UINT32 color = GetColor(format, r, g, b, 0xFF);
WriteColor(dst, format, color); WriteColor(dst, format, color);
return dst + GetBytesPerPixel(format); return dst + GetBytesPerPixel(format);
} }
@ -500,9 +500,10 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
} }
static pstatus_t general_RGBToYUV444_8u_P3AC4R( static pstatus_t general_RGBToYUV444_8u_P3AC4R(
const BYTE* pSrc, const UINT32 srcStep, const BYTE* pSrc, UINT32 SrcFormat, const UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi) BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
{ {
const UINT32 bpp = GetBytesPerPixel(SrcFormat);
UINT32 x, y; UINT32 x, y;
UINT32 nWidth, nHeight; UINT32 nWidth, nHeight;
nWidth = roi->width; nWidth = roi->width;
@ -517,9 +518,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
for (x = 0; x < nWidth; x++) for (x = 0; x < nWidth; x++)
{ {
const BYTE B = pRGB[4 * x + 0]; BYTE B, G, R;
const BYTE G = pRGB[4 * x + 1]; const UINT32 color = ReadColor(&pRGB[x * bpp], SrcFormat);
const BYTE R = pRGB[4 * x + 2]; SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
pY[x] = RGB2Y(R, G, B); pY[x] = RGB2Y(R, G, B);
pU[x] = RGB2U(R, G, B); pU[x] = RGB2U(R, G, B);
pV[x] = RGB2V(R, G, B); pV[x] = RGB2V(R, G, B);
@ -530,9 +532,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
} }
static pstatus_t general_RGBToYUV420_8u_P3AC4R( static pstatus_t general_RGBToYUV420_8u_P3AC4R(
const BYTE* pSrc, UINT32 srcStep, const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi) BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
{ {
const UINT32 bpp = GetBytesPerPixel(SrcFormat);
UINT32 x, y; UINT32 x, y;
UINT32 halfWidth; UINT32 halfWidth;
UINT32 halfHeight; UINT32 halfHeight;
@ -555,39 +558,50 @@ static pstatus_t general_RGBToYUV420_8u_P3AC4R(
for (x = 0; x < halfWidth; x++) for (x = 0; x < halfWidth; x++)
{ {
INT32 R, G, B; UINT32 color;
INT32 Ra, Ga, Ba; INT32 Ra, Ga, Ba;
const UINT32 val2x = (x * 2); const UINT32 val2x = (x * 2);
const UINT32 val2x1 = val2x + 1; const UINT32 val2x1 = val2x + 1;
BYTE B, G, R;
/* 1st pixel */ /* 1st pixel */
Ba = B = pRGB[val2x * 4 + 0]; color = ReadColor(&pRGB[val2x * bpp], SrcFormat);
Ga = G = pRGB[val2x * 4 + 1]; SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ra = R = pRGB[val2x * 4 + 2];
Ba = B;
Ga = G;
Ra = R;
pY[val2x] = RGB2Y(R, G, B); pY[val2x] = RGB2Y(R, G, B);
if (val2x1 < nWidth) if (val2x1 < nWidth)
{ {
/* 2nd pixel */ /* 2nd pixel */
Ba += B = pRGB[val2x * 4 + 4]; color = ReadColor(&pRGB[val2x1 * bpp], SrcFormat);
Ga += G = pRGB[val2x * 4 + 5]; SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ra += R = pRGB[val2x * 4 + 6]; Ba += B;
Ga += G;
Ra += R;
pY[val2x1] = RGB2Y(R, G, B); pY[val2x1] = RGB2Y(R, G, B);
} }
if (val2y1 < nHeight) if (val2y1 < nHeight)
{ {
/* 3rd pixel */ /* 3rd pixel */
Ba += B = pRGB1[val2x * 4 + 0]; color = ReadColor(&pRGB1[val2x * bpp], SrcFormat);
Ga += G = pRGB1[val2x * 4 + 1]; SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ra += R = pRGB1[val2x * 4 + 2]; Ba += B;
Ga += G;
Ra += R;
pY1[val2x] = RGB2Y(R, G, B); pY1[val2x] = RGB2Y(R, G, B);
if (val2x1 < nWidth) if (val2x1 < nWidth)
{ {
/* 4th pixel */ /* 4th pixel */
Ba += B = pRGB1[val2x * 4 + 4]; color = ReadColor(&pRGB1[val2x1 * bpp], SrcFormat);
Ga += G = pRGB1[val2x * 4 + 5]; SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
Ra += R = pRGB1[val2x * 4 + 6]; Ba += B;
Ga += G;
Ra += R;
pY1[val2x1] = RGB2Y(R, G, B); pY1[val2x1] = RGB2Y(R, G, B);
} }
} }

View File

@ -35,6 +35,11 @@ static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(
UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV; UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV;
__m128i r0, r1, r2, r3, r4, r5, r6, r7; __m128i r0, r1, r2, r3, r4, r5, r6, r7;
__m128i* buffer; __m128i* buffer;
// TODO: Need to implement proper color conversion!!!!!
return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep,
DstFormat, roi);
/* last_line: if the last (U,V doubled) line should be skipped, set to 10B /* last_line: if the last (U,V doubled) line should be skipped, set to 10B
* last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */
buffer = _aligned_malloc(4 * 16, 16); buffer = _aligned_malloc(4 * 16, 16);

View File

@ -30,7 +30,7 @@ static pstatus_t general_add_16s(
const INT16* pSrc1, const INT16* pSrc1,
const INT16* pSrc2, const INT16* pSrc2,
INT16* pDst, INT16* pDst,
INT32 len) UINT32 len)
{ {
while (len--) while (len--)
{ {

View File

@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) # if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s,
_mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1)) _mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1))
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ # endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif #endif

View File

@ -36,23 +36,19 @@
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t general_alphaComp_argb( static pstatus_t general_alphaComp_argb(
const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, INT32 src2Step, const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, INT32 dstStep, BYTE* pDst, UINT32 dstStep,
INT32 width, INT32 height) UINT32 width, UINT32 height)
{ {
const UINT32* sptr1 = (const UINT32*) pSrc1; UINT32 y;
const UINT32* sptr2 = (const UINT32*) pSrc2;
UINT32* dptr = (UINT32*) pDst;
int linebytes = width * sizeof(UINT32);
int src1Jump = (src1Step - linebytes) / sizeof(UINT32);
int src2Jump = (src2Step - linebytes) / sizeof(UINT32);
int dstJump = (dstStep - linebytes) / sizeof(UINT32);
int y;
for (y = 0; y < height; y++) for (y = 0; y < height; y++)
{ {
int x; const UINT32* sptr1 = (const UINT32*) (pSrc1 + y * src1Step);
const UINT32* sptr2 = (const UINT32*) (pSrc2 + y * src2Step);
UINT32* dptr = (UINT32*) (pDst + y * dstStep);
UINT32 x;
for (x = 0; x < width; x++) for (x = 0; x < width; x++)
{ {
@ -92,10 +88,6 @@ static pstatus_t general_alphaComp_argb(
*dptr++ = rb | ag; *dptr++ = rb | ag;
} }
} }
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
} }
return PRIMITIVES_SUCCESS; return PRIMITIVES_SUCCESS;

View File

@ -46,10 +46,10 @@ static primitives_t* generic = NULL;
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) #if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
pstatus_t sse2_alphaComp_argb( pstatus_t sse2_alphaComp_argb(
const BYTE* pSrc1, INT32 src1Step, const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, INT32 src2Step, const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, INT32 dstStep, BYTE* pDst, UINT32 dstStep,
INT32 width, INT32 height) UINT32 width, UINT32 height)
{ {
const UINT32* sptr1 = (const UINT32*) pSrc1; const UINT32* sptr1 = (const UINT32*) pSrc1;
const UINT32* sptr2 = (const UINT32*) pSrc2; const UINT32* sptr2 = (const UINT32*) pSrc2;
@ -62,7 +62,7 @@ pstatus_t sse2_alphaComp_argb(
if (width < 4) /* pointless if too small */ if (width < 4) /* pointless if too small */
{ {
return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, width, height); pDst, dstStep, width, height);
} }
dptr = (UINT32*) pDst; dptr = (UINT32*) pDst;
@ -108,9 +108,13 @@ pstatus_t sse2_alphaComp_argb(
if (leadIn) if (leadIn)
{ {
generic->alphaComp_argb((const BYTE*) sptr1, pstatus_t status;
src1Step, (const BYTE*) sptr2, src2Step, status = generic->alphaComp_argb((const BYTE*) sptr1,
(BYTE*) dptr, dstStep, leadIn, 1); src1Step, (const BYTE*) sptr2, src2Step,
(BYTE*) dptr, dstStep, leadIn, 1);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr1 += leadIn; sptr1 += leadIn;
sptr2 += leadIn; sptr2 += leadIn;
dptr += leadIn; dptr += leadIn;
@ -181,9 +185,13 @@ pstatus_t sse2_alphaComp_argb(
/* Finish off the remainder. */ /* Finish off the remainder. */
if (pixels) if (pixels)
{ {
generic->alphaComp_argb((const BYTE*) sptr1, src1Step, pstatus_t status;
(const BYTE*) sptr2, src2Step, status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
(BYTE*) dptr, dstStep, pixels, 1); (const BYTE*) sptr2, src2Step,
(BYTE*) dptr, dstStep, pixels, 1);
if (status != PRIMITIVES_SUCCESS)
return status;
sptr1 += pixels; sptr1 += pixels;
sptr2 += pixels; sptr2 += pixels;
dptr += pixels; dptr += pixels;
@ -212,7 +220,7 @@ static pstatus_t ipp_alphaComp_argb(
sz.width = width; sz.width = width;
sz.height = height; sz.height = height;
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, sz, ippAlphaOver); pDst, dstStep, sz, ippAlphaOver);
} }
#endif #endif

View File

@ -262,7 +262,7 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
for (y = 0; y < roi->height; y++) for (y = 0; y < roi->height; y++)
{ {
int x; UINT32 x;
for (x = 0; x < roi->width; ++x) for (x = 0; x < roi->width; ++x)
{ {
@ -305,10 +305,10 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t general_RGBToRGB_16s8u_P3AC4R( static pstatus_t general_RGBToRGB_16s8u_P3AC4R(
const INT16* pSrc[3], /* 16-bit R,G, and B arrays */ const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */ UINT32 srcStep, /* bytes between rows in source data */
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */ BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */ UINT32 dstStep, /* bytes between rows in dest data */
UINT32 DstFormat, UINT32 DstFormat,
const prim_size_t* roi) /* region of interest */ const prim_size_t* roi) /* region of interest */
{ {

View File

@ -91,7 +91,7 @@ static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
{ {
/* We can't maintain 16-byte alignment. */ /* We can't maintain 16-byte alignment. */
return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi); pDst, dstStep, roi);
} }
zero = _mm_setzero_si128(); zero = _mm_setzero_si128();
@ -228,7 +228,7 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
{ {
/* We can't maintain 16-byte alignment. */ /* We can't maintain 16-byte alignment. */
return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi); pDst, dstStep, roi);
} }
min = _mm_set1_epi16(-128 * 32); min = _mm_set1_epi16(-128 * 32);
@ -357,10 +357,10 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
_mm_set1_epi32(0xFFFFFFFFU) _mm_set1_epi32(0xFFFFFFFFU)
pstatus_t sse2_RGBToRGB_16s8u_P3AC4R( pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
const INT16* pSrc[3], /* 16-bit R,G, and B arrays */ const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */ UINT32 srcStep, /* bytes between rows in source data */
BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */ BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */ UINT32 dstStep, /* bytes between rows in dest data */
UINT32 DstFormat, UINT32 DstFormat,
const prim_size_t* roi) /* region of interest */ const prim_size_t* roi) /* region of interest */
{ {
@ -385,9 +385,13 @@ pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
|| (dstStep & 0x0f)) || (dstStep & 0x0f))
{ {
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
dstStep, DstFormat, roi); dstStep, DstFormat, roi);
} }
// TODO: Need to update SSE code to allow color conversion!!!
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
dstStep, DstFormat, roi);
out = (BYTE*) pDst; out = (BYTE*) pDst;
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
dstbump = (dstStep - (roi->width * sizeof(UINT32))); dstbump = (dstStep - (roi->width * sizeof(UINT32)));

View File

@ -29,7 +29,7 @@
static pstatus_t general_set_8u( static pstatus_t general_set_8u(
BYTE val, BYTE val,
BYTE* pDst, BYTE* pDst,
INT32 len) UINT32 len)
{ {
memset((void*) pDst, (int) val, (size_t) len); memset((void*) pDst, (int) val, (size_t) len);
return PRIMITIVES_SUCCESS; return PRIMITIVES_SUCCESS;
@ -48,7 +48,7 @@ static pstatus_t general_zero(
static pstatus_t general_set_32s( static pstatus_t general_set_32s(
INT32 val, INT32 val,
INT32* pDst, INT32* pDst,
INT32 len) UINT32 len)
{ {
INT32* dptr = (INT32*) pDst; INT32* dptr = (INT32*) pDst;
size_t span, remaining; size_t span, remaining;
@ -85,7 +85,7 @@ static pstatus_t general_set_32s(
static pstatus_t general_set_32u( static pstatus_t general_set_32u(
UINT32 val, UINT32 val,
UINT32* pDst, UINT32* pDst,
INT32 len) UINT32 len)
{ {
UINT32* dptr = (UINT32*) pDst; UINT32* dptr = (UINT32*) pDst;
size_t span, remaining; size_t span, remaining;

View File

@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
static pstatus_t sse2_set_8u( static pstatus_t sse2_set_8u(
BYTE val, BYTE val,
BYTE* pDst, BYTE* pDst,
INT32 len) UINT32 len)
{ {
BYTE byte, *dptr; BYTE byte, *dptr;
__m128i xmm0; __m128i xmm0;
@ -126,7 +126,7 @@ static pstatus_t sse2_set_8u(
static pstatus_t sse2_set_32u( static pstatus_t sse2_set_32u(
UINT32 val, UINT32 val,
UINT32* pDst, UINT32* pDst,
INT32 len) UINT32 len)
{ {
const primitives_t* prim = primitives_get_generic(); const primitives_t* prim = primitives_get_generic();
UINT32* dptr = (UINT32*) pDst; UINT32* dptr = (UINT32*) pDst;
@ -218,7 +218,7 @@ static pstatus_t sse2_set_32u(
static pstatus_t sse2_set_32s( static pstatus_t sse2_set_32s(
INT32 val, INT32 val,
INT32* pDst, INT32* pDst,
INT32 len) UINT32 len)
{ {
UINT32 uval = *((UINT32*) &val); UINT32 uval = *((UINT32*) &val);
return sse2_set_32u(uval, (UINT32*) pDst, len); return sse2_set_32u(uval, (UINT32*) pDst, len);

View File

@ -24,9 +24,9 @@
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t general_lShiftC_16s( static pstatus_t general_lShiftC_16s(
const INT16* pSrc, const INT16* pSrc,
INT32 val, UINT32 val,
INT16* pDst, INT16* pDst,
INT32 len) UINT32 len)
{ {
if (val == 0) return PRIMITIVES_SUCCESS; if (val == 0) return PRIMITIVES_SUCCESS;
@ -38,9 +38,9 @@ static pstatus_t general_lShiftC_16s(
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t general_rShiftC_16s( static pstatus_t general_rShiftC_16s(
const INT16* pSrc, const INT16* pSrc,
INT32 val, UINT32 val,
INT16* pDst, INT16* pDst,
INT32 len) UINT32 len)
{ {
if (val == 0) return PRIMITIVES_SUCCESS; if (val == 0) return PRIMITIVES_SUCCESS;
@ -52,9 +52,9 @@ static pstatus_t general_rShiftC_16s(
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t general_lShiftC_16u( static pstatus_t general_lShiftC_16u(
const UINT16* pSrc, const UINT16* pSrc,
INT32 val, UINT32 val,
UINT16* pDst, UINT16* pDst,
INT32 len) UINT32 len)
{ {
if (val == 0) return PRIMITIVES_SUCCESS; if (val == 0) return PRIMITIVES_SUCCESS;
@ -66,9 +66,9 @@ static pstatus_t general_lShiftC_16u(
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static pstatus_t general_rShiftC_16u( static pstatus_t general_rShiftC_16u(
const UINT16* pSrc, const UINT16* pSrc,
INT32 val, UINT32 val,
UINT16* pDst, UINT16* pDst,
INT32 len) UINT32 len)
{ {
if (val == 0) return PRIMITIVES_SUCCESS; if (val == 0) return PRIMITIVES_SUCCESS;
@ -82,7 +82,7 @@ static pstatus_t general_shiftC_16s(
const INT16* pSrc, const INT16* pSrc,
INT32 val, INT32 val,
INT16* pDst, INT16* pDst,
INT32 len) UINT32 len)
{ {
if (val == 0) return PRIMITIVES_SUCCESS; if (val == 0) return PRIMITIVES_SUCCESS;
@ -95,7 +95,7 @@ static pstatus_t general_shiftC_16u(
const UINT16* pSrc, const UINT16* pSrc,
INT32 val, INT32 val,
UINT16* pDst, UINT16* pDst,
INT32 len) UINT32 len)
{ {
if (val == 0) return PRIMITIVES_SUCCESS; if (val == 0) return PRIMITIVES_SUCCESS;

View File

@ -39,16 +39,16 @@ static primitives_t* generic = NULL;
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) # if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
_mm_slli_epi16, *dptr++ = *sptr++ << val) _mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s,
_mm_srai_epi16, *dptr++ = *sptr++ >> val) _mm_srai_epi16, *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u,
_mm_slli_epi16, *dptr++ = *sptr++ << val) _mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u,
_mm_srli_epi16, *dptr++ = *sptr++ >> val) _mm_srli_epi16, *dptr++ = *sptr++ >> val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ # endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif #endif

View File

@ -28,7 +28,7 @@
static pstatus_t general_sign_16s( static pstatus_t general_sign_16s(
const INT16* pSrc, const INT16* pSrc,
INT16* pDst, INT16* pDst,
INT32 len) UINT32 len)
{ {
while (len--) while (len--)
{ {

View File

@ -35,7 +35,7 @@ static primitives_t* generic = NULL;
static pstatus_t ssse3_sign_16s( static pstatus_t ssse3_sign_16s(
const INT16* pSrc, const INT16* pSrc,
INT16* pDst, INT16* pDst,
INT32 len) UINT32 len)
{ {
const INT16* sptr = (const INT16*) pSrc; const INT16* sptr = (const INT16*) pSrc;
INT16* dptr = (INT16*) pDst; INT16* dptr = (INT16*) pDst;

View File

@ -44,143 +44,143 @@
* SCD = Source, Constant, Destination * SCD = Source, Constant, Destination
*/ */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ #define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
static pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \ static pstatus_t _name_(const _type_ *pSrc, UINT32 val, _type_ *pDst, UINT32 len) \
{ \ { \
INT32 shifts; \ INT32 shifts; \
UINT32 offBeatMask; \ UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \ const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \ _type_ *dptr = pDst; \
size_t count; \ size_t count; \
if (len < 16) /* pointless if too small */ \ if (len < 16) /* pointless if too small */ \
{ \ { \
return _fallback_(pSrc, val, pDst, len); \ return _fallback_(pSrc, val, pDst, len); \
} \ } \
if (sizeof(_type_) == 1) shifts = 1; \ if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \ else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \ else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \ else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \ offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \ if ((ULONG_PTR) pDst & offBeatMask) \
{ \ { \
/* Incrementing the pointer skips over 16-byte boundary. */ \ /* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \ return _fallback_(pSrc, val, pDst, len); \
} \ } \
/* Get to the 16-byte boundary now. */ \ /* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \ while ((ULONG_PTR) dptr & 0x0f) \
{ \ { \
_slowWay_; \ _slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \ if (--len == 0) return PRIMITIVES_SUCCESS; \
} \ } \
/* Use 8 128-bit SSE registers. */ \ /* Use 8 128-bit SSE registers. */ \
count = len >> (8-shifts); \ count = len >> (8-shifts); \
len -= count << (8-shifts); \ len -= count << (8-shifts); \
if ((ULONG_PTR) sptr & 0x0f) \ if ((ULONG_PTR) sptr & 0x0f) \
{ \ { \
while (count--) \ while (count--) \
{ \ { \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_lddqu_si128((__m128i *) sptr); \ xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr); \ xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr); \ xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr); \ xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \ xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \ xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \ xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \ xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \ xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \ xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \ xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \ xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \ _mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \ _mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \ _mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \ _mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \ _mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \ _mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \ _mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \ _mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ } \
} \ } \
else \ else \
{ \ { \
while (count--) \ while (count--) \
{ \ { \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm0 = _mm_load_si128((__m128i *) sptr); \ xmm0 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm1 = _mm_load_si128((__m128i *) sptr); \ xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \ xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \ xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \ xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr); \ xmm5 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr); \ xmm6 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr); \ xmm7 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \ xmm0 = _op_(xmm0, val); \
xmm1 = _op_(xmm1, val); \ xmm1 = _op_(xmm1, val); \
xmm2 = _op_(xmm2, val); \ xmm2 = _op_(xmm2, val); \
xmm3 = _op_(xmm3, val); \ xmm3 = _op_(xmm3, val); \
xmm4 = _op_(xmm4, val); \ xmm4 = _op_(xmm4, val); \
xmm5 = _op_(xmm5, val); \ xmm5 = _op_(xmm5, val); \
xmm6 = _op_(xmm6, val); \ xmm6 = _op_(xmm6, val); \
xmm7 = _op_(xmm7, val); \ xmm7 = _op_(xmm7, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \ _mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm1); \ _mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \ _mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \ _mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \ _mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm5); \ _mm_store_si128((__m128i *) dptr, xmm5); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm6); \ _mm_store_si128((__m128i *) dptr, xmm6); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm7); \ _mm_store_si128((__m128i *) dptr, xmm7); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ } \
} \ } \
/* Use a single 128-bit SSE register. */ \ /* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \ count = len >> (5-shifts); \
len -= count << (5-shifts); \ len -= count << (5-shifts); \
while (count--) \ while (count--) \
{ \ { \
__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ __m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, val); \ xmm0 = _op_(xmm0, val); \
_mm_store_si128((__m128i *) dptr, xmm0); \ _mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ } \
/* Finish off the remainder. */ \ /* Finish off the remainder. */ \
while (len--) { _slowWay_; } \ while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \ return PRIMITIVES_SUCCESS; \
} }
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
@ -189,228 +189,230 @@
*/ */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ #define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \ pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
{ \ { \
int shifts; \ int shifts; \
UINT32 offBeatMask; \ UINT32 offBeatMask; \
const _type_ *sptr = pSrc; \ const _type_ *sptr = pSrc; \
_type_ *dptr = pDst; \ _type_ *dptr = pDst; \
size_t count; \ size_t count; \
__m128i xmm0; \ __m128i xmm0; \
if (len < 16) /* pointless if too small */ \ if (len < 16) /* pointless if too small */ \
{ \ { \
return _fallback_(pSrc, val, pDst, len); \ return _fallback_(pSrc, val, pDst, len); \
} \ } \
if (sizeof(_type_) == 1) shifts = 1; \ if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \ else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \ else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \ else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \ offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \ if ((ULONG_PTR) pDst & offBeatMask) \
{ \ { \
/* Incrementing the pointer skips over 16-byte boundary. */ \ /* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc, val, pDst, len); \ return _fallback_(pSrc, val, pDst, len); \
} \ } \
/* Get to the 16-byte boundary now. */ \ /* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \ while ((ULONG_PTR) dptr & 0x0f) \
{ \ { \
_slowWay_; \ _slowWay_; \
if (--len == 0) return PRIMITIVES_SUCCESS; \ if (--len == 0) return PRIMITIVES_SUCCESS; \
} \ } \
/* Use 4 128-bit SSE registers. */ \ /* Use 4 128-bit SSE registers. */ \
count = len >> (7-shifts); \ count = len >> (7-shifts); \
len -= count << (7-shifts); \ len -= count << (7-shifts); \
xmm0 = _mm_set1_epi32(val); \ xmm0 = _mm_set1_epi32(val); \
if ((ULONG_PTR) sptr & 0x0f) \ if ((ULONG_PTR) sptr & 0x0f) \
{ \ { \
while (count--) \ while (count--) \
{ \ { \
__m128i xmm1, xmm2, xmm3, xmm4; \ __m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \ xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \ xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \ xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \ xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \ _mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \ _mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \ _mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \ _mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ } \
} \ } \
else \ else \
{ \ { \
while (count--) \ while (count--) \
{ \ { \
__m128i xmm1, xmm2, xmm3, xmm4; \ __m128i xmm1, xmm2, xmm3, xmm4; \
xmm1 = _mm_load_si128((__m128i *) sptr); \ xmm1 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr); \ xmm2 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr); \ xmm3 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr); \ xmm4 = _mm_load_si128((__m128i *) sptr); \
sptr += (16/sizeof(_type_)); \ sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \ xmm1 = _op_(xmm1, xmm0); \
xmm2 = _op_(xmm2, xmm0); \ xmm2 = _op_(xmm2, xmm0); \
xmm3 = _op_(xmm3, xmm0); \ xmm3 = _op_(xmm3, xmm0); \
xmm4 = _op_(xmm4, xmm0); \ xmm4 = _op_(xmm4, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \ _mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \ _mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \ _mm_store_si128((__m128i *) dptr, xmm3); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm4); \ _mm_store_si128((__m128i *) dptr, xmm4); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ } \
} \ } \
/* Use a single 128-bit SSE register. */ \ /* Use a single 128-bit SSE register. */ \
count = len >> (5-shifts); \ count = len >> (5-shifts); \
len -= count << (5-shifts); \ len -= count << (5-shifts); \
while (count--) \ while (count--) \
{ \ { \
__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ __m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
xmm1 = _op_(xmm1, xmm0); \ xmm1 = _op_(xmm1, xmm0); \
_mm_store_si128((__m128i *) dptr, xmm1); \ _mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ } \
/* Finish off the remainder. */ \ /* Finish off the remainder. */ \
while (len--) { _slowWay_; } \ while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \ return PRIMITIVES_SUCCESS; \
} }
/* ---------------------------------------------------------------------------- /* ----------------------------------------------------------------------------
* SSD = Source1, Source2, Destination * SSD = Source1, Source2, Destination
*/ */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ #define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \ pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, UINT32 len) \
{ \ { \
int shifts; \ int shifts; \
UINT32 offBeatMask; \ UINT32 offBeatMask; \
const _type_ *sptr1 = pSrc1; \ const _type_ *sptr1 = pSrc1; \
const _type_ *sptr2 = pSrc2; \ const _type_ *sptr2 = pSrc2; \
_type_ *dptr = pDst; \ _type_ *dptr = pDst; \
size_t count; \ size_t count; \
if (len < 16) /* pointless if too small */ \ if (len < 16) /* pointless if too small */ \
{ \ { \
return _fallback_(pSrc1, pSrc2, pDst, len); \ return _fallback_(pSrc1, pSrc2, pDst, len); \
} \ } \
if (sizeof(_type_) == 1) shifts = 1; \ if (sizeof(_type_) == 1) shifts = 1; \
else if (sizeof(_type_) == 2) shifts = 2; \ else if (sizeof(_type_) == 2) shifts = 2; \
else if (sizeof(_type_) == 4) shifts = 3; \ else if (sizeof(_type_) == 4) shifts = 3; \
else if (sizeof(_type_) == 8) shifts = 4; \ else if (sizeof(_type_) == 8) shifts = 4; \
offBeatMask = (1 << (shifts - 1)) - 1; \ offBeatMask = (1 << (shifts - 1)) - 1; \
if ((ULONG_PTR) pDst & offBeatMask) \ if ((ULONG_PTR) pDst & offBeatMask) \
{ \ { \
/* Incrementing the pointer skips over 16-byte boundary. */ \ /* Incrementing the pointer skips over 16-byte boundary. */ \
return _fallback_(pSrc1, pSrc2, pDst, len); \ return _fallback_(pSrc1, pSrc2, pDst, len); \
} \ } \
/* Get to the 16-byte boundary now. */ \ /* Get to the 16-byte boundary now. */ \
while ((ULONG_PTR) dptr & 0x0f) \ while ((ULONG_PTR) dptr & 0x0f) \
{ \ { \
_slowWay_; \ pstatus_t status; \
if (--len == 0) return PRIMITIVES_SUCCESS; \ status = _slowWay_; \
} \ if (status != PRIMITIVES_SUCCESS) return status; \
/* Use 4 128-bit SSE registers. */ \ if (--len == 0) return PRIMITIVES_SUCCESS; \
count = len >> (7-shifts); \ } \
len -= count << (7-shifts); \ /* Use 4 128-bit SSE registers. */ \
if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \ count = len >> (7-shifts); \
{ \ len -= count << (7-shifts); \
/* Unaligned loads */ \ if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
while (count--) \ { \
{ \ /* Unaligned loads */ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ while (count--) \
xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \ { \
sptr1 += (16/sizeof(_type_)); \ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \ xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \ xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \ xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \ xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
sptr2 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \ xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \ sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \ xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \ sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \ xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \ sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \ xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
xmm1 = _op_(xmm1, xmm5); \ sptr2 += (16/sizeof(_type_)); \
xmm2 = _op_(xmm2, xmm6); \ xmm0 = _op_(xmm0, xmm4); \
xmm3 = _op_(xmm3, xmm7); \ xmm1 = _op_(xmm1, xmm5); \
_mm_store_si128((__m128i *) dptr, xmm0); \ xmm2 = _op_(xmm2, xmm6); \
dptr += (16/sizeof(_type_)); \ xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm1); \ _mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \ _mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \ _mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ _mm_store_si128((__m128i *) dptr, xmm3); \
} \ dptr += (16/sizeof(_type_)); \
else \ } \
{ \ } \
/* Aligned loads */ \ else \
while (count--) \ { \
{ \ /* Aligned loads */ \
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ while (count--) \
xmm0 = _mm_load_si128((__m128i *) sptr1); \ { \
sptr1 += (16/sizeof(_type_)); \ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
xmm1 = _mm_load_si128((__m128i *) sptr1); \ xmm0 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm2 = _mm_load_si128((__m128i *) sptr1); \ xmm1 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm3 = _mm_load_si128((__m128i *) sptr1); \ xmm2 = _mm_load_si128((__m128i *) sptr1); \
sptr1 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm4 = _mm_load_si128((__m128i *) sptr2); \ xmm3 = _mm_load_si128((__m128i *) sptr1); \
sptr2 += (16/sizeof(_type_)); \ sptr1 += (16/sizeof(_type_)); \
xmm5 = _mm_load_si128((__m128i *) sptr2); \ xmm4 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \ sptr2 += (16/sizeof(_type_)); \
xmm6 = _mm_load_si128((__m128i *) sptr2); \ xmm5 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \ sptr2 += (16/sizeof(_type_)); \
xmm7 = _mm_load_si128((__m128i *) sptr2); \ xmm6 = _mm_load_si128((__m128i *) sptr2); \
sptr2 += (16/sizeof(_type_)); \ sptr2 += (16/sizeof(_type_)); \
xmm0 = _op_(xmm0, xmm4); \ xmm7 = _mm_load_si128((__m128i *) sptr2); \
xmm1 = _op_(xmm1, xmm5); \ sptr2 += (16/sizeof(_type_)); \
xmm2 = _op_(xmm2, xmm6); \ xmm0 = _op_(xmm0, xmm4); \
xmm3 = _op_(xmm3, xmm7); \ xmm1 = _op_(xmm1, xmm5); \
_mm_store_si128((__m128i *) dptr, xmm0); \ xmm2 = _op_(xmm2, xmm6); \
dptr += (16/sizeof(_type_)); \ xmm3 = _op_(xmm3, xmm7); \
_mm_store_si128((__m128i *) dptr, xmm1); \ _mm_store_si128((__m128i *) dptr, xmm0); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm2); \ _mm_store_si128((__m128i *) dptr, xmm1); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm3); \ _mm_store_si128((__m128i *) dptr, xmm2); \
dptr += (16/sizeof(_type_)); \ dptr += (16/sizeof(_type_)); \
} \ _mm_store_si128((__m128i *) dptr, xmm3); \
} \ dptr += (16/sizeof(_type_)); \
/* Use a single 128-bit SSE register. */ \ } \
count = len >> (5-shifts); \ } \
len -= count << (5-shifts); \ /* Use a single 128-bit SSE register. */ \
while (count--) \ count = len >> (5-shifts); \
{ \ len -= count << (5-shifts); \
__m128i xmm0, xmm1; \ while (count--) \
xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \ { \
xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \ __m128i xmm0, xmm1; \
xmm0 = _op_(xmm0, xmm1); \ xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \
_mm_store_si128((__m128i *) dptr, xmm0); \ xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \
dptr += (16/sizeof(_type_)); \ xmm0 = _op_(xmm0, xmm1); \
} \ _mm_store_si128((__m128i *) dptr, xmm0); \
/* Finish off the remainder. */ \ dptr += (16/sizeof(_type_)); \
while (len--) { _slowWay_; } \ } \
return PRIMITIVES_SUCCESS; \ /* Finish off the remainder. */ \
while (len--) { _slowWay_; } \
return PRIMITIVES_SUCCESS; \
} }
#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */ #endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */

View File

@ -81,6 +81,6 @@ primitives_t* primitives_get_generic(void)
if (!pPrimitivesGenericInitialized) if (!pPrimitivesGenericInitialized)
primitives_init_generic(); primitives_init_generic();
return &pPrimitives; return &pPrimitivesGeneric;
} }

View File

@ -26,7 +26,7 @@ static BOOL test_add16s_func(void)
pstatus_t status; pstatus_t status;
INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]), ALIGN(src2[FUNC_TEST_SIZE + 3]), INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]), ALIGN(src2[FUNC_TEST_SIZE + 3]),
ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]); ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]);
char testStr[256]; char testStr[256];
testStr[0] = '\0'; testStr[0] = '\0';
@ -50,7 +50,7 @@ static BOOL test_add16s_func(void)
static BOOL test_add16s_speed(void) static BOOL test_add16s_speed(void)
{ {
BYTE ALIGN(src1[MAX_TEST_SIZE + 3]), ALIGN(src2[MAX_TEST_SIZE + 3]), BYTE ALIGN(src1[MAX_TEST_SIZE + 3]), ALIGN(src2[MAX_TEST_SIZE + 3]),
ALIGN(dst[MAX_TEST_SIZE + 3]); ALIGN(dst[MAX_TEST_SIZE + 3]);
if (!g_TestPrimitivesPerformance) if (!g_TestPrimitivesPerformance)
return TRUE; return TRUE;
@ -59,7 +59,8 @@ static BOOL test_add16s_speed(void)
winpr_RAND(src2, sizeof(src2)); winpr_RAND(src2, sizeof(src2));
if (!speed_test("add16s", "aligned", g_Iterations, if (!speed_test("add16s", "aligned", g_Iterations,
generic->add_16s, optimized->add_16s, (speed_test_fkt)generic->add_16s,
(speed_test_fkt)optimized->add_16s,
src1, src2, dst, FUNC_TEST_SIZE)) src1, src2, dst, FUNC_TEST_SIZE))
return FALSE; return FALSE;
@ -72,8 +73,11 @@ int TestPrimitivesAdd(int argc, char* argv[])
if (!test_add16s_func()) if (!test_add16s_func())
return -1; return -1;
if (!test_add16s_speed()) if (g_TestPrimitivesPerformance)
return -1; {
if (!test_add16s_speed())
return -1;
}
return 0; return 0;
} }

View File

@ -33,8 +33,13 @@ static const int block_size[] = { 4, 64, 256 };
#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8) #define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
#define BLU(_c_) ((_c_) & 0x000000FFU) #define BLU(_c_) ((_c_) & 0x000000FFU)
#define TOLERANCE 1 #define TOLERANCE 1
#define PIXEL(_addr_, _bytes_, _x_, _y_) \ static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_)
((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_))) {
const BYTE* addr = _addr_ + _x_ * sizeof(UINT32) + _y_ * _bytes_;
return (const UINT32*)addr;
}
#define SRC1_WIDTH 6 #define SRC1_WIDTH 6
#define SRC1_HEIGHT 6 #define SRC1_HEIGHT 6
#define SRC2_WIDTH 7 #define SRC2_WIDTH 7
@ -46,8 +51,8 @@ static const int block_size[] = { 4, 64, 256 };
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static UINT32 alpha_add( static UINT32 alpha_add(
UINT32 c1, UINT32 c1,
UINT32 c2) UINT32 c2)
{ {
UINT32 a1 = ALF(c1); UINT32 a1 = ALF(c1);
UINT32 r1 = RED(c1); UINT32 r1 = RED(c1);
@ -66,8 +71,8 @@ static UINT32 alpha_add(
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static UINT32 colordist( static UINT32 colordist(
UINT32 c1, UINT32 c1,
UINT32 c2) UINT32 c2)
{ {
int d, maxd = 0; int d, maxd = 0;
d = ABS(ALF(c1) - ALF(c2)); d = ABS(ALF(c1) - ALF(c2));
@ -90,10 +95,10 @@ static UINT32 colordist(
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static BOOL check(const BYTE* pSrc1, INT32 src1Step, static BOOL check(const BYTE* pSrc1, UINT32 src1Step,
const BYTE* pSrc2, INT32 src2Step, const BYTE* pSrc2, UINT32 src2Step,
BYTE* pDst, INT32 dstStep, BYTE* pDst, UINT32 dstStep,
INT32 width, INT32 height) UINT32 width, UINT32 height)
{ {
UINT32 x, y; UINT32 x, y;
for (y = 0; y < height; ++y) for (y = 0; y < height; ++y)
@ -120,14 +125,14 @@ static BOOL check(const BYTE* pSrc1, INT32 src1Step,
static BOOL test_alphaComp_func(void) static BOOL test_alphaComp_func(void)
{ {
pstatus_t status; pstatus_t status;
BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT]); BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]);
BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT]); BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]);
BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT]); BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]);
char testStr[256];
UINT32* ptr; UINT32* ptr;
UINT32 i; UINT32 i;
testStr[0] = '\0';
winpr_RAND((BYTE*)src1, sizeof(src1)); winpr_RAND((BYTE*)src1, sizeof(src1));
/* Special-case the first two values */ /* Special-case the first two values */
src1[0] &= 0x00FFFFFFU; src1[0] &= 0x00FFFFFFU;
src1[1] |= 0xFF000000U; src1[1] |= 0xFF000000U;
@ -141,8 +146,8 @@ static BOOL test_alphaComp_func(void)
memset(dst1, 0, sizeof(dst1)); memset(dst1, 0, sizeof(dst1));
status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH, status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH,
src2, 4 * SRC2_WIDTH, src2, 4 * SRC2_WIDTH,
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT); dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
if (status != PRIMITIVES_SUCCESS) if (status != PRIMITIVES_SUCCESS)
return FALSE; return FALSE;
@ -152,8 +157,8 @@ static BOOL test_alphaComp_func(void)
return FALSE; return FALSE;
status = optimized->alphaComp_argb((const BYTE*) src1, 4 * SRC1_WIDTH, status = optimized->alphaComp_argb((const BYTE*) src1, 4 * SRC1_WIDTH,
(const BYTE*) src2, 4 * SRC2_WIDTH, (const BYTE*) src2, 4 * SRC2_WIDTH,
(BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT); (BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
if (status != PRIMITIVES_SUCCESS) if (status != PRIMITIVES_SUCCESS)
return FALSE; return FALSE;
@ -188,7 +193,8 @@ static int test_alphaComp_speed(void)
memset(dst1, 0, sizeof(dst1)); memset(dst1, 0, sizeof(dst1));
if (!speed_test("add16s", "aligned", g_Iterations, if (!speed_test("add16s", "aligned", g_Iterations,
generic->alphaComp_argb, optimized->alphaComp_argb, (speed_test_fkt)generic->alphaComp_argb,
(speed_test_fkt)optimized->alphaComp_argb,
src1, 4 * SRC1_WIDTH, src1, 4 * SRC1_WIDTH,
src2, 4 * SRC2_WIDTH, src2, 4 * SRC2_WIDTH,
dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT)) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
@ -203,8 +209,11 @@ int TestPrimitivesAlphaComp(int argc, char* argv[])
if (!test_alphaComp_func()) if (!test_alphaComp_func())
return -1; return -1;
if (!test_alphaComp_speed()) if (g_TestPrimitivesPerformance)
return -1; {
if (!test_alphaComp_speed())
return -1;
}
return 0; return 0;
} }

View File

@ -20,99 +20,87 @@
#include "prim_test.h" #include "prim_test.h"
#define FUNC_TEST_SIZE 65536 #define FUNC_TEST_SIZE 65536
static const int ANDOR_PRETEST_ITERATIONS = 100000;
static const int TEST_TIME = 2.0; // seconds
#define VALUE (0xA5A5A5A5U) #define VALUE (0xA5A5A5A5U)
/* ========================================================================= */ /* ========================================================================= */
/**
 * Run one andC_32u implementation and verify its output element by element.
 *
 * @param name  label for the implementation under test, used in the failure message
 * @param fkt   andC_32u implementation to call
 * @param src   input buffer holding at least size elements
 * @param val   constant that is ANDed with every source element
 * @param dst   output buffer holding at least size elements
 * @param size  number of elements to process and compare
 *
 * @return TRUE if fkt reports PRIMITIVES_SUCCESS and every dst[i] equals
 *         (src[i] & val); FALSE otherwise (checking stops at the first mismatch).
 */
static BOOL test_and_32u_impl(const char* name, __andC_32u_t fkt,
                              const UINT32* src, const UINT32 val,
                              UINT32* dst, size_t size)
{
	size_t i;
	pstatus_t status = fkt(src, val, dst, size);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	for (i = 0; i < size; ++i)
	{
		if (dst[i] != (src[i] & val))
		{
			/* i is a size_t, so "%d" would mismatch the argument type;
			 * print the index as unsigned long with an explicit cast. */
			printf("AND %s FAIL[%lu] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
			       name, (unsigned long) i, src[i], val, src[i] & val, dst[i]);
			return FALSE;
		}
	}

	return TRUE;
}
static BOOL test_and_32u_func(void) static BOOL test_and_32u_func(void)
{ {
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]); UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
int failed = 0;
int i;
char testStr[256];
testStr[0] = '\0';
winpr_RAND(src, sizeof(src));
generic->andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
strcat(testStr, " general");
for (i = 1; i <= FUNC_TEST_SIZE; ++i) winpr_RAND((BYTE*)src, sizeof(src));
{
if (dst[i] != (src[i] & VALUE))
{
printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i]);
++failed;
}
}
#ifdef WITH_SSE2 if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u,
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u,
src + 1, VALUE, dst + 2, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u,
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
return FALSE;
if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u,
src + 1, VALUE, dst + 2, FUNC_TEST_SIZE))
return FALSE;
if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) return TRUE;
{
strcat(testStr, " SSE3");
/* Aligned */
memset(dst, 0, sizeof(dst));
sse3_andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
{
if (dst[i] != (src[i] & VALUE))
{
printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i]);
++failed;
}
}
/* Unaligned */
memset(dst, 0, sizeof(dst));
sse3_andC_32u(src + 1, VALUE, dst + 2, FUNC_TEST_SIZE);
for (i = 1; i <= FUNC_TEST_SIZE; ++i)
{
if (dst[i + 1] != (src[i] & VALUE))
{
printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], VALUE, src[i] & VALUE, dst[i + 1]);
++failed;
}
}
}
#endif /* i386 */
if (!failed) printf("All and_32u tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static BOOL test_and_32u_speed(void) static BOOL test_and_32u_speed(void)
{ {
UINT32 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]); UINT32 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
winpr_RAND(src, sizeof(src));
andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst, winpr_RAND((BYTE*)src, sizeof(src));
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
andC_32u_speed_test("and32u", "unaligned", src + 1, NULL, VALUE, dst, if (!speed_test("andC_32u", "aligned", g_Iterations,
test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME); (speed_test_fkt)generic->andC_32u,
return SUCCESS; (speed_test_fkt)optimized->andC_32u,
src + 1, VALUE, dst + 1, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("andC_32u", "unaligned", g_Iterations,
(speed_test_fkt)generic->andC_32u,
(speed_test_fkt)optimized->andC_32u,
src + 1, VALUE, dst + 2, MAX_TEST_SIZE))
return FALSE;
return TRUE;
} }
/* ========================================================================= */ /* ========================================================================= */
static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value) static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
{ {
UINT32 i; UINT32 i;
UINT32 failed = 0;
for (i = 1; i <= size; ++i) for (i = 0; i < size; ++i)
{ {
if (dst[i] != (src[i] | value)) if (dst[i] != (src[i] | value))
{ {
printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
i, src[i], value, src[i] | value, dst[i]); i, src[i], value, src[i] | value, dst[i]);
++failed; return FALSE;
} }
} }
@ -123,8 +111,7 @@ static BOOL test_or_32u_func(void)
{ {
pstatus_t status; pstatus_t status;
UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]); UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
char testStr[256];
testStr[0] = '\0';
winpr_RAND((BYTE*)src, sizeof(src)); winpr_RAND((BYTE*)src, sizeof(src));
status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE); status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
@ -153,7 +140,8 @@ static BOOL test_or_32u_speed(void)
winpr_RAND((BYTE*)src, sizeof(src)); winpr_RAND((BYTE*)src, sizeof(src));
if (!speed_test("add16s", "aligned", g_Iterations, if (!speed_test("add16s", "aligned", g_Iterations,
generic->orC_32u, optimized->orC_32u, (speed_test_fkt)generic->orC_32u,
(speed_test_fkt)optimized->orC_32u,
src + 1, VALUE, dst + 1, FUNC_TEST_SIZE)) src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
return FALSE; return FALSE;
@ -167,14 +155,16 @@ int TestPrimitivesAndOr(int argc, char* argv[])
if (!test_and_32u_func()) if (!test_and_32u_func())
return -1; return -1;
if (!test_and_32u_speed())
return -1;
if (!test_or_32u_func()) if (!test_or_32u_func())
return -1; return -1;
if (!test_or_32u_speed()) if (g_TestPrimitivesPerformance)
return -1; {
if (!test_and_32u_speed())
return -1;
if (!test_or_32u_speed())
return -1;
}
return 0; return 0;
} }

View File

@ -24,19 +24,16 @@ static const int YCBCR_TRIAL_ITERATIONS = 1000;
static const float TEST_TIME = 4.0; static const float TEST_TIME = 4.0;
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
int test_RGBToRGB_16s8u_P3AC4R_func(void) static BOOL test_RGBToRGB_16s8u_P3AC4R_func(void)
{ {
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
UINT32 ALIGN(out1[4096]); UINT32 ALIGN(out1[4096]);
#ifdef WITH_SSE2
UINT32 ALIGN(out2[4096]); UINT32 ALIGN(out2[4096]);
#endif
int i; int i;
int failed = 0; BOOL failed = FALSE;
char testStr[256];
INT16* ptrs[3]; INT16* ptrs[3];
prim_size_t roi = { 64, 64 }; prim_size_t roi = { 64, 64 };
testStr[0] = '\0';
winpr_RAND((BYTE*)r, sizeof(r)); winpr_RAND((BYTE*)r, sizeof(r));
winpr_RAND((BYTE*)g, sizeof(g)); winpr_RAND((BYTE*)g, sizeof(g));
winpr_RAND((BYTE*)b, sizeof(b)); winpr_RAND((BYTE*)b, sizeof(b));
@ -52,56 +49,38 @@ int test_RGBToRGB_16s8u_P3AC4R_func(void)
ptrs[0] = r; ptrs[0] = r;
ptrs[1] = g; ptrs[1] = g;
ptrs[2] = b; ptrs[2] = b;
generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2, if (generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
(BYTE*) out1, 64 * 4, &roi); (BYTE*) out1, 64 * 4, PIXEL_FORMAT_RGBA32,
#ifdef WITH_SSE2 &roi) != PRIMITIVES_SUCCESS)
return FALSE;
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) if (optimized->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
(BYTE*) out2, 64 * 4, PIXEL_FORMAT_RGBA32,
&roi) != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < 4096; ++i)
{ {
strcat(testStr, " SSE2"); if (out1[i] != out2[i])
sse2_RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
(BYTE*) out2, 64 * 4, &roi);
for (i = 0; i < 4096; ++i)
{ {
if (out1[i] != out2[i]) printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
{ i, out1[i], i, out2[i]);
printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n", failed = TRUE;
i, out1[i], i, out2[i]);
failed = 1;
}
} }
} }
#endif /* i386 */ return !failed;
if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static const prim_size_t roi64x64 = { 64, 64 }; static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void)
STD_SPEED_TEST(
rgb_to_argb_speed, INT16*, UINT32, dst = dst,
TRUE, generic->RGBToRGB_16s8u_P3AC4R(
(const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
#ifdef WITH_SSE2
TRUE, sse2_RGBToRGB_16s8u_P3AC4R(
(const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, dst = dst);
int test_RGBToRGB_16s8u_P3AC4R_speed(void)
{ {
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); const prim_size_t roi64x64 = { 64, 64 };
UINT32 ALIGN(dst[4096]); INT16 ALIGN(r[4096+1]), ALIGN(g[4096+1]), ALIGN(b[4096+1]);
UINT32 ALIGN(dst[4096+1]);
int i; int i;
INT16* ptrs[3]; INT16* ptrs[3];
int size_array[] = { 64 };
winpr_RAND((BYTE*)r, sizeof(r)); winpr_RAND((BYTE*)r, sizeof(r));
winpr_RAND((BYTE*)g, sizeof(g)); winpr_RAND((BYTE*)g, sizeof(g));
winpr_RAND((BYTE*)b, sizeof(b)); winpr_RAND((BYTE*)b, sizeof(b));
@ -114,29 +93,38 @@ int test_RGBToRGB_16s8u_P3AC4R_speed(void)
b[i] &= 0x00FFU; b[i] &= 0x00FFU;
} }
ptrs[0] = r; ptrs[0] = r+1;
ptrs[1] = g; ptrs[1] = g+1;
ptrs[2] = b; ptrs[2] = b+1;
rgb_to_argb_speed("RGBToARGB", "aligned",
(const INT16**) ptrs, NULL, 0, dst, if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations,
size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME); (speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
return SUCCESS; (speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
(const INT16**) ptrs, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64))
return FALSE;
if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
(speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
(speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
(const INT16**) ptrs, 64 * 2, ((BYTE*) dst)+1, 64 * 4, &roi64x64))
return FALSE;
return TRUE;
} }
/* ========================================================================= */ /* ========================================================================= */
int test_yCbCrToRGB_16s16s_P3P3_func(void) static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
{ {
pstatus_t status;
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]); INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]); INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]); INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
int i; int i;
int failed = 0;
char testStr[256];
const INT16* in[3]; const INT16* in[3];
INT16* out1[3]; INT16* out1[3];
INT16* out2[3]; INT16* out2[3];
prim_size_t roi = { 64, 64 }; prim_size_t roi = { 64, 64 };
testStr[0] = '\0';
winpr_RAND((BYTE*)y, sizeof(y)); winpr_RAND((BYTE*)y, sizeof(y));
winpr_RAND((BYTE*)cb, sizeof(cb)); winpr_RAND((BYTE*)cb, sizeof(cb));
winpr_RAND((BYTE*)cr, sizeof(cr)); winpr_RAND((BYTE*)cr, sizeof(cr));
@ -164,57 +152,40 @@ int test_yCbCrToRGB_16s16s_P3P3_func(void)
out2[0] = r2; out2[0] = r2;
out2[1] = g2; out2[1] = g2;
out2[2] = b2; out2[2] = b2;
generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
#ifdef WITH_SSE2
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < 4096; ++i)
{ {
strcat(testStr, " SSE2"); if ((ABS(r1[i] - r2[i]) > 1)
sse2_yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi); || (ABS(g1[i] - g2[i]) > 1)
|| (ABS(b1[i] - b2[i]) > 1))
for (i = 0; i < 4096; ++i)
{ {
if ((ABS(r1[i] - r2[i]) > 1) printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
|| (ABS(g1[i] - g2[i]) > 1) r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
|| (ABS(b1[i] - b2[i]) > 1)) return FALSE;
{
printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
failed = 1;
}
} }
} }
#endif /* i386 */ return TRUE;
if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
STD_SPEED_TEST(
ycbcr_to_rgb_speed, INT16*, INT16*, dst = dst,
TRUE, generic->yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
#ifdef WITH_SSE2
TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#elif defined(WITH_NEON)
TRUE, neon_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, dst = dst);
static int test_yCbCrToRGB_16s16s_P3P3_speed(void) static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
{ {
prim_size_t roi = { 64, 64 };
INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]); INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
int i; int i;
const INT16* input[3]; const INT16* input[3];
INT16* output[3]; INT16* output[3];
int size_array[] = { 64 };
winpr_RAND((BYTE*)y, sizeof(y)); winpr_RAND((BYTE*)y, sizeof(y));
winpr_RAND((BYTE*)cb, sizeof(cb)); winpr_RAND((BYTE*)cb, sizeof(cb));
winpr_RAND((BYTE*)cr, sizeof(cr)); winpr_RAND((BYTE*)cr, sizeof(cr));
@ -233,37 +204,35 @@ static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
output[0] = r; output[0] = r;
output[1] = g; output[1] = g;
output[2] = b; output[2] = b;
ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output,
size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME); if (!speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
return SUCCESS; (speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
(speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3,
input, 64 * 2, output, 64 * 2, &roi))
return FALSE;
return TRUE;
} }
int TestPrimitivesColors(int argc, char* argv[]) int TestPrimitivesColors(int argc, char* argv[])
{ {
int status; prim_test_setup(FALSE);
status = test_RGBToRGB_16s8u_P3AC4R_func();
if (status != SUCCESS) if (!test_RGBToRGB_16s8u_P3AC4R_func())
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_RGBToRGB_16s8u_P3AC4R_speed(); if (!test_RGBToRGB_16s8u_P3AC4R_speed())
if (status != SUCCESS)
return 1; return 1;
} }
status = test_yCbCrToRGB_16s16s_P3P3_func(); if (!test_yCbCrToRGB_16s16s_P3P3_func())
if (status != SUCCESS)
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_yCbCrToRGB_16s16s_P3P3_speed(); if (!test_yCbCrToRGB_16s16s_P3P3_speed())
if (status != SUCCESS)
return 1; return 1;
} }

View File

@ -19,22 +19,17 @@
#include <winpr/sysinfo.h> #include <winpr/sysinfo.h>
#include "prim_test.h" #include "prim_test.h"
static const int MEMCPY_PRETEST_ITERATIONS = 1000000;
static const int TEST_TIME = 1.0; // seconds
#define COPY_TESTSIZE (256*2+16*2+15+15) #define COPY_TESTSIZE (256*2+16*2+15+15)
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static int test_copy8u_func(void) static BOOL test_copy8u_func(void)
{ {
primitives_t* prims = primitives_get(); primitives_t* prims = primitives_get();
BYTE ALIGN(data[COPY_TESTSIZE + 15]); BYTE ALIGN(data[COPY_TESTSIZE + 15]);
int i, soff; int i, soff;
int failed = 0;
char testStr[256];
BYTE ALIGN(dest[COPY_TESTSIZE + 15]); BYTE ALIGN(dest[COPY_TESTSIZE + 15]);
testStr[0] = '\0';
winpr_RAND(data, sizeof(data)); winpr_RAND(data, sizeof(data));
strcat(testStr, " ptr");
for (soff = 0; soff < 16; ++soff) for (soff = 0; soff < 16; ++soff)
{ {
@ -47,7 +42,8 @@ static int test_copy8u_func(void)
for (length = 1; length <= COPY_TESTSIZE - doff; ++length) for (length = 1; length <= COPY_TESTSIZE - doff; ++length)
{ {
memset(dest, 0, sizeof(dest)); memset(dest, 0, sizeof(dest));
prims->copy_8u(data + soff, dest + doff, length); if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < length; ++i) for (i = 0; i < length; ++i)
{ {
@ -57,48 +53,47 @@ static int test_copy8u_func(void)
"data[%d]=0x%02x\n", "data[%d]=0x%02x\n",
doff, length, i + doff, dest[i + doff], doff, length, i + doff, dest[i + doff],
i + soff, data[i + soff]); i + soff, data[i + soff]);
failed = 1; return FALSE;
} }
} }
} }
} }
} }
if (!failed) printf("All copy8 tests passed (%s).\n", testStr); return TRUE;
return (failed > 0) ? FAILURE : SUCCESS;
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst = dst, static BOOL test_copy8u_speed(void)
TRUE, memcpy(dst, src1, size),
FALSE, PRIM_NOP, 0, FALSE,
TRUE, ippsCopy_8u(src1, dst, size));
int test_copy8u_speed(void)
{ {
BYTE ALIGN(src[MAX_TEST_SIZE + 4]); BYTE ALIGN(src[MAX_TEST_SIZE + 4]);
BYTE ALIGN(dst[MAX_TEST_SIZE + 4]); BYTE ALIGN(dst[MAX_TEST_SIZE + 4]);
copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME); if (!speed_test("copy_8u", "aligned", g_Iterations,
copy8u_speed_test("copy8u", "unaligned", src + 1, NULL, 0, dst, (speed_test_fkt)generic->copy_8u,
test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME); (speed_test_fkt)optimized->copy_8u,
return SUCCESS; src, dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("copy_8u", "unaligned", g_Iterations,
(speed_test_fkt)generic->copy_8u,
(speed_test_fkt)optimized->copy_8u,
src+1, dst+1, MAX_TEST_SIZE))
return FALSE;
return TRUE;
} }
int TestPrimitivesCopy(int argc, char* argv[]) int TestPrimitivesCopy(int argc, char* argv[])
{ {
int status; prim_test_setup(FALSE);
status = test_copy8u_func();
if (status != SUCCESS) if (!test_copy8u_func())
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_copy8u_speed(); if (!test_copy8u_speed())
if (status != SUCCESS)
return 1; return 1;
} }

View File

@ -53,14 +53,14 @@ static BOOL test_set8u_func(void)
{ {
UINT32 len; UINT32 len;
memset(dest, 0, sizeof(dest)); memset(dest, 3, sizeof(dest));
for (len = 1; len < 48 - off; ++len) for (len = 1; len < 48 - off; ++len)
{ {
status = generic->set_8u(0xa5, dest + off, len); status = generic->set_8u(0xa5, dest + off, len);
if (status != PRIMITIVES_SUCCESS) if (status != PRIMITIVES_SUCCESS)
return FALSE; return FALSE;
if (!check8(dest, len, off, 0xa8)) if (!check8(dest, len, off, 0xa5))
return FALSE; return FALSE;
} }
} }
@ -69,14 +69,14 @@ static BOOL test_set8u_func(void)
{ {
UINT32 len; UINT32 len;
memset(dest, 0, sizeof(dest)); memset(dest, 3, sizeof(dest));
for (len = 1; len < 48 - off; ++len) for (len = 1; len < 48 - off; ++len)
{ {
status = optimized->set_8u(0xa5, dest + off, len); status = optimized->set_8u(0xa5, dest + off, len);
if (status != PRIMITIVES_SUCCESS) if (status != PRIMITIVES_SUCCESS)
return FALSE; return FALSE;
if (!check8(dest, len, off, 0xa8)) if (!check8(dest, len, off, 0xa5))
return FALSE; return FALSE;
} }
} }
@ -95,8 +95,9 @@ static BOOL test_set8u_speed(void)
{ {
winpr_RAND(&value, sizeof(value)); winpr_RAND(&value, sizeof(value));
if (!speed_test("set_8u", "", g_Iterations, if (!speed_test("set_8u", "", g_Iterations,
generic->set_8u, optimized->set_8u, (speed_test_fkt)generic->set_8u,
value, dest + x, len)) (speed_test_fkt)optimized->set_8u,
value, dest + x, x))
return FALSE; return FALSE;
} }
@ -232,8 +233,9 @@ static BOOL test_set32u_speed(void)
{ {
winpr_RAND(&value, sizeof(value)); winpr_RAND(&value, sizeof(value));
if (!speed_test("set_32u", "", g_Iterations, if (!speed_test("set_32u", "", g_Iterations,
generic->set_32u, optimized->set_32u, (speed_test_fkt)generic->set_32u,
value, dest + x, len)) (speed_test_fkt)optimized->set_32u,
value, dest + x, x))
return FALSE; return FALSE;
} }
@ -251,8 +253,9 @@ static BOOL test_set32s_speed(void)
{ {
winpr_RAND(&value, sizeof(value)); winpr_RAND(&value, sizeof(value));
if (!speed_test("set_32s", "", g_Iterations, if (!speed_test("set_32s", "", g_Iterations,
generic->set_32s, optimized->set_32s, (speed_test_fkt)generic->set_32s,
value, dest + x, len)) (speed_test_fkt)optimized->set_32s,
value, dest + x, x))
return FALSE; return FALSE;
} }
@ -265,21 +268,20 @@ int TestPrimitivesSet(int argc, char* argv[])
if (!test_set8u_func()) if (!test_set8u_func())
return -1; return -1;
if (!test_set8u_speed())
return -1;
if (!test_set32s_func()) if (!test_set32s_func())
return -1; return -1;
if (!test_set32s_speed())
return -1;
if (!test_set32u_func()) if (!test_set32u_func())
return -1; return -1;
if (!test_set32u_speed()) if (g_TestPrimitivesPerformance)
return -1; {
if (!test_set8u_speed())
return -1;
if (!test_set32s_speed())
return -1;
if (!test_set32u_speed())
return -1;
}
return 0; return 0;
} }

View File

@ -20,207 +20,361 @@
#include "prim_test.h" #include "prim_test.h"
#define FUNC_TEST_SIZE 65536 #define FUNC_TEST_SIZE 65536
static const int SHIFT_PRETEST_ITERATIONS = 50000;
static const float TEST_TIME = 1.0;
#ifdef WITH_SSE2 static BOOL test_lShift_16s_func(void)
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
int _name_(void) \
{ \
_type_ ALIGN(src[FUNC_TEST_SIZE+3]), \
ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \
int failed = 0; \
int i; \
char testStr[256]; \
testStr[0] = '\0'; \
get_random_data(src, sizeof(src)); \
_f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \
if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) \
{ \
strcat(testStr, " SSE3"); \
/* Aligned */ \
_f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
{ \
if (d1[i] != d2[i]) \
{ \
printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
_str_, i, src[i], d1[i], d2[i]); \
++failed; \
} \
} \
/* Unaligned */ \
_f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \
for (i=1; i<=FUNC_TEST_SIZE; ++i) \
{ \
if (d1[i] != d2[i+1]) \
{ \
printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
_str_, i, src[i], d1[i], d2[i+1]); \
++failed; \
} \
} \
} \
if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \
return (failed > 0) ? FAILURE : SUCCESS; \
}
#else
#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
int _name_(void) \
{ \
return SUCCESS; \
}
#endif /* i386 */
SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s,
sse2_lShiftC_16s)
SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u,
sse2_lShiftC_16u)
SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s,
sse2_rShiftC_16s)
SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u,
sse2_rShiftC_16u)
/* ========================================================================= */
STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst = dst,
TRUE, general_lShiftC_16s(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_lShiftC_16s(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsLShiftC_16s(src1, constant, dst, size));
STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst = dst,
TRUE, general_lShiftC_16u(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_lShiftC_16u(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsLShiftC_16u(src1, constant, dst, size));
STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst = dst,
TRUE, general_rShiftC_16s(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_rShiftC_16s(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsRShiftC_16s(src1, constant, dst, size));
STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst = dst,
TRUE, general_rShiftC_16u(src1, constant, dst, size),
#ifdef WITH_SSE2
TRUE, sse2_rShiftC_16u(src1, constant, dst, size),
PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
TRUE, ippsRShiftC_16u(src1, constant, dst, size));
/* ------------------------------------------------------------------------- */
int test_lShift_16s_speed(void)
{ {
INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); pstatus_t status;
winpr_RAND(src, sizeof(src)); INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst, INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); UINT32 val;
speed_lShift_16s("lShift_16s", "unaligned", src + 1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); winpr_RAND((BYTE*)&val, sizeof(val));
return SUCCESS; winpr_RAND((BYTE*)src, sizeof(src));
val = (val % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
/**
 * Smoke test for lShiftC_16u.
 *
 * Calls the generic and the optimized implementation on random input with a
 * random shift count, once writing to d1 + 1 and once to d1 + 2, and requires
 * every call to return PRIMITIVES_SUCCESS.
 *
 * NOTE(review): only the status codes are checked; the data written to d1 is
 * not compared between the two implementations.
 *
 * @return TRUE if all four calls succeed, FALSE on the first failure.
 */
static BOOL test_lShift_16u_func(void)
{
	pstatus_t status;
	UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
	UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
	UINT32 val;

	winpr_RAND((BYTE*)&val, sizeof(val));
	winpr_RAND((BYTE*)src, sizeof(src));
	/* The elements are 16 bit wide, so only shift counts 1..15 are
	 * meaningful. Counts at or above the width of the promoted operand are
	 * undefined behaviour in C, so do not derive the count from the buffer
	 * size. */
	val = (val % 15) + 1;

	/* Aligned */
	status = generic->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	status = optimized->lShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	/* Unaligned */
	status = generic->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	status = optimized->lShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	return TRUE;
}
/**
 * Smoke test for rShiftC_16s.
 *
 * Calls the generic and the optimized implementation on random input with a
 * random shift count, once writing to d1 + 1 and once to d1 + 2, and requires
 * every call to return PRIMITIVES_SUCCESS.
 *
 * NOTE(review): only the status codes are checked; the data written to d1 is
 * not compared between the two implementations.
 *
 * @return TRUE if all four calls succeed, FALSE on the first failure.
 */
static BOOL test_rShift_16s_func(void)
{
	pstatus_t status;
	INT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
	INT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
	UINT32 val;

	winpr_RAND((BYTE*)&val, sizeof(val));
	winpr_RAND((BYTE*)src, sizeof(src));
	/* The elements are 16 bit wide, so only shift counts 1..15 are
	 * meaningful. Counts at or above the width of the promoted operand are
	 * undefined behaviour in C, so do not derive the count from the buffer
	 * size. */
	val = (val % 15) + 1;

	/* Aligned */
	status = generic->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	status = optimized->rShiftC_16s(src + 1, val, d1 + 1, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	/* Unaligned */
	status = generic->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	status = optimized->rShiftC_16s(src + 1, val, d1 + 2, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	return TRUE;
}
/**
 * Smoke test for rShiftC_16u.
 *
 * Calls the generic and the optimized implementation on random input with a
 * random shift count, once writing to d1 + 1 and once to d1 + 2, and requires
 * every call to return PRIMITIVES_SUCCESS.
 *
 * NOTE(review): only the status codes are checked; the data written to d1 is
 * not compared between the two implementations.
 *
 * @return TRUE if all four calls succeed, FALSE on the first failure.
 */
static BOOL test_rShift_16u_func(void)
{
	pstatus_t status;
	UINT16 ALIGN(src[FUNC_TEST_SIZE + 3]);
	UINT16 ALIGN(d1[FUNC_TEST_SIZE + 3]);
	UINT32 val;

	winpr_RAND((BYTE*)&val, sizeof(val));
	winpr_RAND((BYTE*)src, sizeof(src));
	/* The elements are 16 bit wide, so only shift counts 1..15 are
	 * meaningful. Counts at or above the width of the promoted operand are
	 * undefined behaviour in C, so do not derive the count from the buffer
	 * size. */
	val = (val % 15) + 1;

	/* Aligned */
	status = generic->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	status = optimized->rShiftC_16u(src + 1, val, d1 + 1, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	/* Unaligned */
	status = generic->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	status = optimized->rShiftC_16u(src + 1, val, d1 + 2, FUNC_TEST_SIZE);

	if (status != PRIMITIVES_SUCCESS)
		return FALSE;

	return TRUE;
}
static BOOL test_ShiftWrapper_16s_func(void)
{
pstatus_t status;
INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 tmp;
INT32 val;
winpr_RAND((BYTE*)&tmp, sizeof(tmp));
winpr_RAND((BYTE*)src, sizeof(src));
val = (tmp % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
}
static BOOL test_ShiftWrapper_16u_func(void)
{
pstatus_t status;
UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
UINT32 tmp;
INT32 val;
winpr_RAND((BYTE*)&tmp, sizeof(tmp));
winpr_RAND((BYTE*)src, sizeof(src));
val = (tmp % (FUNC_TEST_SIZE - 1)) + 1;
/* Aligned */
status = generic->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
/* Unaligned */
status = generic->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
return TRUE;
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
int test_lShift_16u_speed(void) static BOOL test_lShift_16s_speed(void)
{ {
UINT32 val;
INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
winpr_RAND((BYTE*)src, sizeof(src));
winpr_RAND((BYTE*)&val, sizeof(val));
if (!speed_test("lShift_16s", "aligned", g_Iterations,
(speed_test_fkt)generic->lShiftC_16s,
(speed_test_fkt)optimized->lShiftC_16s, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("lShift_16s", "unaligned", g_Iterations,
(speed_test_fkt)generic->lShiftC_16s,
(speed_test_fkt)optimized->lShiftC_16s, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
}
/* ------------------------------------------------------------------------- */
static BOOL test_lShift_16u_speed(void)
{
UINT32 val;
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
winpr_RAND(src, sizeof(src));
speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst, winpr_RAND((BYTE*)&val, sizeof(val));
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); winpr_RAND((BYTE*)src, sizeof(src));
speed_lShift_16u("lShift_16u", "unaligned", src + 1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); if (!speed_test("lShift_16u", "aligned", g_Iterations,
return SUCCESS; (speed_test_fkt)generic->lShiftC_16u,
(speed_test_fkt)optimized->lShiftC_16u, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("lShift_16u", "unaligned", g_Iterations,
(speed_test_fkt)generic->lShiftC_16u,
(speed_test_fkt)optimized->lShiftC_16u, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
int test_rShift_16s_speed(void) static BOOL test_rShift_16s_speed(void)
{ {
INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); UINT32 val;
winpr_RAND(src, sizeof(src)); INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); winpr_RAND((BYTE*)src, sizeof(src));
speed_rShift_16s("rShift_16s", "unaligned", src + 1, NULL, 3, dst, winpr_RAND((BYTE*)&val, sizeof(val));
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); if (!speed_test("rShift_16s", "aligned", g_Iterations,
return SUCCESS; (speed_test_fkt)generic->rShiftC_16s,
(speed_test_fkt)optimized->rShiftC_16s, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("rShift_16s", "unaligned", g_Iterations,
(speed_test_fkt)generic->rShiftC_16s,
(speed_test_fkt)optimized->rShiftC_16s, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
} }
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
int test_rShift_16u_speed(void) static BOOL test_rShift_16u_speed(void)
{ {
UINT32 val;
UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
winpr_RAND(src, sizeof(src));
speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst, winpr_RAND((BYTE*)&val, sizeof(val));
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); winpr_RAND((BYTE*)src, sizeof(src));
speed_rShift_16u("rShift_16u", "unaligned", src + 1, NULL, 3, dst,
test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); if (!speed_test("rShift_16u", "aligned", g_Iterations,
return SUCCESS; (speed_test_fkt)generic->rShiftC_16u,
(speed_test_fkt)optimized->rShiftC_16u, src, val,
dst, MAX_TEST_SIZE))
return FALSE;
if (!speed_test("rShift_16u", "unaligned", g_Iterations,
(speed_test_fkt)generic->rShiftC_16u,
(speed_test_fkt)optimized->rShiftC_16u, src + 1, val,
dst, MAX_TEST_SIZE))
return FALSE;
return TRUE;
} }
int TestPrimitivesShift(int argc, char* argv[]) int TestPrimitivesShift(int argc, char* argv[])
{ {
int status; prim_test_setup(FALSE);
status = test_lShift_16s_func();
if (status != SUCCESS) if (!test_lShift_16s_func())
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_lShift_16s_speed(); if (!test_lShift_16s_speed())
if (status != SUCCESS)
return 1; return 1;
} }
status = test_lShift_16u_func(); if (!test_lShift_16u_func())
if (status != SUCCESS)
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_lShift_16u_speed(); if (!test_lShift_16u_speed())
if (status != SUCCESS)
return 1; return 1;
} }
status = test_rShift_16s_func(); if (!test_rShift_16s_func())
if (status != SUCCESS)
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_rShift_16s_speed(); if (!test_rShift_16s_speed())
if (status != SUCCESS)
return 1; return 1;
} }
status = test_rShift_16u_func(); if (!test_rShift_16u_func())
if (status != SUCCESS)
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_rShift_16u_speed(); if (!test_rShift_16u_speed())
if (status != SUCCESS)
return 1; return 1;
} }
if (!test_ShiftWrapper_16s_func())
return 1;
if (!test_ShiftWrapper_16u_func())
return 1;
return 0; return 0;
} }

View File

@ -19,103 +19,71 @@
#include <winpr/sysinfo.h> #include <winpr/sysinfo.h>
#include "prim_test.h" #include "prim_test.h"
static const int SIGN_PRETEST_ITERATIONS = 100000; #define TEST_BUFFER_SIZE 65535
static const float TEST_TIME = 1.0;
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
static int test_sign16s_func(void) static BOOL test_sign16s_func(void)
{ {
INT16 ALIGN(src[65535]), ALIGN(d1[65535]); pstatus_t status;
#ifdef WITH_SSE2 INT16 ALIGN(src[TEST_BUFFER_SIZE]);
INT16 ALIGN(d2[65535]); INT16 ALIGN(d1[TEST_BUFFER_SIZE]);
int i; INT16 ALIGN(d2[TEST_BUFFER_SIZE]);
#endif
int failed = 0;
char testStr[256];
/* Test when we can reach 16-byte alignment */
testStr[0] = '\0';
winpr_RAND(src, sizeof(src));
general_sign_16s(src + 1, d1 + 1, 65535);
#ifdef WITH_SSE2
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) winpr_RAND((BYTE*)src, sizeof(src));
{
strcat(testStr, " SSSE3");
ssse3_sign_16s(src + 1, d2 + 1, 65535);
for (i = 1; i < 65535; ++i) status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE);
{ if (status != PRIMITIVES_SUCCESS)
if (d1[i] != d2[i]) return FALSE;
{ status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE);
printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n", if (status != PRIMITIVES_SUCCESS)
i, src[i], d1[i], d2[i]); return FALSE;
++failed;
}
}
}
#endif /* i386 */ if (memcmp(d1, d2, sizeof(d1)) != 0)
/* Test when we cannot reach 16-byte alignment */ return FALSE;
winpr_RAND(src, sizeof(src));
general_sign_16s(src + 1, d1 + 2, 65535);
#ifdef WITH_SSE2
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE);
{ if (status != PRIMITIVES_SUCCESS)
ssse3_sign_16s(src + 1, d2 + 2, 65535); return FALSE;
status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 2; i < 65535; ++i) if (memcmp(d1, d2, sizeof(d1)) != 0)
{ return FALSE;
if (d1[i] != d2[i])
{
printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n",
i, src[i - 1], d1[i], d2[i]);
++failed;
}
}
}
#endif /* i386 */ return TRUE;
if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
} }
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst = dst,
TRUE, general_sign_16s(src1, dst, size),
#ifdef WITH_SSE2
TRUE, ssse3_sign_16s(src1, dst, size), PF_EX_SSSE3, TRUE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, dst = dst);
static int test_sign16s_speed(void) static int test_sign16s_speed(void)
{ {
INT16 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]); INT16 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
winpr_RAND(src, sizeof(src)); winpr_RAND((BYTE*)src, sizeof(src));
sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME); if (!speed_test("sign16s", "aligned", g_Iterations,
sign16s_speed_test("sign16s", "unaligned", src + 1, NULL, 0, dst, (speed_test_fkt)generic->sign_16s,
test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME); (speed_test_fkt)optimized->sign_16s, src + 1, dst + 1,
return SUCCESS; MAX_TEST_SIZE))
return FALSE;
if (!speed_test("sign16s", "unaligned", g_Iterations,
(speed_test_fkt)generic->sign_16s,
(speed_test_fkt)optimized->sign_16s, src + 1, dst + 2,
MAX_TEST_SIZE))
return FALSE;
return TRUE;
} }
int TestPrimitivesSign(int argc, char* argv[]) int TestPrimitivesSign(int argc, char* argv[])
{ {
int status; prim_test_setup(FALSE);
status = test_sign16s_func();
if (status != SUCCESS) if (!test_sign16s_func())
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_sign16s_speed(); if (!test_sign16s_speed())
if (status != SUCCESS)
return 1; return 1;
} }

View File

@ -23,105 +23,103 @@
#include <winpr/sysinfo.h> #include <winpr/sysinfo.h>
#include "prim_test.h" #include "prim_test.h"
static const int YCOCG_TRIAL_ITERATIONS = 20000;
static const float TEST_TIME = 4.0;
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
int test_YCoCgRToRGB_8u_AC4R_func(void) static BOOL test_YCoCgRToRGB_8u_AC4R_func(void)
{ {
#ifdef WITH_SSE2 BOOL result = TRUE;
int i; pstatus_t status;
INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]); INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
#endif
INT32 ALIGN(in[4098]); INT32 ALIGN(in[4098]);
INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]); INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
char testStr[256];
BOOL failed = FALSE;
testStr[0] = '\0';
winpr_RAND(in, sizeof(in));
general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c, 63 * 4, 63, 61, 2, TRUE, FALSE);
general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
#ifdef WITH_SSE2
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) UINT32 i, x;
const UINT32 formats[] = {
PIXEL_FORMAT_ARGB32,
PIXEL_FORMAT_ABGR32,
PIXEL_FORMAT_RGBA32,
PIXEL_FORMAT_RGBX32,
PIXEL_FORMAT_BGRA32,
PIXEL_FORMAT_BGRX32
};
winpr_RAND((BYTE*)in, sizeof(in));
for (x=0; x<sizeof(formats)/sizeof(formats[0]); x++)
{ {
strcat(testStr, " SSSE3"); UINT32 format = formats[x];
ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse, 63 * 4, 63, 61, 2, TRUE, FALSE); status = generic->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = generic->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
status = optimized->YCoCgToRGB_8u_AC4R(
(const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
for (i = 0; i < 63 * 61; ++i) for (i = 0; i < 63 * 61; ++i)
{ {
if (out_c[i] != out_sse[i]) if (out_c[i] != out_sse[i])
{ {
printf("YCoCgRToRGB-SSE FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i, printf("optimized->YCoCgRToRGB FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n", i,
in[i + 1], out_c[i], out_sse[i]); in[i + 1], out_c[i], out_sse[i]);
failed = TRUE; result = FALSE;
} }
} }
ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
(BYTE*) out_sse_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
for (i = 0; i < 63 * 61; ++i) for (i = 0; i < 63 * 61; ++i)
{ {
if (out_c_inv[i] != out_sse_inv[i]) if (out_c_inv[i] != out_sse_inv[i])
{ {
printf("YCoCgRToRGB-SSE inverted FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", printf("optimized->YCoCgRToRGB inverted FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n",
i, i,
in[i + 1], out_c_inv[i], out_sse_inv[i]); in[i + 1], out_c_inv[i], out_sse_inv[i]);
failed = TRUE; result = FALSE;
} }
} }
} }
return result;
#endif /* i386 */
if (!failed) printf("All YCoCgRToRGB_8u_AC4R tests passed (%s).\n", testStr);
return (failed > 0) ? FAILURE : SUCCESS;
} }
/* ------------------------------------------------------------------------- */
STD_SPEED_TEST(
ycocg_to_rgb_speed, BYTE, BYTE, PRIM_NOP,
TRUE, general_YCoCgToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
FALSE),
#ifdef WITH_SSE2
TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
FALSE),
PF_EX_SSSE3, TRUE,
#else
FALSE, PRIM_NOP, 0, FALSE,
#endif
FALSE, PRIM_NOP);
static int test_YCoCgRToRGB_8u_AC4R_speed(void) static int test_YCoCgRToRGB_8u_AC4R_speed(void)
{ {
INT32 ALIGN(in[4096]); INT32 ALIGN(in[4096]);
INT32 ALIGN(out[4096]); INT32 ALIGN(out[4096]);
int size_array[] = { 64 };
winpr_RAND(in, sizeof(in)); winpr_RAND((BYTE*)in, sizeof(in));
ycocg_to_rgb_speed("YCoCgToRGB", "aligned", (const BYTE*) in,
0, 0, (BYTE*) out, if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations,
size_array, 1, YCOCG_TRIAL_ITERATIONS, TEST_TIME); (speed_test_fkt)generic->YCoCgToRGB_8u_AC4R,
return SUCCESS; (speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R,
in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE))
return FALSE;
return TRUE;
} }
int TestPrimitivesYCoCg(int argc, char* argv[]) int TestPrimitivesYCoCg(int argc, char* argv[])
{ {
int status; prim_test_setup(FALSE);
status = test_YCoCgRToRGB_8u_AC4R_func();
if (status != SUCCESS) if (!test_YCoCgRToRGB_8u_AC4R_func())
return 1; return 1;
if (g_TestPrimitivesPerformance) if (g_TestPrimitivesPerformance)
{ {
status = test_YCoCgRToRGB_8u_AC4R_speed(); if (!test_YCoCgRToRGB_8u_AC4R_speed())
if (status != SUCCESS)
return 1; return 1;
} }

View File

@ -38,8 +38,8 @@ static void get_size(UINT32* width, UINT32* height)
winpr_RAND((BYTE*)width, sizeof(*width)); winpr_RAND((BYTE*)width, sizeof(*width));
winpr_RAND((BYTE*)height, sizeof(*height)); winpr_RAND((BYTE*)height, sizeof(*height));
// TODO: Algorithm only works on even resolutions... // TODO: Algorithm only works on even resolutions...
*width = (*width % 4000) << 1; *width = (*width % 64) << 1;
*height = (*height % 4000 << 1); *height = (*height % 64 << 1);
} }
static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding, static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding,
@ -370,11 +370,13 @@ static BOOL TestPrimitiveYUV(BOOL use444)
if (use444) if (use444)
{ {
if (prims->RGBToYUV444_8u_P3AC4R(rgb, stride, yuv, yuv_step, if (prims->RGBToYUV444_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
stride, yuv, yuv_step,
&roi) != PRIMITIVES_SUCCESS) &roi) != PRIMITIVES_SUCCESS)
goto fail; goto fail;
} }
else if (prims->RGBToYUV420_8u_P3AC4R(rgb, stride, yuv, yuv_step, else if (prims->RGBToYUV420_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
stride, yuv, yuv_step,
&roi) != PRIMITIVES_SUCCESS) &roi) != PRIMITIVES_SUCCESS)
goto fail; goto fail;
@ -429,16 +431,16 @@ int TestPrimitivesYUV(int argc, char* argv[])
UINT32 x; UINT32 x;
int rc = -1; int rc = -1;
prim_test_setup(FALSE);
for (x = 0; x < 10; x++) for (x = 0; x < 10; x++)
{ {
/* TODO: This test fails on value comparison,
* there seems to be some issue left with encoder / decoder pass.
if (!TestPrimitiveYUV(FALSE))
goto end;
*/
if (!TestPrimitiveYUV(TRUE)) if (!TestPrimitiveYUV(TRUE))
goto end; goto end;
if (!TestPrimitiveYUV(FALSE))
goto end;
if (!TestPrimitiveYUVCombine()) if (!TestPrimitiveYUVCombine())
goto end; goto end;
} }

View File

@ -43,13 +43,6 @@
extern int test_sizes[]; extern int test_sizes[];
#define NUM_TEST_SIZES 10 #define NUM_TEST_SIZES 10
#ifndef SUCCESS
#define SUCCESS 0
#endif
#ifndef FAILURE
#define FAILURE 1
#endif
extern BOOL g_TestPrimitivesPerformance; extern BOOL g_TestPrimitivesPerformance;
extern UINT32 g_Iterations; extern UINT32 g_Iterations;
@ -58,8 +51,10 @@ extern primitives_t* optimized;
void prim_test_setup(BOOL performance); void prim_test_setup(BOOL performance);
typedef pstatus_t (*speed_test_fkt)();
BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, BOOL speed_test(const char* name, const char* dsc, UINT32 iterations,
pstatus_t (*generic)(), pstatus_t (*optimised)(), speed_test_fkt generic, speed_test_fkt optimized,
...); ...);
#endif // !__PRIMTEST_H_INCLUDED__ #endif // !__PRIMTEST_H_INCLUDED__