Fixed SSE checks and RGBToRGB_16s8u_P3AC4R

This commit is contained in:
Armin Novak 2017-02-16 09:35:42 +01:00
parent 0d43201e3c
commit ee3a3a787b

View File

@ -718,6 +718,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
const UINT16* pr = (const UINT16*)(pSrc[0]);
const UINT16* pg = (const UINT16*)(pSrc[1]);
const UINT16* pb = (const UINT16*)(pSrc[2]);
const UINT32 pad = roi->width % 16;
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
BYTE* out;
UINT32 srcbump, dstbump, y;
@ -729,7 +730,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
{
UINT32 x;
for (x = 0; x < roi->width; x += 16)
for (x = 0; x < roi->width - pad; x += 16)
{
__m128i r, g, b;
/* The comments below pretend these are 8-byte registers
@ -737,25 +738,25 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
*/
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pb);
R0 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_loadu_si128((__m128i*)pb);
R1 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pg);
R0 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R0 = 00G300G200G100G0 */
R1 = _mm_loadu_si128((__m128i*)pg);
R1 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R1 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pr);
R0 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_loadu_si128((__m128i*)pr);
R1 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R1 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@ -790,6 +791,17 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
}
}
for (x = 0; x < pad; x++)
{
const BYTE R = CLIP(*pr++);
const BYTE G = CLIP(*pg++);
const BYTE B = CLIP(*pb++);
*out++ = B;
*out++ = G;
*out++ = R;
*out++ = 0xFF;
}
/* Jump to next row. */
pr += srcbump;
pg += srcbump;
@ -810,6 +822,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
const UINT16* pr = (const UINT16*)(pSrc[0]);
const UINT16* pg = (const UINT16*)(pSrc[1]);
const UINT16* pb = (const UINT16*)(pSrc[2]);
const UINT32 pad = roi->width % 16;
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
BYTE* out;
UINT32 srcbump, dstbump, y;
@ -821,7 +834,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
{
UINT32 x;
for (x = 0; x < roi->width; x += 16)
for (x = 0; x < roi->width - pad; x += 16)
{
__m128i r, g, b;
/* The comments below pretend these are 8-byte registers
@ -829,25 +842,25 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
*/
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pb);
R0 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_loadu_si128((__m128i*)pb);
R1 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pg);
R0 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R0 = 00G300G200G100G0 */
R1 = _mm_loadu_si128((__m128i*)pg);
R1 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R1 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pr);
R0 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_loadu_si128((__m128i*)pr);
R1 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R1 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@ -882,6 +895,17 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
}
}
for (x = 0; x < pad; x++)
{
const BYTE R = CLIP(*pr++);
const BYTE G = CLIP(*pg++);
const BYTE B = CLIP(*pb++);
*out++ = R;
*out++ = G;
*out++ = B;
*out++ = 0xFF;
}
/* Jump to next row. */
pr += srcbump;
pg += srcbump;
@ -902,6 +926,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
const UINT16* pr = (const UINT16*)(pSrc[0]);
const UINT16* pg = (const UINT16*)(pSrc[1]);
const UINT16* pb = (const UINT16*)(pSrc[2]);
const UINT32 pad = roi->width % 16;
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
BYTE* out;
UINT32 srcbump, dstbump, y;
@ -913,7 +938,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
{
UINT32 x;
for (x = 0; x < roi->width; x += 16)
for (x = 0; x < roi->width - pad; x += 16)
{
__m128i r, g, b;
/* The comments below pretend these are 8-byte registers
@ -921,25 +946,25 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
*/
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pb);
R0 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_loadu_si128((__m128i*)pb);
R1 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pg);
R0 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R0 = 00G300G200G100G0 */
R1 = _mm_loadu_si128((__m128i*)pg);
R1 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R1 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pr);
R0 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_loadu_si128((__m128i*)pr);
R1 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R1 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@ -974,6 +999,17 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
}
}
for (x = 0; x < pad; x++)
{
const BYTE R = CLIP(*pr++);
const BYTE G = CLIP(*pg++);
const BYTE B = CLIP(*pb++);
*out++ = 0xFF;
*out++ = B;
*out++ = G;
*out++ = R;
}
/* Jump to next row. */
pr += srcbump;
pg += srcbump;
@ -995,6 +1031,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
const UINT16* pg = (const UINT16*)(pSrc[1]);
const UINT16* pb = (const UINT16*)(pSrc[2]);
const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
const UINT32 pad = roi->width % 16;
BYTE* out;
UINT32 srcbump, dstbump, y;
out = (BYTE*) pDst;
@ -1005,7 +1042,7 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
{
UINT32 x;
for (x = 0; x < roi->width; x += 16)
for (x = 0; x < roi->width - pad; x += 16)
{
__m128i r, g, b;
/* The comments below pretend these are 8-byte registers
@ -1013,25 +1050,25 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
*/
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pb);
R0 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_loadu_si128((__m128i*)pb);
R1 = _mm_load_si128((__m128i*)pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pg);
R0 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R0 = 00G300G200G100G0 */
R1 = _mm_loadu_si128((__m128i*)pg);
R1 = _mm_load_si128((__m128i*)pg);
pg += 8; /* R1 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0, R1;
R0 = _mm_loadu_si128((__m128i*)pr);
R0 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_loadu_si128((__m128i*)pr);
R1 = _mm_load_si128((__m128i*)pr);
pr += 8; /* R1 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@ -1066,6 +1103,17 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
}
}
for (x = 0; x < pad; x++)
{
const BYTE R = CLIP(*pr++);
const BYTE G = CLIP(*pg++);
const BYTE B = CLIP(*pb++);
*out++ = 0xFF;
*out++ = R;
*out++ = G;
*out++ = B;
}
/* Jump to next row. */
pr += srcbump;
pg += srcbump;
@ -1084,6 +1132,10 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
UINT32 DstFormat,
const prim_size_t* roi)
{
if (((UINT64)pSrc[0] & 0x0f) || ((UINT64)pSrc[0] & 0x0f) || ((UINT64)pSrc[0] & 0x0f) ||
(srcStep & 0x0f) || ((UINT64)pDst & 0x0f) || (dstStep & 0x0f))
return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32: