Merge pull request #10720 from akallabeth/prim-copy-fix2
[primitives,copy] remove bgr24 to bgrx32 optimized
This commit is contained in:
commit
6ebef063b5
@ -733,6 +733,9 @@ BOOL freerdp_image_copy_overlap(BYTE* pDstData, DWORD DstFormat, UINT32 nDstStep
|
||||
SSIZE_T dstVOffset = 0;
|
||||
SSIZE_T dstVMultiplier = 1;
|
||||
|
||||
WINPR_ASSERT(overlapping(pDstData, nXDst, nYDst, nDstStep, dstByte, pSrcData, nXSrc, nYSrc,
|
||||
nSrcStep, srcByte, nWidth, nHeight));
|
||||
|
||||
if ((nWidth == 0) || (nHeight == 0))
|
||||
return TRUE;
|
||||
|
||||
@ -1608,6 +1611,9 @@ BOOL freerdp_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstForma
|
||||
if (!prims)
|
||||
prims = primitives_get();
|
||||
|
||||
WINPR_ASSERT(!overlapping(pDstData, nXDst, nYDst, nDstStep, FreeRDPGetBytesPerPixel(DstFormat),
|
||||
pSrcData, nXSrc, nYSrc, nSrcStep, FreeRDPGetBytesPerPixel(SrcFormat),
|
||||
nWidth, nHeight));
|
||||
WINPR_ASSERT(prims);
|
||||
WINPR_ASSERT(prims->copy_no_overlap);
|
||||
return prims->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
|
@ -1372,22 +1372,11 @@ static UINT gdi_SurfaceToSurface(RdpgfxClientContext* context,
|
||||
if (!is_rect_valid(&rect, surfaceDst->width, surfaceDst->height))
|
||||
goto fail;
|
||||
|
||||
if (surfaceDst == surfaceSrc)
|
||||
{
|
||||
if (!freerdp_image_copy_overlap(
|
||||
surfaceDst->data, surfaceDst->format, surfaceDst->scanline, destPt->x,
|
||||
destPt->y, nWidth, nHeight, surfaceSrc->data, surfaceSrc->format,
|
||||
surfaceSrc->scanline, rectSrc->left, rectSrc->top, NULL, FREERDP_FLIP_NONE))
|
||||
goto fail;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!freerdp_image_copy_no_overlap(
|
||||
surfaceDst->data, surfaceDst->format, surfaceDst->scanline, destPt->x,
|
||||
destPt->y, nWidth, nHeight, surfaceSrc->data, surfaceSrc->format,
|
||||
surfaceSrc->scanline, rectSrc->left, rectSrc->top, NULL, FREERDP_FLIP_NONE))
|
||||
goto fail;
|
||||
}
|
||||
if (!freerdp_image_copy(surfaceDst->data, surfaceDst->format, surfaceDst->scanline,
|
||||
destPt->x, destPt->y, nWidth, nHeight, surfaceSrc->data,
|
||||
surfaceSrc->format, surfaceSrc->scanline, rectSrc->left,
|
||||
rectSrc->top, NULL, FREERDP_FLIP_NONE))
|
||||
goto fail;
|
||||
|
||||
invalidRect = rect;
|
||||
region16_union_rect(&surfaceDst->invalidRegion, &surfaceDst->invalidRegion, &invalidRect);
|
||||
|
@ -146,6 +146,8 @@ static INLINE pstatus_t generic_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDs
|
||||
const SSIZE_T srcByte = 3;
|
||||
const SSIZE_T dstByte = 4;
|
||||
|
||||
const UINT32 width = nWidth - nWidth % 8;
|
||||
|
||||
for (SSIZE_T y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
@ -153,7 +155,16 @@ static INLINE pstatus_t generic_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDs
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
for (SSIZE_T x = 0; x < nWidth; x++)
|
||||
SSIZE_T x = 0;
|
||||
WINPR_PRAGMA_UNROLL_LOOP
|
||||
for (; x < width; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
dstLine[(x + nXDst) * dstByte + 2] = srcLine[(x + nXSrc) * srcByte + 2];
|
||||
}
|
||||
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
@ -174,6 +185,8 @@ static INLINE pstatus_t generic_image_copy_bgrx32_bgrx32(
|
||||
const SSIZE_T srcByte = 4;
|
||||
const SSIZE_T dstByte = 4;
|
||||
|
||||
const UINT32 width = nWidth - nWidth % 8;
|
||||
|
||||
for (SSIZE_T y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
@ -181,7 +194,15 @@ static INLINE pstatus_t generic_image_copy_bgrx32_bgrx32(
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
for (SSIZE_T x = 0; x < nWidth; x++)
|
||||
SSIZE_T x = 0;
|
||||
WINPR_PRAGMA_UNROLL_LOOP
|
||||
for (; x < width; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
dstLine[(x + nXDst) * dstByte + 2] = srcLine[(x + nXSrc) * srcByte + 2];
|
||||
}
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
dstLine[(x + nXDst) * dstByte + 0] = srcLine[(x + nXSrc) * srcByte + 0];
|
||||
dstLine[(x + nXDst) * dstByte + 1] = srcLine[(x + nXSrc) * srcByte + 1];
|
||||
@ -284,10 +305,6 @@ static INLINE pstatus_t generic_image_copy_no_overlap_dst_alpha(
|
||||
return generic_image_copy_bgrx32_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
case PIXEL_FORMAT_BGR24:
|
||||
return generic_image_copy_bgr24_bgrx32(
|
||||
pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
|
||||
nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -52,9 +52,17 @@ static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDa
|
||||
const SSIZE_T srcByte = 3;
|
||||
const SSIZE_T dstByte = 4;
|
||||
|
||||
const __m256i mask = _mm256_set_epi32(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
|
||||
const __m256i mask = _mm256_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
|
||||
0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
|
||||
const __m256i smask = _mm256_set_epi32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
|
||||
0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
|
||||
const __m256i shelpmask = _mm256_set_epi32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
|
||||
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
const SSIZE_T rem = nWidth % 8;
|
||||
const SSIZE_T width = nWidth - rem;
|
||||
|
||||
const size_t align = nSrcStep % 32;
|
||||
const BOOL fast = (align == 0) ? TRUE : (align >= 8 - MIN(8, rem) ? TRUE : FALSE);
|
||||
for (SSIZE_T y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
@ -63,15 +71,30 @@ static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDa
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
SSIZE_T x = 0;
|
||||
for (; x < width; x += 8)
|
||||
|
||||
/* Ensure alignment requirements can be met */
|
||||
if (fast)
|
||||
{
|
||||
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m256i s0 = _mm256_loadu_si256(src);
|
||||
const __m256i s1 = _mm256_loadu_si256(dst);
|
||||
const __m256i s2 = _mm256_shuffle_epi8(s1, mask);
|
||||
__m256i d0 = _mm256_blendv_epi8(s2, s0, mask);
|
||||
_mm256_storeu_si256(dst, d0);
|
||||
for (; x < width; x += 8)
|
||||
{
|
||||
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m256i s0 = _mm256_loadu_si256(src);
|
||||
__m256i s1 = _mm256_shuffle_epi8(s0, smask);
|
||||
|
||||
/* _mm256_shuffle_epi8 can not cross 128bit lanes.
|
||||
* manually copy these bytes with extract/insert */
|
||||
const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
|
||||
const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
|
||||
const __m256i bmask =
|
||||
_mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000);
|
||||
const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);
|
||||
|
||||
const __m256i s2 = _mm256_loadu_si256(dst);
|
||||
__m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
|
||||
_mm256_storeu_si256(dst, d0);
|
||||
}
|
||||
}
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
@ -140,7 +163,8 @@ static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
|
||||
UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
|
||||
SSIZE_T dstVOffset)
|
||||
{
|
||||
WINPR_ASSERT(pDstData);
|
||||
WINPR_ASSERT(pSrcData);
|
||||
@ -176,44 +200,9 @@ static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
|
||||
break;
|
||||
}
|
||||
|
||||
return avx2_image_copy_no_overlap_convert(
|
||||
pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
}
|
||||
|
||||
pstatus_t avx2_image_copy_no_overlap_convert(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
|
||||
{
|
||||
const SSIZE_T srcByte = FreeRDPGetBytesPerPixel(SrcFormat);
|
||||
const SSIZE_T dstByte = FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
const UINT32 width = nWidth - nWidth % 8;
|
||||
for (SSIZE_T y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
SSIZE_T x = 0;
|
||||
WINPR_PRAGMA_UNROLL_LOOP
|
||||
for (; x < width; x++)
|
||||
{
|
||||
const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat);
|
||||
const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);
|
||||
FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);
|
||||
}
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat);
|
||||
const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);
|
||||
FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);
|
||||
}
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
|
||||
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
|
||||
@ -253,7 +242,7 @@ static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD
|
||||
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
|
||||
return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
nXSrc, nYSrc, palette, flags, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset);
|
||||
else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
|
||||
return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
@ -261,10 +250,11 @@ static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset, flags);
|
||||
else
|
||||
return avx2_image_copy_no_overlap_convert(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset,
|
||||
dstVMultiplier, dstVOffset);
|
||||
{
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -52,8 +52,12 @@ static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDat
|
||||
const SSIZE_T srcByte = 3;
|
||||
const SSIZE_T dstByte = 4;
|
||||
|
||||
const __m128i mask = _mm_set_epi32(0xFF, 0xFF, 0xFF, 0xFF);
|
||||
const __m128i mask = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000);
|
||||
const __m128i smask = _mm_set_epi32(0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
|
||||
const SSIZE_T rem = nWidth % 4;
|
||||
|
||||
const size_t align = nSrcStep % 64;
|
||||
const BOOL fast = (align == 0) ? TRUE : (align >= 16 - MIN(16, rem) ? TRUE : FALSE);
|
||||
const SSIZE_T width = nWidth - rem;
|
||||
for (SSIZE_T y = 0; y < nHeight; y++)
|
||||
{
|
||||
@ -63,15 +67,20 @@ static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDat
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
SSIZE_T x = 0;
|
||||
for (; x < width; x += 4)
|
||||
/* Ensure alignment requirements can be met */
|
||||
if (fast)
|
||||
{
|
||||
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m128i s0 = _mm_loadu_si128(src);
|
||||
const __m128i s1 = _mm_loadu_si128(dst);
|
||||
const __m128i s2 = _mm_shuffle_epi8(s1, mask);
|
||||
__m128i d0 = _mm_blendv_epi8(s2, s0, mask);
|
||||
_mm_storeu_si128(dst, d0);
|
||||
for (; x < width; x += 4)
|
||||
{
|
||||
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
|
||||
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
|
||||
const __m128i s0 = _mm_loadu_si128(src);
|
||||
const __m128i s1 = _mm_shuffle_epi8(s0, smask);
|
||||
const __m128i s2 = _mm_loadu_si128(dst);
|
||||
|
||||
__m128i d0 = _mm_blendv_epi8(s1, s2, mask);
|
||||
_mm_storeu_si128(dst, d0);
|
||||
}
|
||||
}
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
@ -138,7 +147,8 @@ static pstatus_t sse_image_copy_no_overlap_dst_alpha(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
|
||||
UINT32 flags, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
|
||||
SSIZE_T dstVOffset)
|
||||
{
|
||||
WINPR_ASSERT(pDstData);
|
||||
WINPR_ASSERT(pSrcData);
|
||||
@ -174,45 +184,9 @@ static pstatus_t sse_image_copy_no_overlap_dst_alpha(
|
||||
break;
|
||||
}
|
||||
|
||||
/* Fall back to pixel copy */
|
||||
return sse_image_copy_no_overlap_convert(
|
||||
pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
|
||||
}
|
||||
|
||||
pstatus_t sse_image_copy_no_overlap_convert(
|
||||
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
|
||||
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
|
||||
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
|
||||
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
|
||||
{
|
||||
const SSIZE_T srcByte = FreeRDPGetBytesPerPixel(SrcFormat);
|
||||
const SSIZE_T dstByte = FreeRDPGetBytesPerPixel(DstFormat);
|
||||
|
||||
const UINT32 width = nWidth - nWidth % 8;
|
||||
for (SSIZE_T y = 0; y < nHeight; y++)
|
||||
{
|
||||
const BYTE* WINPR_RESTRICT srcLine =
|
||||
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
|
||||
BYTE* WINPR_RESTRICT dstLine =
|
||||
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
|
||||
|
||||
SSIZE_T x = 0;
|
||||
WINPR_PRAGMA_UNROLL_LOOP
|
||||
for (; x < width; x++)
|
||||
{
|
||||
const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat);
|
||||
const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);
|
||||
FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);
|
||||
}
|
||||
for (; x < nWidth; x++)
|
||||
{
|
||||
const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat);
|
||||
const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);
|
||||
FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);
|
||||
}
|
||||
}
|
||||
return PRIMITIVES_SUCCESS;
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
|
||||
static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
|
||||
@ -252,7 +226,7 @@ static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD
|
||||
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
|
||||
return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
nXSrc, nYSrc, palette, flags, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset);
|
||||
else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
|
||||
return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
@ -260,10 +234,11 @@ static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD
|
||||
nXSrc, nYSrc, palette, srcVMultiplier,
|
||||
srcVOffset, dstVMultiplier, dstVOffset, flags);
|
||||
else
|
||||
return sse_image_copy_no_overlap_convert(pDstData, DstFormat, nDstStep, nXDst, nYDst,
|
||||
nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
|
||||
nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset,
|
||||
dstVMultiplier, dstVOffset);
|
||||
{
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
|
||||
pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -12,7 +12,10 @@
|
||||
* permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <freerdp/config.h>
|
||||
#include <winpr/crypto.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include "prim_test.h"
|
||||
@ -71,10 +74,161 @@ static BOOL test_copy8u_speed(void)
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static BYTE* rand_alloc(size_t w, size_t h, size_t bpp, size_t pad, BYTE** copy)
|
||||
{
|
||||
const size_t s = w * bpp + pad;
|
||||
BYTE* ptr = calloc(s, h);
|
||||
if (!ptr)
|
||||
return NULL;
|
||||
|
||||
winpr_RAND(ptr, s * h);
|
||||
|
||||
if (copy)
|
||||
{
|
||||
BYTE* ptr2 = calloc(s, h);
|
||||
if (!ptr2)
|
||||
{
|
||||
free(ptr);
|
||||
return NULL;
|
||||
}
|
||||
memcpy(ptr2, ptr, s * h);
|
||||
*copy = ptr2;
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static size_t runcount = 0;
|
||||
|
||||
static BOOL test_copy_no_overlap_off(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
|
||||
UINT32 pad, UINT32 w, UINT32 h, UINT32 dxoff, UINT32 dyoff,
|
||||
UINT32 sxoff, UINT32 syoff)
|
||||
{
|
||||
BOOL rc = FALSE;
|
||||
primitives_t* gen = primitives_get_generic();
|
||||
primitives_t* prims = primitives_get();
|
||||
if (!gen || !prims)
|
||||
return FALSE;
|
||||
|
||||
runcount++;
|
||||
|
||||
WINPR_ASSERT(dxoff < w);
|
||||
WINPR_ASSERT(sxoff < w);
|
||||
WINPR_ASSERT(dyoff < h);
|
||||
WINPR_ASSERT(syoff < h);
|
||||
|
||||
const UINT32 sbpp = FreeRDPGetBytesPerPixel(srcFormat);
|
||||
const UINT32 dbpp = FreeRDPGetBytesPerPixel(dstFormat);
|
||||
|
||||
if (verbose)
|
||||
{
|
||||
(void)fprintf(stderr,
|
||||
"run src: %s, dst: %s [flags 0x%08" PRIx32 "] %" PRIu32 "x%" PRIu32
|
||||
", soff=%" PRIu32 "x%" PRIu32 ", doff=%" PRIu32 "x%" PRIu32 ", pad=%" PRIu32
|
||||
"\n",
|
||||
FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
|
||||
flags, w, h, sxoff, syoff, dxoff, dyoff, pad);
|
||||
}
|
||||
|
||||
const UINT32 sstride = (w + sxoff) * sbpp + pad;
|
||||
const UINT32 dstride = (w + dxoff) * dbpp + pad;
|
||||
BYTE* dst2 = NULL;
|
||||
BYTE* src2 = NULL;
|
||||
BYTE* dst1 = rand_alloc(w + dxoff, h + dyoff, dbpp, pad, &dst2);
|
||||
BYTE* src1 = rand_alloc(w + sxoff, h + syoff, sbpp, pad, &src2);
|
||||
if (!dst1 || !dst2 || !src1 || !src2)
|
||||
goto fail;
|
||||
|
||||
if (gen->copy_no_overlap(dst1, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat, sstride,
|
||||
sxoff, syoff, NULL, flags) != PRIMITIVES_SUCCESS)
|
||||
goto fail;
|
||||
|
||||
if (memcmp(src1, src2, 1ULL * sstride * h) != 0)
|
||||
goto fail;
|
||||
|
||||
if (prims->copy_no_overlap(dst2, dstFormat, dstride, dxoff, dyoff, w, h, src1, srcFormat,
|
||||
sstride, sxoff, syoff, NULL, flags) != PRIMITIVES_SUCCESS)
|
||||
goto fail;
|
||||
|
||||
if (memcmp(src1, src2, 1ULL * sstride * h) != 0)
|
||||
goto fail;
|
||||
|
||||
if (memcmp(dst1, dst2, 1ULL * dstride * h) != 0)
|
||||
goto fail;
|
||||
|
||||
if (flags == FREERDP_KEEP_DST_ALPHA)
|
||||
{
|
||||
for (size_t y = 0; y < h; y++)
|
||||
{
|
||||
const BYTE* d1 = &dst1[(y + dyoff) * dstride];
|
||||
const BYTE* d2 = &dst2[(y + dyoff) * dstride];
|
||||
for (size_t x = 0; x < w; x++)
|
||||
{
|
||||
const UINT32 c1 = FreeRDPReadColor(&d1[(x + dxoff) * dbpp], dstFormat);
|
||||
const UINT32 c2 = FreeRDPReadColor(&d2[(x + dxoff) * dbpp], dstFormat);
|
||||
BYTE a1 = 0;
|
||||
BYTE a2 = 0;
|
||||
FreeRDPSplitColor(c1, dstFormat, NULL, NULL, NULL, &a1, NULL);
|
||||
FreeRDPSplitColor(c2, dstFormat, NULL, NULL, NULL, &a2, NULL);
|
||||
if (a1 != a2)
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
rc = TRUE;
|
||||
|
||||
fail:
|
||||
if (!rc)
|
||||
{
|
||||
(void)fprintf(stderr, "failed to compare copy_no_overlap(%s -> %s [0x%08" PRIx32 "])\n",
|
||||
FreeRDPGetColorFormatName(srcFormat), FreeRDPGetColorFormatName(dstFormat),
|
||||
flags);
|
||||
}
|
||||
free(dst1);
|
||||
free(dst2);
|
||||
free(src1);
|
||||
free(src2);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static BOOL test_copy_no_overlap(BOOL verbose, UINT32 srcFormat, UINT32 dstFormat, UINT32 flags,
|
||||
UINT32 width, UINT32 height)
|
||||
{
|
||||
BOOL rc = TRUE;
|
||||
const UINT32 mw = 4;
|
||||
const UINT32 mh = 4;
|
||||
for (UINT32 dxoff = 0; dxoff < mw; dxoff++)
|
||||
{
|
||||
for (UINT32 dyoff = 0; dyoff <= mh; dyoff++)
|
||||
{
|
||||
for (UINT32 sxoff = 0; sxoff <= mw; sxoff++)
|
||||
{
|
||||
for (UINT32 syoff = 0; syoff <= mh; syoff++)
|
||||
{
|
||||
/* We need minimum alignment of 8 bytes.
|
||||
* AVX2 can read 8 pixels (at most 8x4=32 bytes) per step
|
||||
* if we have 24bpp input that is 24 bytes with 8 bytes read
|
||||
* out of bound */
|
||||
for (UINT32 pad = 8; pad <= 12; pad++)
|
||||
{
|
||||
if (!test_copy_no_overlap_off(verbose, srcFormat, dstFormat, flags, pad,
|
||||
width, height, dxoff, dyoff, sxoff, syoff))
|
||||
rc = FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
int TestPrimitivesCopy(int argc, char* argv[])
|
||||
{
|
||||
WINPR_UNUSED(argc);
|
||||
WINPR_UNUSED(argv);
|
||||
|
||||
const BOOL verbose = argc > 1;
|
||||
|
||||
prim_test_setup(FALSE);
|
||||
|
||||
if (!test_copy8u_func())
|
||||
@ -86,5 +240,48 @@ int TestPrimitivesCopy(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
const UINT32 flags[] = {
|
||||
FREERDP_FLIP_NONE,
|
||||
FREERDP_KEEP_DST_ALPHA,
|
||||
FREERDP_FLIP_HORIZONTAL,
|
||||
FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_HORIZONTAL,
|
||||
#if 0
|
||||
FREERDP_FLIP_VERTICAL,
|
||||
FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL,
|
||||
FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL,
|
||||
FREERDP_KEEP_DST_ALPHA | FREERDP_FLIP_VERTICAL | FREERDP_FLIP_HORIZONTAL
|
||||
#endif
|
||||
};
|
||||
const UINT32 formats[] = {
|
||||
PIXEL_FORMAT_BGRA32,
|
||||
PIXEL_FORMAT_BGRX32,
|
||||
PIXEL_FORMAT_BGR24
|
||||
#if 0 /* Only the previous 3 have SIMD optimizations, so skip the rest */
|
||||
, PIXEL_FORMAT_RGB24,
|
||||
PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_XBGR32,
|
||||
PIXEL_FORMAT_XRGB32, PIXEL_FORMAT_RGBA32, PIXEL_FORMAT_RGBX32
|
||||
#endif
|
||||
};
|
||||
|
||||
int rc = 0;
|
||||
for (size_t z = 0; z < ARRAYSIZE(flags); z++)
|
||||
{
|
||||
const UINT32 flag = flags[z];
|
||||
for (size_t x = 0; x < ARRAYSIZE(formats); x++)
|
||||
{
|
||||
const UINT32 sformat = formats[x];
|
||||
for (size_t y = 0; y < ARRAYSIZE(formats); y++)
|
||||
{
|
||||
const UINT32 dformat = formats[y];
|
||||
|
||||
if (!test_copy_no_overlap(verbose, sformat, dformat, flag, 21, 17))
|
||||
rc = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (verbose)
|
||||
(void)fprintf(stderr, "runcount=%" PRIuz "\n", runcount);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user