[primitives] add image copy primitive

* move freerdp_image_copy_no_overlap implementation to primitives
* add SSE4.1 and AVX2 optimizations
This commit is contained in:
akallabeth 2024-06-11 09:51:29 +02:00
parent 2ee987e665
commit 311068e605
No known key found for this signature in database
GPG Key ID: A49454A3FC909FD5
9 changed files with 1023 additions and 214 deletions

View File

@ -104,6 +104,12 @@ typedef pstatus_t (*__add_16s_t)(const INT16* WINPR_RESTRICT pSrc1,
UINT32 len);
typedef pstatus_t (*__add_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst1,
INT16* WINPR_RESTRICT pSrcDst2, UINT32 len);
/* Function pointer type of the non-overlapping image copy primitive.
 * Arguments, in order: destination buffer, destination format, destination
 * step, destination x/y, rectangle width/height, source buffer, source format,
 * source step, source x/y, a palette and copy flags. Returns a pstatus_t. */
typedef pstatus_t (*__copy_no_overlap_t)(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData,
DWORD SrcFormat, UINT32 nSrcStep, UINT32 nXSrc,
UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
UINT32 flags);
typedef pstatus_t (*__lShiftC_16s_inplace_t)(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len);
typedef pstatus_t (*__lShiftC_16s_t)(const INT16* pSrc, UINT32 val, INT16* pSrcDst, UINT32 len);
typedef pstatus_t (*__lShiftC_16u_t)(const UINT16* pSrc, UINT32 val, UINT16* pSrcDst, UINT32 len);
@ -222,6 +228,7 @@ typedef struct
*/
__add_16s_inplace_t add_16s_inplace;
__lShiftC_16s_inplace_t lShiftC_16s_inplace;
__copy_no_overlap_t copy_no_overlap;
} primitives_t;
typedef enum

View File

@ -5,6 +5,7 @@ set(CODEC_SRCS
bulk.h
dsp.c
color.c
color.h
audio.c
planar.c
bitmap.c

View File

@ -39,17 +39,9 @@
#include <libswscale/swscale.h>
#endif
#define TAG FREERDP_TAG("color")
#include "color.h"
static INLINE BOOL FreeRDPWriteColorIgnoreAlpha_int(BYTE* WINPR_RESTRICT dst, UINT32 format,
UINT32 color);
static INLINE BOOL FreeRDPWriteColor_int(BYTE* WINPR_RESTRICT dst, UINT32 format, UINT32 color);
static INLINE UINT32 FreeRDPReadColor_int(const BYTE* WINPR_RESTRICT src, UINT32 format);
/* Compare two pixel format values with bit 15 (the bit 8 << 12) masked out.
 * Returns non-zero when all remaining bits are equal.
 * NOTE(review): bit 15 is presumably part of the alpha field of the format
 * encoding - confirm against the pixel format macros. */
static INLINE DWORD FreeRDPAreColorFormatsEqualNoAlpha_int(DWORD first, DWORD second)
{
const DWORD mask = (DWORD) ~(8UL << 12UL);
return (first & mask) == (second & mask);
}
#define TAG FREERDP_TAG("color")
BYTE* freerdp_glyph_convert(UINT32 width, UINT32 height, const BYTE* WINPR_RESTRICT data)
{
@ -733,102 +725,6 @@ static INLINE BOOL freerdp_image_copy_no_overlap_dst_alpha(
srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
/* Copy an nWidth x nHeight pixel rectangle from pSrcData to pDstData.
 *
 * A step of 0 is replaced by nWidth * bytes-per-pixel of the respective
 * format. With FREERDP_FLIP_VERTICAL the source rows are addressed with a
 * multiplier of -1 and an offset of (nHeight - 1) * nSrcStep. With
 * FREERDP_KEEP_DST_ALPHA and a destination format that has alpha, the work is
 * handed to freerdp_image_copy_no_overlap_dst_alpha. Formats that are equal
 * apart from alpha are copied with memcpy, everything else is converted
 * pixel by pixel.
 *
 * Returns TRUE on success (including an empty rectangle), FALSE for NULL
 * buffers or dimensions above INT32_MAX. */
BOOL freerdp_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep,
UINT32 nXDst, UINT32 nYDst, UINT32 nWidth, UINT32 nHeight,
const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
const SSIZE_T dstByte = FreeRDPGetBytesPerPixel(DstFormat);
const SSIZE_T srcByte = FreeRDPGetBytesPerPixel(SrcFormat);
const SSIZE_T copyDstWidth = nWidth * dstByte;
const SSIZE_T xSrcOffset = nXSrc * srcByte;
const SSIZE_T xDstOffset = nXDst * dstByte;
const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
SSIZE_T srcVOffset = 0;
SSIZE_T srcVMultiplier = 1;
SSIZE_T dstVOffset = 0;
SSIZE_T dstVMultiplier = 1;
/* An empty rectangle is a successful no-op */
if ((nWidth == 0) || (nHeight == 0))
return TRUE;
if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
return FALSE;
if (!pDstData || !pSrcData)
return FALSE;
/* A step of 0 means: derive it from the rectangle width */
if (nDstStep == 0)
nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
if (nSrcStep == 0)
nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
if (vSrcVFlip)
{
srcVOffset = (nHeight - 1ll) * nSrcStep;
srcVMultiplier = -1;
}
if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
return freerdp_image_copy_no_overlap_dst_alpha(
pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
dstVOffset);
else if (FreeRDPAreColorFormatsEqualNoAlpha_int(SrcFormat, DstFormat))
{
/* NOTE(review): this fast path copies nDstStep * nHeight bytes and does not
 * look at nWidth, so a rectangle narrower than the step copies whole rows. */
if (!vSrcVFlip && (nDstStep == nSrcStep) && (xSrcOffset == 0) && (xDstOffset == 0))
{
const void* src = &pSrcData[1ull * nYSrc * nSrcStep];
void* dst = &pDstData[1ull * nYDst * nDstStep];
memcpy(dst, src, 1ull * nDstStep * nHeight);
}
else
{
/* Row by row copy of copyDstWidth bytes */
for (SSIZE_T y = 0; y < nHeight; y++)
{
const BYTE* WINPR_RESTRICT srcLine =
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
BYTE* WINPR_RESTRICT dstLine =
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
memcpy(&dstLine[xDstOffset], &srcLine[xSrcOffset], copyDstWidth);
}
}
}
else
{
for (SSIZE_T y = 0; y < nHeight; y++)
{
const BYTE* WINPR_RESTRICT srcLine =
&pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
BYTE* WINPR_RESTRICT dstLine =
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
/* The first pixel of the row seeds the oldColor/dstColor cache */
UINT32 color = FreeRDPReadColor_int(&srcLine[nXSrc * srcByte], SrcFormat);
UINT32 oldColor = color;
UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);
FreeRDPWriteColor_int(&dstLine[nXDst * dstByte], DstFormat, dstColor);
for (SSIZE_T x = 1; x < nWidth; x++)
{
color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat);
/* Same source value as the previous pixel: reuse the converted color */
if (color == oldColor)
{
FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);
}
else
{
oldColor = color;
dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);
FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);
}
}
}
}
return TRUE;
}
BOOL freerdp_image_copy_overlap(BYTE* pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst,
UINT32 nYDst, UINT32 nWidth, UINT32 nHeight, const BYTE* pSrcData,
DWORD SrcFormat, UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
@ -1608,124 +1504,16 @@ BOOL FreeRDPWriteColorIgnoreAlpha(BYTE* WINPR_RESTRICT dst, UINT32 format, UINT3
return FreeRDPWriteColorIgnoreAlpha_int(dst, format, color);
}
/* Write color to dst in the given format while keeping one byte already
 * stored at dst: dst[0] for the XBGR32/XRGB32/ABGR32/ARGB32 group, dst[3] for
 * the BGRX32/RGBX32/BGRA32/RGBA32 group. All other formats are written
 * unchanged through FreeRDPWriteColor_int, whose result is returned. */
BOOL FreeRDPWriteColorIgnoreAlpha_int(BYTE* WINPR_RESTRICT dst, UINT32 format, UINT32 color)
{
switch (format)
{
case PIXEL_FORMAT_XBGR32:
case PIXEL_FORMAT_XRGB32:
case PIXEL_FORMAT_ABGR32:
case PIXEL_FORMAT_ARGB32:
{
/* Existing dst[0] replaces the top 8 bits of color */
const UINT32 tmp = ((UINT32)dst[0] << 24ULL) | (color & 0x00FFFFFFULL);
return FreeRDPWriteColor_int(dst, format, tmp);
}
case PIXEL_FORMAT_BGRX32:
case PIXEL_FORMAT_RGBX32:
case PIXEL_FORMAT_BGRA32:
case PIXEL_FORMAT_RGBA32:
{
/* Existing dst[3] replaces the low 8 bits of color */
const UINT32 tmp = ((UINT32)dst[3]) | (color & 0xFFFFFF00ULL);
return FreeRDPWriteColor_int(dst, format, tmp);
}
default:
return FreeRDPWriteColor_int(dst, format, color);
}
}
/* Thin wrapper: forwards to FreeRDPWriteColor_int and returns its result. */
BOOL FreeRDPWriteColor(BYTE* WINPR_RESTRICT dst, UINT32 format, UINT32 color)
{
return FreeRDPWriteColor_int(dst, format, color);
}
/* Store color at dst using the bits-per-pixel of format.
 * 32 and 24 bpp are written most significant byte first, 16 and 15 bpp are
 * written low byte first, 8 bpp writes a single byte. For 15 bpp formats
 * without alpha the value is masked to 0x7FFF first.
 * Returns TRUE on success, FALSE (after logging) for any other bpp. */
BOOL FreeRDPWriteColor_int(BYTE* WINPR_RESTRICT dst, UINT32 format, UINT32 color)
{
switch (FreeRDPGetBitsPerPixel(format))
{
case 32:
dst[0] = (BYTE)(color >> 24);
dst[1] = (BYTE)(color >> 16);
dst[2] = (BYTE)(color >> 8);
dst[3] = (BYTE)color;
break;
case 24:
dst[0] = (BYTE)(color >> 16);
dst[1] = (BYTE)(color >> 8);
dst[2] = (BYTE)color;
break;
case 16:
dst[1] = (BYTE)(color >> 8);
dst[0] = (BYTE)color;
break;
case 15:
if (!FreeRDPColorHasAlpha(format))
color = color & 0x7FFF;
dst[1] = (BYTE)(color >> 8);
dst[0] = (BYTE)color;
break;
case 8:
dst[0] = (BYTE)color;
break;
default:
WLog_ERR(TAG, "Unsupported format %s", FreeRDPGetColorFormatName(format));
return FALSE;
}
return TRUE;
}
/* Thin wrapper: forwards to FreeRDPReadColor_int and returns its result. */
UINT32 FreeRDPReadColor(const BYTE* WINPR_RESTRICT src, UINT32 format)
{
return FreeRDPReadColor_int(src, format);
}
/* Load one pixel value from src using the bits-per-pixel of format.
 * 32 and 24 bpp are assembled most significant byte first, 16 and 15 bpp low
 * byte first; 8, 4 and 1 bpp read a single byte. For 15 bpp formats without
 * alpha the value is masked to 0x7FFF.
 * Returns the value, or 0 (after logging) for any other bpp. */
UINT32 FreeRDPReadColor_int(const BYTE* WINPR_RESTRICT src, UINT32 format)
{
UINT32 color = 0;
switch (FreeRDPGetBitsPerPixel(format))
{
case 32:
color =
((UINT32)src[0] << 24) | ((UINT32)src[1] << 16) | ((UINT32)src[2] << 8) | src[3];
break;
case 24:
color = ((UINT32)src[0] << 16) | ((UINT32)src[1] << 8) | src[2];
break;
case 16:
color = ((UINT32)src[1] << 8) | src[0];
break;
case 15:
color = ((UINT32)src[1] << 8) | src[0];
if (!FreeRDPColorHasAlpha(format))
color = color & 0x7FFF;
break;
case 8:
case 4:
case 1:
color = *src;
break;
default:
WLog_ERR(TAG, "Unsupported format %s", FreeRDPGetColorFormatName(format));
color = 0;
break;
}
return color;
}
UINT32 FreeRDPGetColor(UINT32 format, BYTE r, BYTE g, BYTE b, BYTE a)
{
UINT32 _r = r;
@ -1817,3 +1605,20 @@ UINT32 FreeRDPGetColor(UINT32 format, BYTE r, BYTE g, BYTE b, BYTE a)
return 0;
}
}
/**
 * Copy a rectangle of pixels between two non-overlapping buffers.
 *
 * The work is delegated to the copy_no_overlap entry of the primitives table
 * returned by primitives_get(); the table pointer is looked up on first use
 * and kept in a function-local static afterwards.
 *
 * @return TRUE when the primitive reports PRIMITIVES_SUCCESS, FALSE otherwise
 */
BOOL freerdp_image_copy_no_overlap(BYTE* pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst,
                                   UINT32 nYDst, UINT32 nWidth, UINT32 nHeight,
                                   const BYTE* pSrcData, DWORD SrcFormat, UINT32 nSrcStep,
                                   UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* palette,
                                   UINT32 flags)
{
	static primitives_t* table = NULL;

	if (table == NULL)
		table = primitives_get();

	WINPR_ASSERT(table);
	WINPR_ASSERT(table->copy_no_overlap);

	const pstatus_t status =
	    table->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
	                           pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
	return (status == PRIMITIVES_SUCCESS);
}

147
libfreerdp/codec/color.h Normal file
View File

@ -0,0 +1,147 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* codec color
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_CODEC_COLOR_H
#define FREERDP_LIB_CODEC_COLOR_H
#include <winpr/winpr.h>
#include <winpr/wtypes.h>
#include <freerdp/codec/color.h>
#include <freerdp/log.h>
#define INT_COLOR_TAG FREERDP_TAG("codec.color.h")
/**
 * Compare two pixel format values while disregarding bit 15 (8 << 12).
 *
 * @return non-zero when the formats agree in every other bit, 0 otherwise
 */
static INLINE DWORD FreeRDPAreColorFormatsEqualNoAlpha_int(DWORD first, DWORD second)
{
	const DWORD ignoredBit = (DWORD)(8UL << 12UL);
	const DWORD differing = first ^ second;

	/* Formats are equal when they differ in nothing but the ignored bit */
	return (differing & (DWORD)~ignoredBit) == 0;
}
/**
 * Store a pixel value at dst according to the bits-per-pixel of format.
 *
 * 32 and 24 bpp values are stored most significant byte first, 16 and 15 bpp
 * values low byte first and 8 bpp values as a single byte. A 15 bpp format
 * without alpha has the value masked to 0x7FFF before it is stored.
 *
 * @return TRUE when the pixel was written, FALSE (after logging an error)
 *         for any other bits-per-pixel value
 */
static INLINE BOOL FreeRDPWriteColor_int(BYTE* WINPR_RESTRICT dst, UINT32 format, UINT32 color)
{
	const UINT32 bpp = FreeRDPGetBitsPerPixel(format);

	if (bpp == 32)
	{
		dst[0] = (BYTE)(color >> 24);
		dst[1] = (BYTE)(color >> 16);
		dst[2] = (BYTE)(color >> 8);
		dst[3] = (BYTE)color;
		return TRUE;
	}

	if (bpp == 24)
	{
		dst[0] = (BYTE)(color >> 16);
		dst[1] = (BYTE)(color >> 8);
		dst[2] = (BYTE)color;
		return TRUE;
	}

	if ((bpp == 16) || (bpp == 15))
	{
		UINT32 value = color;

		/* Only 15 bpp formats without alpha drop the top bit */
		if ((bpp == 15) && !FreeRDPColorHasAlpha(format))
			value &= 0x7FFF;

		dst[0] = (BYTE)value;
		dst[1] = (BYTE)(value >> 8);
		return TRUE;
	}

	if (bpp == 8)
	{
		dst[0] = (BYTE)color;
		return TRUE;
	}

	WLog_ERR(INT_COLOR_TAG, "Unsupported format %s", FreeRDPGetColorFormatName(format));
	return FALSE;
}
/**
 * Store a pixel value at dst but keep one byte that is already there.
 *
 * For XBGR32/XRGB32/ABGR32/ARGB32 the current dst[0] replaces the top 8 bits
 * of color, for BGRX32/RGBX32/BGRA32/RGBA32 the current dst[3] replaces the
 * low 8 bits. Every other format is written unmodified.
 *
 * @return the result of FreeRDPWriteColor_int
 */
static INLINE BOOL FreeRDPWriteColorIgnoreAlpha_int(BYTE* WINPR_RESTRICT dst, UINT32 format,
                                                    UINT32 color)
{
	UINT32 merged = color;

	switch (format)
	{
		case PIXEL_FORMAT_XBGR32:
		case PIXEL_FORMAT_XRGB32:
		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_ARGB32:
			/* preserved byte is stored first in memory */
			merged = ((UINT32)dst[0] << 24) | (color & 0x00FFFFFFu);
			break;

		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_RGBX32:
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_RGBA32:
			/* preserved byte is stored last in memory */
			merged = (UINT32)dst[3] | (color & 0xFFFFFF00u);
			break;

		default:
			break;
	}

	return FreeRDPWriteColor_int(dst, format, merged);
}
/**
 * Load one pixel value from src according to the bits-per-pixel of format.
 *
 * 32 and 24 bpp values are assembled most significant byte first, 16 and
 * 15 bpp values low byte first; 8, 4 and 1 bpp read a single byte. A 15 bpp
 * format without alpha has the value masked to 0x7FFF.
 *
 * @return the pixel value, or 0 (after logging an error) for any other
 *         bits-per-pixel value
 */
static INLINE UINT32 FreeRDPReadColor_int(const BYTE* WINPR_RESTRICT src, UINT32 format)
{
	switch (FreeRDPGetBitsPerPixel(format))
	{
		case 32:
			return ((UINT32)src[0] << 24) | ((UINT32)src[1] << 16) | ((UINT32)src[2] << 8) |
			       src[3];

		case 24:
			return ((UINT32)src[0] << 16) | ((UINT32)src[1] << 8) | src[2];

		case 16:
			return ((UINT32)src[1] << 8) | src[0];

		case 15:
		{
			const UINT32 raw = ((UINT32)src[1] << 8) | src[0];
			return FreeRDPColorHasAlpha(format) ? raw : (raw & 0x7FFF);
		}

		case 8:
		case 4:
		case 1:
			return src[0];

		default:
			WLog_ERR(INT_COLOR_TAG, "Unsupported format %s", FreeRDPGetColorFormatName(format));
			return 0;
	}
}
#endif

View File

@ -6,6 +6,7 @@ set(PRIMITIVES_SRCS
prim_alphaComp.c
prim_colors.c
prim_copy.c
prim_copy.h
prim_set.c
prim_shift.c
prim_sign.c
@ -17,6 +18,8 @@ set(PRIMITIVES_SRCS
if (WITH_SSE2 OR WITH_NEON)
set(PRIMITIVES_SSE2_SRCS
prim_colors_opt.c
prim_copy_sse.c
prim_copy_avx2.c
prim_set_opt.c)
set(PRIMITIVES_SSE3_SRCS
@ -71,6 +74,8 @@ if(WITH_SSE2)
if (PRIMITIVES_SSSE3_SRCS)
set_source_files_properties(${PRIMITIVES_SSSE3_SRCS} PROPERTIES COMPILE_FLAGS "-mssse3" )
endif()
set_source_files_properties(prim_copy_sse.c PROPERTIES COMPILE_FLAGS "-msse4.1" )
set_source_files_properties(prim_copy_avx2.c PROPERTIES COMPILE_FLAGS "-mavx2" )
endif()
if(MSVC)

View File

@ -18,7 +18,15 @@
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#define TAG FREERDP_TAG("primitives.copy")
static primitives_t* generic = NULL;
@ -128,6 +136,247 @@ static pstatus_t general_copy_8u_AC4r(const BYTE* pSrc, INT32 srcStep, BYTE* pDs
return PRIMITIVES_SUCCESS;
}
/**
 * Copy 3-byte source pixels into 4-byte destination pixels.
 *
 * Only the first three bytes of every destination pixel are written, the
 * fourth byte keeps whatever value it had before.
 *
 * Rows are addressed as multiplier * (row + nY) * step + offset for source
 * and destination alike.
 *
 * @return always PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t generic_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                        UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                        UINT32 nWidth, UINT32 nHeight,
                                                        const BYTE* WINPR_RESTRICT pSrcData,
                                                        UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                        SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
                                                        SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T srcPixelSize = 3;
	const SSIZE_T dstPixelSize = 4;

	for (SSIZE_T row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

		/* Walk both rows with running pointers starting at the x offsets */
		const BYTE* in = &srcRow[nXSrc * srcPixelSize];
		BYTE* out = &dstRow[nXDst * dstPixelSize];

		for (UINT32 col = 0; col < nWidth; col++)
		{
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			in += srcPixelSize;
			out += dstPixelSize;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * Copy 4-byte source pixels into 4-byte destination pixels, transferring only
 * the first three bytes of each pixel. The fourth destination byte is left
 * untouched.
 *
 * Rows are addressed as multiplier * (row + nY) * step + offset for source
 * and destination alike.
 *
 * @return always PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t generic_image_copy_bgrx32_bgrx32(
    BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
    UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, UINT32 nSrcStep, UINT32 nXSrc,
    UINT32 nYSrc, SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier,
    SSIZE_T dstVOffset)
{
	const SSIZE_T pixelSize = 4;

	for (SSIZE_T row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
		const BYTE* in = &srcRow[nXSrc * pixelSize];
		BYTE* out = &dstRow[nXDst * pixelSize];
		UINT32 remaining = nWidth;

		while (remaining > 0)
		{
			/* three bytes per pixel, byte 3 of the destination is preserved */
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			in += pixelSize;
			out += pixelSize;
			remaining--;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * Copy a rectangle pixel by pixel, converting every pixel from SrcFormat to
 * DstFormat with FreeRDPConvertColor.
 *
 * Each row is processed in two consecutive loops: the leading part whose
 * length is a multiple of 8 (marked with the unroll pragma) and the
 * remaining pixels.
 *
 * @return always PRIMITIVES_SUCCESS
 */
pstatus_t generic_image_copy_no_overlap_convert(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T srcPixelSize = FreeRDPGetBytesPerPixel(SrcFormat);
	const SSIZE_T dstPixelSize = FreeRDPGetBytesPerPixel(DstFormat);
	const UINT32 unrolledWidth = nWidth - nWidth % 8;

	for (SSIZE_T row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
		SSIZE_T col = 0;

		/* leading part, length is a multiple of 8 */
		WINPR_PRAGMA_UNROLL_LOOP
		for (; col < unrolledWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcPixelSize];
			BYTE* out = &dstRow[(col + nXDst) * dstPixelSize];
			const UINT32 pixel = FreeRDPReadColor_int(in, SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(pixel, SrcFormat, DstFormat, palette);
			FreeRDPWriteColor_int(out, DstFormat, converted);
		}

		/* trailing nWidth % 8 pixels */
		for (; col < nWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcPixelSize];
			BYTE* out = &dstRow[(col + nXDst) * dstPixelSize];
			const UINT32 pixel = FreeRDPReadColor_int(in, SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(pixel, SrcFormat, DstFormat, palette);
			FreeRDPWriteColor_int(out, DstFormat, converted);
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * Copy a rectangle between two buffers whose pixel layout is identical, so
 * that rows can be transferred with memcpy.
 *
 * A single memcpy covering all rows is only used when the rectangle spans
 * complete rows: no vertical flip, identity row addressing, equal steps,
 * x offsets of 0 and a row payload (nWidth * bytes per pixel) equal to the
 * step. Previously the payload width was not checked, so a rectangle narrower
 * than the step copied whole rows and touched destination bytes outside of
 * the requested rectangle. In every other case the rows are copied one by
 * one, copyDstWidth bytes each.
 *
 * @param palette unused, present to match the sibling copy helpers
 * @param flags   only FREERDP_FLIP_VERTICAL is evaluated here
 *
 * @return always PRIMITIVES_SUCCESS
 */
pstatus_t generic_image_copy_no_overlap_memcpy(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset,
    UINT32 flags)
{
	const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
	const SSIZE_T dstByte = FreeRDPGetBytesPerPixel(DstFormat);
	const SSIZE_T srcByte = FreeRDPGetBytesPerPixel(SrcFormat);
	const SSIZE_T copyDstWidth = nWidth * dstByte;
	const SSIZE_T xSrcOffset = nXSrc * srcByte;
	const SSIZE_T xDstOffset = nXDst * dstByte;

	/* The bulk copy ignores the row multipliers/offsets, so they must be identity */
	const BOOL identityRows = (srcVMultiplier == 1) && (srcVOffset == 0) &&
	                          (dstVMultiplier == 1) && (dstVOffset == 0);
	/* Rows are contiguous only if the payload fills the complete step */
	const BOOL fullRows = (nDstStep == nSrcStep) && (xSrcOffset == 0) && (xDstOffset == 0) &&
	                      (copyDstWidth == (SSIZE_T)nDstStep);

	if (!vSrcVFlip && identityRows && fullRows)
	{
		const void* src = &pSrcData[1ull * nYSrc * nSrcStep];
		void* dst = &pDstData[1ull * nYDst * nDstStep];
		memcpy(dst, src, 1ull * nDstStep * nHeight);
	}
	else
	{
		for (SSIZE_T y = 0; y < nHeight; y++)
		{
			const BYTE* WINPR_RESTRICT srcLine =
			    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
			BYTE* WINPR_RESTRICT dstLine =
			    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
			memcpy(&dstLine[xDstOffset], &srcLine[xSrcOffset], copyDstWidth);
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * Select the copy routine used when the destination alpha has to be kept.
 *
 * BGR24 -> BGRX32/BGRA32 and BGRX32/BGRA32 -> BGRX32/BGRA32 have dedicated
 * loops; every other format pair is handed to
 * generic_image_copy_no_overlap_convert.
 *
 * @return the status of the selected routine
 */
static INLINE pstatus_t generic_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	const BOOL dstIs32 =
	    (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
	const BOOL srcIs32 =
	    (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);

	if (dstIs32 && (SrcFormat == PIXEL_FORMAT_BGR24))
	{
		return generic_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                       pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                       srcVOffset, dstVMultiplier, dstVOffset);
	}

	if (dstIs32 && srcIs32)
	{
		return generic_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
		                                        pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
		                                        srcVOffset, dstVMultiplier, dstVOffset);
	}

	/* no dedicated loop for this format pair */
	return generic_image_copy_no_overlap_convert(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
/**
 * Copy path used when the destination alpha does not have to be kept.
 *
 * Formats that FreeRDPAreColorFormatsEqualNoAlpha reports as equal are
 * copied with generic_image_copy_no_overlap_memcpy, all others go through
 * generic_image_copy_no_overlap_convert.
 *
 * @return the status of the selected routine
 */
static INLINE pstatus_t generic_image_copy_no_overlap_no_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset,
    UINT32 flags)
{
	if (!FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
	{
		return generic_image_copy_no_overlap_convert(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset);
	}

	return generic_image_copy_no_overlap_memcpy(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset, flags);
}
/**
 * Generic implementation of the copy_no_overlap primitive.
 *
 * Validates the arguments, fills in steps given as 0 with
 * nWidth * bytes-per-pixel of the respective format and sets up the source
 * row addressing for FREERDP_FLIP_VERTICAL (multiplier -1, offset
 * (nHeight - 1) * nSrcStep). The copy itself is done by the dst_alpha helper
 * when FREERDP_KEEP_DST_ALPHA is set and the destination format has alpha,
 * otherwise by the no_alpha helper.
 *
 * The former trailing "return PRIMITIVES_SUCCESS" could never be reached
 * because both branches return; it has been removed.
 *
 * @return PRIMITIVES_SUCCESS on success (including an empty rectangle), -1 for
 *         NULL buffers or dimensions above INT32_MAX, otherwise the status of
 *         the selected helper
 */
static pstatus_t generic_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                               UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                               UINT32 nWidth, UINT32 nHeight,
                                               const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                               UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                               const gdiPalette* WINPR_RESTRICT palette,
                                               UINT32 flags)
{
	const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
	SSIZE_T srcVOffset = 0;
	SSIZE_T srcVMultiplier = 1;
	SSIZE_T dstVOffset = 0;
	SSIZE_T dstVMultiplier = 1;

	/* An empty rectangle is a successful no-op */
	if ((nWidth == 0) || (nHeight == 0))
		return PRIMITIVES_SUCCESS;

	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
		return -1;

	if (!pDstData || !pSrcData)
		return -1;

	/* A step of 0 means: derive it from the rectangle width */
	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);

	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	if (vSrcVFlip)
	{
		srcVOffset = (nHeight - 1ll) * nSrcStep;
		srcVMultiplier = -1;
	}

	if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
		return generic_image_copy_no_overlap_dst_alpha(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset);

	return generic_image_copy_no_overlap_no_alpha(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset, flags);
}
/* ------------------------------------------------------------------------- */
void primitives_init_copy(primitives_t* prims)
{
@ -136,6 +385,7 @@ void primitives_init_copy(primitives_t* prims)
prims->copy_8u_AC4r = general_copy_8u_AC4r;
/* This is just an alias with void* parameters */
prims->copy = (__copy_t)(prims->copy_8u);
prims->copy_no_overlap = generic_image_copy_no_overlap;
}
#if defined(WITH_SSE2) || defined(WITH_NEON)
@ -153,5 +403,7 @@ void primitives_init_copy_opt(primitives_t* prims)
*/
/* This is just an alias with void* parameters */
prims->copy = (__copy_t)(prims->copy_8u);
primitives_init_copy_sse(prims);
primitives_init_copy_avx2(prims);
}
#endif

View File

@ -0,0 +1,42 @@
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Primitives copy
*
* Copyright 2024 Armin Novak <anovak@thincast.com>
* Copyright 2024 Thincast Technologies GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef FREERDP_LIB_PRIM_COPY_H
#define FREERDP_LIB_PRIM_COPY_H
#include <winpr/wtypes.h>
#include <freerdp/primitives.h>
/* Per-pixel converting copy of a rectangle (source format -> destination
 * format). The V multiplier/offset arguments control how source and
 * destination rows are addressed. */
pstatus_t generic_image_copy_no_overlap_convert(
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
/* memcpy based copy of a rectangle; takes the same arguments as the
 * converting variant plus the copy flags. */
pstatus_t generic_image_copy_no_overlap_memcpy(
BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset,
UINT32 flags);
/* Initializers of the optimized copy primitives, defined in the SSE and AVX2
 * source files. */
extern void primitives_init_copy_sse(primitives_t* prims);
extern void primitives_init_copy_avx2(primitives_t* prims);
#endif

View File

@ -0,0 +1,276 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#define TAG FREERDP_TAG("primitives.copy")
#if defined(WITH_SSE2)
#include <emmintrin.h>
#include <immintrin.h>
/**
 * AVX2 copy of 3-byte source pixels into 4-byte destination pixels that keeps
 * the fourth byte of every destination pixel.
 *
 * Eight pixels (24 source bytes) are handled per iteration. The previous
 * vector loop shuffled the destination instead of the source and blended only
 * byte 0 of every 32 bit slot, so the 3-byte pixels were never expanded; it
 * also loaded 32 bytes although only 24 belong to the eight pixels. Now the
 * source is fetched with two 16 byte loads (bytes 0..15 and 8..23, i.e.
 * exactly the 24 bytes in use), expanded to 4-byte slots with a per-lane byte
 * shuffle and blended over the current destination so that byte 3 of each
 * pixel is preserved.
 *
 * The remaining nWidth % 8 pixels are copied with scalar code.
 *
 * @return always PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
                                                     SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T srcByte = 3;
	const SSIZE_T dstByte = 4;

	/* Shuffle indices are relative to each 128 bit lane; -1 has the high bit
	 * set and produces a zero byte. The low lane holds source bytes 0..15
	 * (pixels 0..3 at 0..11), the high lane holds source bytes 8..23 (pixels
	 * 4..7 at lane positions 4..15). */
	const __m256i expand =
	    _mm256_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1, 4, 5, 6, -1, 7, 8,
	                     9, -1, 10, 11, 12, -1, 13, 14, 15, -1);
	/* Blend selector: high bit set on bytes 0..2 of every pixel (take the
	 * source), clear on byte 3 (keep the destination). */
	const __m256i colorMask = _mm256_set1_epi32(0x00FFFFFF);
	const SSIZE_T rem = nWidth % 8;
	const SSIZE_T width = nWidth - rem;

	for (SSIZE_T y = 0; y < nHeight; y++)
	{
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];

		SSIZE_T x = 0;
		for (; x < width; x += 8)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];

			/* Two overlapping 16 byte loads cover exactly source bytes 0..23 */
			const __m128i lo = _mm_loadu_si128((const __m128i*)src);
			const __m128i hi = _mm_loadu_si128((const __m128i*)(src + 8));
			const __m256i packed = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);

			const __m256i expanded = _mm256_shuffle_epi8(packed, expand);
			const __m256i current = _mm256_loadu_si256(dst);
			const __m256i merged = _mm256_blendv_epi8(current, expanded, colorMask);
			_mm256_storeu_si256(dst, merged);
		}

		for (; x < nWidth; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * AVX2 copy between 4-byte pixel formats that transfers the first three
 * bytes of every pixel and keeps the fourth destination byte.
 *
 * Eight pixels are blended per iteration; the remaining nWidth % 8 pixels
 * are copied with scalar code.
 *
 * @return always PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                      UINT32 nWidth, UINT32 nHeight,
                                                      const BYTE* WINPR_RESTRICT pSrcData,
                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                      SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
                                                      SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T pixelSize = 4;
	/* Blend selector: bytes 0..2 of each pixel come from the source, byte 3
	 * stays as it is in the destination */
	const __m256i colorMask =
	    _mm256_setr_epi8(0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00,
	                     0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00,
	                     0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0x00);
	const SSIZE_T tail = nWidth % 8;
	const SSIZE_T vectorWidth = nWidth - tail;

	for (SSIZE_T row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
		SSIZE_T col = 0;

		while (col < vectorWidth)
		{
			const __m256i* in = (const __m256i*)&srcRow[(col + nXSrc) * pixelSize];
			__m256i* out = (__m256i*)&dstRow[(col + nXDst) * pixelSize];
			const __m256i incoming = _mm256_loadu_si256(in);
			const __m256i current = _mm256_loadu_si256(out);
			_mm256_storeu_si256(out, _mm256_blendv_epi8(current, incoming, colorMask));
			col += 8;
		}

		/* scalar tail */
		while (col < nWidth)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * pixelSize];
			BYTE* out = &dstRow[(col + nXDst) * pixelSize];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
			col++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * Select the copy routine used when the destination alpha has to be kept.
 *
 * BGR24 -> BGRX32/BGRA32 and BGRX32/BGRA32 -> BGRX32/BGRA32 use the AVX2
 * loops. Any other format pair used to fail with -1, which made the same
 * call succeed or fail depending on the CPU; it is now handed to
 * generic_image_copy_no_overlap_convert, exactly like the generic
 * implementation does for pairs without a dedicated loop.
 *
 * @return the status of the selected routine
 */
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	switch (SrcFormat)
	{
		case PIXEL_FORMAT_BGR24:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_BGRX32:
				case PIXEL_FORMAT_BGRA32:
					return avx2_image_copy_bgr24_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		case PIXEL_FORMAT_BGRX32:
		case PIXEL_FORMAT_BGRA32:
			switch (DstFormat)
			{
				case PIXEL_FORMAT_BGRX32:
				case PIXEL_FORMAT_BGRA32:
					return avx2_image_copy_bgrx32_bgrx32(
					    pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,
					    nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
				default:
					break;
			}
			break;
		default:
			break;
	}

	/* No AVX2 loop for this pair: log it and use the generic per-pixel path */
	WLog_DBG(TAG, "unsupported format src %s --> dst %s", FreeRDPGetColorFormatName(SrcFormat),
	         FreeRDPGetColorFormatName(DstFormat));
	return generic_image_copy_no_overlap_convert(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
/**
 * Copy a rectangle pixel by pixel, converting every pixel from SrcFormat to
 * DstFormat with FreeRDPConvertColor.
 *
 * Each row is processed in two consecutive loops: the leading part whose
 * length is a multiple of 8 (marked with the unroll pragma) and the
 * remaining pixels. No AVX2 intrinsics are used in this function.
 *
 * @return always PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t avx2_image_copy_no_overlap_convert(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T srcPixelSize = FreeRDPGetBytesPerPixel(SrcFormat);
	const SSIZE_T dstPixelSize = FreeRDPGetBytesPerPixel(DstFormat);
	const UINT32 unrolledWidth = nWidth - nWidth % 8;

	for (SSIZE_T row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];
		SSIZE_T col = 0;

		/* leading part, length is a multiple of 8 */
		WINPR_PRAGMA_UNROLL_LOOP
		for (; col < unrolledWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcPixelSize];
			BYTE* out = &dstRow[(col + nXDst) * dstPixelSize];
			const UINT32 pixel = FreeRDPReadColor_int(in, SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(pixel, SrcFormat, DstFormat, palette);
			FreeRDPWriteColor_int(out, DstFormat, converted);
		}

		/* trailing nWidth % 8 pixels */
		for (; col < nWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcPixelSize];
			BYTE* out = &dstRow[(col + nXDst) * dstPixelSize];
			const UINT32 pixel = FreeRDPReadColor_int(in, SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(pixel, SrcFormat, DstFormat, palette);
			FreeRDPWriteColor_int(out, DstFormat, converted);
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * AVX2 implementation of the copy_no_overlap primitive.
 *
 * Validates the arguments, fills in steps given as 0 with
 * nWidth * bytes-per-pixel of the respective format and sets up the source
 * row addressing for FREERDP_FLIP_VERTICAL. Dispatches to the dst_alpha
 * helper when FREERDP_KEEP_DST_ALPHA is set and the destination format has
 * alpha, to the generic memcpy helper when the formats are equal apart from
 * alpha, and to the per-pixel converter otherwise.
 *
 * @return PRIMITIVES_SUCCESS on success (including an empty rectangle), -1 for
 *         NULL buffers or dimensions above INT32_MAX, otherwise the status of
 *         the selected helper
 */
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                            UINT32 nWidth, UINT32 nHeight,
                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	/* An empty rectangle is a successful no-op */
	if ((nWidth == 0) || (nHeight == 0))
		return PRIMITIVES_SUCCESS;

	if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
		return -1;

	if (!pDstData || !pSrcData)
		return -1;

	/* A step of 0 means: derive it from the rectangle width */
	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);

	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* A vertical flip walks the source rows backwards from the last row */
	const BOOL flipped = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
	const SSIZE_T srcVMultiplier = flipped ? -1 : 1;
	const SSIZE_T srcVOffset = flipped ? (nHeight - 1ll) * nSrcStep : 0;
	const SSIZE_T dstVMultiplier = 1;
	const SSIZE_T dstVOffset = 0;

	const BOOL keepDstAlpha =
	    ((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat);

	if (keepDstAlpha)
	{
		return avx2_image_copy_no_overlap_dst_alpha(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset);
	}

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
	{
		return generic_image_copy_no_overlap_memcpy(
		    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat,
		    nSrcStep, nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier,
		    dstVOffset, flags);
	}

	return avx2_image_copy_no_overlap_convert(
	    pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
	    nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * @brief Install the AVX2 image copy routine into a primitives table.
 *
 * When built with WITH_SSE2 and the CPU reports AVX2 support, the copy_no_overlap
 * entry of @p prims is replaced. In every other case @p prims is left as it was.
 *
 * @param prims primitives table to update
 */
void primitives_init_copy_avx2(primitives_t* prims)
{
#if !defined(WITH_SSE2)
	WINPR_UNUSED(prims);
#else
	/* Keep the previously installed implementation if AVX2 is not available at runtime */
	if (!IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
		return;

	prims->copy_no_overlap = avx2_image_copy_no_overlap;
#endif
}

View File

@ -0,0 +1,274 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Copy operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#include <winpr/sysinfo.h>
#include <freerdp/config.h>
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_copy.h"
#include "../codec/color.h"
#include <freerdp/codec/color.h>
#define TAG FREERDP_TAG("primitives.copy")
#if defined(WITH_SSE2)
#include <emmintrin.h>
#include <immintrin.h>
/**
 * @brief Copy a rectangle of 3 byte per pixel data into a 4 byte per pixel destination
 *        without touching the 4th byte of any destination pixel.
 *
 * Four pixels are processed per SIMD iteration: the 12 source bytes are expanded to
 * 16 bytes with a byte shuffle and every 4th byte is then taken from the current
 * destination content. Remaining pixels of a line are copied byte by byte.
 *
 * @param pDstData destination buffer (4 bytes per pixel)
 * @param nDstStep destination stride in bytes
 * @param nXDst    destination x offset in pixels
 * @param nYDst    destination y offset in lines
 * @param nWidth   width of the rectangle in pixels
 * @param nHeight  height of the rectangle in lines
 * @param pSrcData source buffer (3 bytes per pixel)
 * @param nSrcStep source stride in bytes
 * @param nXSrc    source x offset in pixels
 * @param nYSrc    source y offset in lines
 * @param srcVMultiplier sign applied to the source line index
 * @param srcVOffset     byte offset added to the source line address
 * @param dstVMultiplier sign applied to the destination line index
 * @param dstVOffset     byte offset added to the destination line address
 *
 * @return PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                    UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                    UINT32 nHeight,
                                                    const BYTE* WINPR_RESTRICT pSrcData,
                                                    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
                                                    SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T srcByte = 3;
	const SSIZE_T dstByte = 4;

	/* Shuffle control: spread source bytes 0..11 over four 4 byte pixels.
	 * An index with the high bit set (-1) produces a zero byte. */
	const __m128i expand =
	    _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1);

	/* Blend selector: high bit set on every 4th byte, these bytes stay as they are
	 * in the destination. */
	const __m128i keep = _mm_setr_epi8(0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1);

	/* A 16 byte load starting at pixel x touches the source pixels x .. x+5.
	 * Only vectorize positions where all of them belong to the copied line, the
	 * scalar loop below handles the rest. */
	const SSIZE_T width = (nWidth >= 6) ? ((SSIZE_T)nWidth - 5) : 0;

	for (SSIZE_T y = 0; y < nHeight; y++)
	{
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];

		SSIZE_T x = 0;
		for (; x < width; x += 4)
		{
			const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
			__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
			const __m128i s0 = _mm_loadu_si128(src);
			const __m128i s1 = _mm_loadu_si128(dst);
			/* 4 x 3 byte pixel -> 4 x 4 byte pixel */
			const __m128i s2 = _mm_shuffle_epi8(s0, expand);
			/* color bytes from the source, 4th byte from the destination */
			const __m128i d0 = _mm_blendv_epi8(s2, s1, keep);
			_mm_storeu_si128(dst, d0);
		}

		for (; x < (SSIZE_T)nWidth; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			/* Copy the 3 color bytes, skip the 4th destination byte */
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * @brief Copy a rectangle between two 4 byte per pixel images, replacing the first
 *        three bytes of every destination pixel and keeping its 4th byte.
 *
 * Blocks of four pixels are blended with SIMD, the remaining (nWidth % 4) pixels of
 * each line are copied byte wise.
 *
 * @param pDstData destination buffer (4 bytes per pixel)
 * @param nDstStep destination stride in bytes
 * @param nXDst    destination x offset in pixels
 * @param nYDst    destination y offset in lines
 * @param nWidth   width of the rectangle in pixels
 * @param nHeight  height of the rectangle in lines
 * @param pSrcData source buffer (4 bytes per pixel)
 * @param nSrcStep source stride in bytes
 * @param nXSrc    source x offset in pixels
 * @param nYSrc    source y offset in lines
 * @param srcVMultiplier sign applied to the source line index
 * @param srcVOffset     byte offset added to the source line address
 * @param dstVMultiplier sign applied to the destination line index
 * @param dstVOffset     byte offset added to the destination line address
 *
 * @return PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
                                                     SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T srcPixelSize = 4;
	const SSIZE_T dstPixelSize = 4;

	/* Blend selector: bytes with the high bit set are taken from the source,
	 * every 4th byte is kept from the destination. */
	const __m128i takeSrc =
	    _mm_setr_epi8(-1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0, -1, -1, -1, 0);

	/* Number of pixels per line that can be handled in blocks of four */
	const SSIZE_T simdWidth = nWidth - (nWidth % 4);

	for (SSIZE_T row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

		SSIZE_T col = 0;

		/* Four pixels at once */
		for (; col < simdWidth; col += 4)
		{
			const __m128i* in = (const __m128i*)&srcRow[(col + nXSrc) * srcPixelSize];
			__m128i* out = (__m128i*)&dstRow[(col + nXDst) * dstPixelSize];
			const __m128i srcPixels = _mm_loadu_si128(in);
			const __m128i dstPixels = _mm_loadu_si128(out);
			_mm_storeu_si128(out, _mm_blendv_epi8(dstPixels, srcPixels, takeSrc));
		}

		/* Leftover pixels: copy three bytes, leave the 4th destination byte alone */
		for (; col < nWidth; col++)
		{
			const BYTE* in = &srcRow[(col + nXSrc) * srcPixelSize];
			BYTE* out = &dstRow[(col + nXDst) * dstPixelSize];
			out[0] = in[0];
			out[1] = in[1];
			out[2] = in[2];
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * @brief Select the SSE worker for a copy that must keep the destination's 4th byte.
 *
 * Only BGRX32/BGRA32 destinations are handled, with BGR24, BGRX32 or BGRA32 as
 * source. For any other combination a debug message is logged and -1 is returned.
 * The @p palette argument is not used by this function.
 *
 * NOTE(review): an unsupported format pair is reported as a failure here instead of
 * being delegated to a non SIMD implementation - confirm that callers expect this.
 *
 * @return result of the selected worker, -1 if the format pair is not supported
 */
static pstatus_t sse_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	WINPR_ASSERT(pDstData);
	WINPR_ASSERT(pSrcData);

	/* Every supported combination writes to a 32 bit BGRX/BGRA destination */
	if ((DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32))
	{
		if (SrcFormat == PIXEL_FORMAT_BGR24)
		{
			return sse_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
			                                   pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
			                                   srcVOffset, dstVMultiplier, dstVOffset);
		}

		if ((SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32))
		{
			return sse_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
			                                    pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
			                                    srcVOffset, dstVMultiplier, dstVOffset);
		}
	}

	WLog_DBG(TAG, "unsupported format src %s --> dst %s", FreeRDPGetColorFormatName(SrcFormat),
	         FreeRDPGetColorFormatName(DstFormat));
	return -1;
}
/**
 * @brief Copy a rectangle while converting every pixel from @p SrcFormat to @p DstFormat.
 *
 * Each pixel is read, converted with FreeRDPConvertColor (using @p palette) and written
 * to the destination. The line is walked in two stages: the largest multiple of eight
 * pixels in a loop annotated for unrolling, then the remaining pixels.
 *
 * @param srcVMultiplier sign applied to the source line index
 * @param srcVOffset     byte offset added to the source line address
 * @param dstVMultiplier sign applied to the destination line index
 * @param dstVOffset     byte offset added to the destination line address
 *
 * @return PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t sse_image_copy_no_overlap_convert(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
	const SSIZE_T srcPixelSize = FreeRDPGetBytesPerPixel(SrcFormat);
	const SSIZE_T dstPixelSize = FreeRDPGetBytesPerPixel(DstFormat);

	/* nWidth rounded down to a multiple of eight */
	const UINT32 unrolledWidth = nWidth - nWidth % 8;

	for (SSIZE_T row = 0; row < nHeight; row++)
	{
		const BYTE* WINPR_RESTRICT srcRow =
		    &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstRow =
		    &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

		SSIZE_T col = 0;

		/* Bulk of the line */
		WINPR_PRAGMA_UNROLL_LOOP
		for (; col < unrolledWidth; col++)
		{
			const UINT32 srcColor =
			    FreeRDPReadColor(&srcRow[(col + nXSrc) * srcPixelSize], SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(srcColor, SrcFormat, DstFormat, palette);
			FreeRDPWriteColor_int(&dstRow[(col + nXDst) * dstPixelSize], DstFormat, converted);
		}

		/* Up to seven trailing pixels */
		for (; col < nWidth; col++)
		{
			const UINT32 srcColor =
			    FreeRDPReadColor(&srcRow[(col + nXSrc) * srcPixelSize], SrcFormat);
			const UINT32 converted = FreeRDPConvertColor(srcColor, SrcFormat, DstFormat, palette);
			FreeRDPWriteColor_int(&dstRow[(col + nXDst) * dstPixelSize], DstFormat, converted);
		}
	}

	return PRIMITIVES_SUCCESS;
}
/**
 * @brief SSE entry point for copying a rectangle between two non overlapping images.
 *
 * Validates the arguments, derives default strides and the vertical flip parameters
 * and then forwards to one of three workers:
 *  - the keep-destination-alpha worker, if FREERDP_KEEP_DST_ALPHA is set in @p flags and
 *    FreeRDPColorHasAlpha reports true for @p DstFormat
 *  - the plain memcpy worker, if both formats are equal when ignoring alpha
 *  - the per pixel conversion worker otherwise
 *
 * @param pDstData destination buffer, must not be NULL
 * @param DstFormat pixel format of the destination
 * @param nDstStep destination stride in bytes, 0 selects nWidth * bytes per pixel
 * @param nXDst    destination x offset in pixels
 * @param nYDst    destination y offset in lines
 * @param nWidth   width of the rectangle in pixels
 * @param nHeight  height of the rectangle in lines
 * @param pSrcData source buffer, must not be NULL
 * @param SrcFormat pixel format of the source
 * @param nSrcStep source stride in bytes, 0 selects nWidth * bytes per pixel
 * @param nXSrc    source x offset in pixels
 * @param nYSrc    source y offset in lines
 * @param palette  palette handed through to the workers
 * @param flags    FREERDP_FLIP_VERTICAL and FREERDP_KEEP_DST_ALPHA are evaluated here
 *
 * @return PRIMITIVES_SUCCESS for an empty rectangle, -1 for oversized dimensions or
 *         NULL buffers, otherwise the result of the selected worker
 */
static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
                                           UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                           UINT32 nWidth, UINT32 nHeight,
                                           const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
                                           UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                           const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
{
	/* Row addressing: line = multiplier * (y + nY) * step + offset */
	SSIZE_T srcRowOffset = 0;
	SSIZE_T srcRowSign = 1;
	const SSIZE_T dstRowOffset = 0;
	const SSIZE_T dstRowSign = 1;

	/* An empty rectangle is trivially copied */
	if ((nHeight == 0) || (nWidth == 0))
		return PRIMITIVES_SUCCESS;

	/* The workers iterate with signed counters */
	if ((nWidth > INT32_MAX) || (nHeight > INT32_MAX))
		return -1;

	if (!pSrcData || !pDstData)
		return -1;

	/* Tightly packed lines if the caller did not supply a stride */
	if (nDstStep == 0)
		nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);

	if (nSrcStep == 0)
		nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);

	/* Vertical flip: start at the last source line and walk backwards */
	if ((flags & FREERDP_FLIP_VERTICAL) != 0)
	{
		srcRowSign = -1;
		srcRowOffset = (nHeight - 1ll) * nSrcStep;
	}

	if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
	{
		return sse_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                           nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                           nXSrc, nYSrc, palette, srcRowSign, srcRowOffset,
		                                           dstRowSign, dstRowOffset);
	}

	if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
	{
		return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
		                                            nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
		                                            nXSrc, nYSrc, palette, srcRowSign,
		                                            srcRowOffset, dstRowSign, dstRowOffset, flags);
	}

	return sse_image_copy_no_overlap_convert(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth,
	                                         nHeight, pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc,
	                                         palette, srcRowSign, srcRowOffset, dstRowSign,
	                                         dstRowOffset);
}
#endif
/* ------------------------------------------------------------------------- */
/**
 * @brief Install the SSE image copy routine into a primitives table.
 *
 * When built with WITH_SSE2 and the CPU reports SSE4.1 support, the copy_no_overlap
 * entry of @p prims is replaced. In every other case @p prims is left as it was.
 *
 * @param prims primitives table to update
 */
void primitives_init_copy_sse(primitives_t* prims)
{
#if !defined(WITH_SSE2)
	WINPR_UNUSED(prims);
#else
	/* Keep the previously installed implementation if SSE4.1 is not available at runtime */
	if (!IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
		return;

	prims->copy_no_overlap = sse_image_copy_no_overlap;
#endif
}