[primitives,sse] skip optimized copy when the source step cannot meet the alignment requirements

This commit is contained in:
akallabeth 2024-10-10 10:51:26 +02:00
parent 62e8270db1
commit f5171b6b47
No known key found for this signature in database
GPG Key ID: A49454A3FC909FD5
2 changed files with 31 additions and 16 deletions

View File

@ -55,6 +55,9 @@ static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDa
const __m256i mask = _mm256_set_epi32(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
const SSIZE_T rem = nWidth % 8;
const SSIZE_T width = nWidth - rem;
const size_t align = nSrcStep % 32;
const BOOL fast = (align == 0) ? TRUE : (align >= 8 - MIN(8, rem) ? TRUE : FALSE);
for (SSIZE_T y = 0; y < nHeight; y++)
{
const BYTE* WINPR_RESTRICT srcLine =
@ -63,15 +66,20 @@ static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDa
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
SSIZE_T x = 0;
for (; x < width; x += 8)
/* Ensure alignment requirements can be met */
if (fast)
{
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
const __m256i s0 = _mm256_loadu_si256(src);
const __m256i s1 = _mm256_loadu_si256(dst);
const __m256i s2 = _mm256_shuffle_epi8(s1, mask);
__m256i d0 = _mm256_blendv_epi8(s2, s0, mask);
_mm256_storeu_si256(dst, d0);
for (; x < width; x += 8)
{
const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
const __m256i s0 = _mm256_loadu_si256(src);
const __m256i s1 = _mm256_loadu_si256(dst);
const __m256i s2 = _mm256_shuffle_epi8(s1, mask);
__m256i d0 = _mm256_blendv_epi8(s2, s0, mask);
_mm256_storeu_si256(dst, d0);
}
}
for (; x < nWidth; x++)
{

View File

@ -54,6 +54,9 @@ static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDat
const __m128i mask = _mm_set_epi32(0xFF, 0xFF, 0xFF, 0xFF);
const SSIZE_T rem = nWidth % 4;
const size_t align = nSrcStep % 16;
const BOOL fast = (align == 0) ? TRUE : (align >= 4 - MIN(4, rem) ? TRUE : FALSE);
const SSIZE_T width = nWidth - rem;
for (SSIZE_T y = 0; y < nHeight; y++)
{
@ -63,15 +66,19 @@ static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDat
&pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];
SSIZE_T x = 0;
for (; x < width; x += 4)
/* Ensure alignment requirements can be met */
if (fast)
{
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
const __m128i s0 = _mm_loadu_si128(src);
const __m128i s1 = _mm_loadu_si128(dst);
const __m128i s2 = _mm_shuffle_epi8(s1, mask);
__m128i d0 = _mm_blendv_epi8(s2, s0, mask);
_mm_storeu_si128(dst, d0);
for (; x < width; x += 4)
{
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
const __m128i s0 = _mm_loadu_si128(src);
const __m128i s1 = _mm_loadu_si128(dst);
const __m128i s2 = _mm_shuffle_epi8(s1, mask);
__m128i d0 = _mm_blendv_epi8(s2, s0, mask);
_mm_storeu_si128(dst, d0);
}
}
for (; x < nWidth; x++)
{