diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt
index 8f56fe936..7526f7ace 100644
--- a/libfreerdp/primitives/CMakeLists.txt
+++ b/libfreerdp/primitives/CMakeLists.txt
@@ -2,63 +2,86 @@
 
 set(PRIMITIVES_SRCS
 	prim_add.c
+	prim_add.h
 	prim_andor.c
+	prim_andor.h
 	prim_alphaComp.c
+	prim_alphaComp.h
 	prim_colors.c
+	prim_colors.h
 	prim_copy.c
 	prim_copy.h
 	prim_set.c
+	prim_set.h
 	prim_shift.c
+	prim_shift.h
 	prim_sign.c
+	prim_sign.h
 	prim_YUV.c
+	prim_YUV.h
 	prim_YCoCg.c
+	prim_YCoCg.h
 	primitives.c
 	prim_internal.h)
 
-if (WITH_SSE2 OR WITH_NEON)
-	set(PRIMITIVES_SSE2_SRCS
-		prim_colors_opt.c
-		prim_copy_sse.c
-		prim_copy_avx2.c
-		prim_set_opt.c)
+set(PRIMITIVES_SSE2_SRCS
+	sse/prim_colors_sse2.c
+	sse/prim_set_sse2.c
+	)
 
-	set(PRIMITIVES_SSE3_SRCS
-		prim_add_opt.c
-		prim_alphaComp_opt.c
-		prim_andor_opt.c
-		prim_shift_opt.c)
+set(PRIMITIVES_SSE3_SRCS
+	sse/prim_add_sse3.c
+	sse/prim_alphaComp_sse3.c
+	sse/prim_andor_sse3.c
+	sse/prim_shift_sse3.c
+	)
 
-	set(PRIMITIVES_SSSE3_SRCS
-		prim_sign_opt.c
-		prim_YCoCg_opt.c)
+set(PRIMITIVES_SSSE3_SRCS
+	sse/prim_YUV_ssse3.c
+	sse/prim_sign_ssse3.c
+	sse/prim_YCoCg_ssse3.c
+	)
 
-	if (WITH_SSE2)
-		set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS}
-			prim_YUV_ssse3.c)
-	endif()
+set(PRIMITIVES_SSE4_1_SRCS
+	sse/prim_copy_sse4_1.c
+	)
 
-	if (WITH_NEON)
-		set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS}
-			prim_YUV_neon.c)
-	endif()
-endif()
+set(PRIMITIVES_SSE4_2_SRCS
+	)
+
+set(PRIMITIVES_AVX2_SRCS
+	sse/prim_copy_avx2.c
+	)
+
+set(PRIMITIVES_NEON_SRCS
+	neon/prim_colors_neon.c
+	neon/prim_YCoCg_neon.c
+	neon/prim_YUV_neon.c
+	)
+
+set(PRIMITIVES_OPENCL_SRCS
+	opencl/prim_YUV_opencl.c
+)
 
 if (WITH_OPENCL)
-	set(PRIMITIVES_OPENCL_SRCS prim_YUV_opencl.c)
-
 		freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
 		freerdp_library_add(OpenCL::OpenCL)
 
 endif()
 
 set(PRIMITIVES_OPT_SRCS
+	${PRIMITIVES_NEON_SRCS}
 	${PRIMITIVES_SSE2_SRCS}
 	${PRIMITIVES_SSE3_SRCS}
 	${PRIMITIVES_SSSE3_SRCS}
+	${PRIMITIVES_SSE4_1_SRCS}
+	${PRIMITIVES_SSE4_2_SRCS}
+	${PRIMITIVES_AVX2_SRCS}
 	${PRIMITIVES_OPENCL_SRCS})
 
 set(PRIMITIVES_SRCS ${PRIMITIVES_SRCS} ${PRIMITIVES_OPT_SRCS})
 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_library(freerdp-primitives OBJECT
 	${PRIMITIVES_SRCS}
 )
@@ -74,8 +97,15 @@ if(WITH_SSE2)
 		if (PRIMITIVES_SSSE3_SRCS)
 			set_source_files_properties(${PRIMITIVES_SSSE3_SRCS} PROPERTIES COMPILE_FLAGS "-mssse3" )
 		endif()
-		set_source_files_properties(prim_copy_sse.c PROPERTIES COMPILE_FLAGS "-msse4.1" )
-		set_source_files_properties(prim_copy_avx2.c PROPERTIES COMPILE_FLAGS "-mavx2" )
+		if (PRIMITIVES_SSE4_1_SRCS)
+			set_source_files_properties(${PRIMITIVES_SSE4_1_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.1" )
+		endif()
+		if (PRIMITIVES_SSE4_2_SRCS)
+			set_source_files_properties(${PRIMITIVES_SSE4_2_SRCS} PROPERTIES COMPILE_FLAGS "-msse4.2" )
+		endif()
+		if (PRIMITIVES_AVX2_SRCS)
+			set_source_files_properties(${PRIMITIVES_AVX2_SRCS} PROPERTIES COMPILE_FLAGS "-mavx2" )
+		endif()
 	endif()
 
 	if(MSVC)
diff --git a/libfreerdp/primitives/neon/prim_YCoCg_neon.c b/libfreerdp/primitives/neon/prim_YCoCg_neon.c
new file mode 100644
index 000000000..ff1ff002d
--- /dev/null
+++ b/libfreerdp/primitives/neon/prim_YCoCg_neon.c
@@ -0,0 +1,173 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized YCoCg<->RGB conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#if defined(WITH_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_YCoCg.h"
+
+#if defined(WITH_NEON)
+static primitives_t* generic = NULL;
+
+static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                      BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+                                      UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
+                                      BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
+{
+	BYTE* dptr = pDst;
+	const BYTE* sptr = pSrc;
+	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
+	const int8_t cll = shift - 1; /* -1 builds in the /2's */
+	const UINT32 srcPad = srcStep - (width * 4);
+	const UINT32 dstPad = dstStep - (width * formatSize);
+	const UINT32 pad = width % 8;
+	const uint8x8_t aVal = vdup_n_u8(0xFF);
+	const int8x8_t cllv = vdup_n_s8(cll);
+
+	for (UINT32 y = 0; y < height; y++)
+	{
+		for (UINT32 x = 0; x < width - pad; x += 8)
+		{
+			/* Note: shifts must be done before sign-conversion. */
+			const uint8x8x4_t raw = vld4_u8(sptr);
+			const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
+			const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
+			const int16x8_t Cg = vmovl_s8(CgRaw);
+			const int16x8_t Co = vmovl_s8(CoRaw);
+			const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
+			const int16x8_t T = vsubq_s16(Y, Cg);
+			const int16x8_t R = vaddq_s16(T, Co);
+			const int16x8_t G = vaddq_s16(Y, Cg);
+			const int16x8_t B = vsubq_s16(T, Co);
+			uint8x8x4_t bgrx;
+			bgrx.val[bPos] = vqmovun_s16(B);
+			bgrx.val[gPos] = vqmovun_s16(G);
+			bgrx.val[rPos] = vqmovun_s16(R);
+
+			if (alpha)
+				bgrx.val[aPos] = raw.val[3];
+			else
+				bgrx.val[aPos] = aVal;
+
+			vst4_u8(dptr, bgrx);
+			sptr += sizeof(raw);
+			dptr += sizeof(bgrx);
+		}
+
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			/* Note: shifts must be done before sign-conversion. */
+			const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
+			const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
+			const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
+			const INT16 T = Y - Cg;
+			const INT16 R = T + Co;
+			const INT16 G = Y + Cg;
+			const INT16 B = T - Co;
+			BYTE bgra[4];
+			bgra[bPos] = CLIP(B);
+			bgra[gPos] = CLIP(G);
+			bgra[rPos] = CLIP(R);
+			bgra[aPos] = *sptr++;
+
+			if (!alpha)
+				bgra[aPos] = 0xFF;
+
+			*dptr++ = bgra[0];
+			*dptr++ = bgra[1];
+			*dptr++ = bgra[2];
+			*dptr++ = bgra[3];
+		}
+
+		sptr += srcPad;
+		dptr += dstPad;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
+                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
+                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
+{
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRA32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 2, 1, 0, 3, withAlpha);
+
+		case PIXEL_FORMAT_BGRX32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 2, 1, 0, 3, withAlpha);
+
+		case PIXEL_FORMAT_RGBA32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 0, 1, 2, 3, withAlpha);
+
+		case PIXEL_FORMAT_RGBX32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 0, 1, 2, 3, withAlpha);
+
+		case PIXEL_FORMAT_ARGB32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 1, 2, 3, 0, withAlpha);
+
+		case PIXEL_FORMAT_XRGB32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 1, 2, 3, 0, withAlpha);
+
+		case PIXEL_FORMAT_ABGR32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 3, 2, 1, 0, withAlpha);
+
+		case PIXEL_FORMAT_XBGR32:
+			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+			                            shift, 3, 2, 1, 0, withAlpha);
+
+		default:
+			return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
+			                                   height, shift, withAlpha);
+	}
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
+{
+#if defined(WITH_NEON)
+	generic = primitives_get_generic();
+	primitives_init_YCoCg(prims);
+
+	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+	{
+		prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
+	}
+#else
+	WINPR_UNUSED(prims);
+#endif
+}
diff --git a/libfreerdp/primitives/prim_YUV_neon.c b/libfreerdp/primitives/neon/prim_YUV_neon.c
similarity index 99%
rename from libfreerdp/primitives/prim_YUV_neon.c
rename to libfreerdp/primitives/neon/prim_YUV_neon.c
index 107ced2b2..fd1cafac4 100644
--- a/libfreerdp/primitives/prim_YUV_neon.c
+++ b/libfreerdp/primitives/neon/prim_YUV_neon.c
@@ -28,11 +28,9 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_YUV.h"
 
-#if !defined(WITH_NEON)
-#error "This file must only be included if WITH_NEON is active!"
-#endif
-
+#if defined(WITH_NEON)
 #include <arm_neon.h>
 
 static primitives_t* generic = NULL;
@@ -742,9 +740,11 @@ static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,
 			return -1;
 	}
 }
+#endif
 
-void primitives_init_YUV_opt(primitives_t* prims)
+void primitives_init_YUV_neon(primitives_t* prims)
 {
+#if defined(WITH_NEON)
 	generic = primitives_get_generic();
 	primitives_init_YUV(prims);
 
@@ -754,4 +754,7 @@ void primitives_init_YUV_opt(primitives_t* prims)
 		prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;
 		prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;
 	}
+#else
+	WINPR_UNUSED(prims);
+#endif
 }
diff --git a/libfreerdp/primitives/neon/prim_colors_neon.c b/libfreerdp/primitives/neon/prim_colors_neon.c
new file mode 100644
index 000000000..cf61c4055
--- /dev/null
+++ b/libfreerdp/primitives/neon/prim_colors_neon.c
@@ -0,0 +1,365 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
+ * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <freerdp/config.h>
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#if defined(WITH_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_colors.h"
+
+/*---------------------------------------------------------------------------*/
+#ifdef WITH_NEON
+static primitives_t* generic = NULL;
+
+static pstatus_t
+neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
+                            INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
+                            const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+	/* TODO: If necessary, check alignments and call the general version. */
+	int16x8_t zero = vdupq_n_s16(0);
+	int16x8_t max = vdupq_n_s16(255);
+	int16x8_t r_cr = vdupq_n_s16(22986);  //  1.403 << 14
+	int16x8_t g_cb = vdupq_n_s16(-5636);  // -0.344 << 14
+	int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
+	int16x8_t b_cb = vdupq_n_s16(28999);  //  1.770 << 14
+	int16x8_t c4096 = vdupq_n_s16(4096);
+	int16x8_t* y_buf = (int16x8_t*)pSrc[0];
+	int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
+	int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
+	int16x8_t* r_buf = (int16x8_t*)pDst[0];
+	int16x8_t* g_buf = (int16x8_t*)pDst[1];
+	int16x8_t* b_buf = (int16x8_t*)pDst[2];
+	int srcbump = srcStep / sizeof(int16x8_t);
+	int dstbump = dstStep / sizeof(int16x8_t);
+	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
+
+	for (int yp = 0; yp < roi->height; ++yp)
+	{
+		for (int i = 0; i < imax; i++)
+		{
+			/*
+			    In order to use NEON signed 16-bit integer multiplication we need to convert
+			    the floating point factors to signed int without loosing information.
+			    The result of this multiplication is 32 bit and we have a NEON instruction
+			    that returns the hi word of the saturated double.
+			    Thus we will multiply the factors by the highest possible 2^n, take the
+			    upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
+			    shift by 1 to reverse the doubling) and correct	this result by multiplying it
+			    by 2^(16-n).
+			    For the given factors in the conversion matrix the best possible n is 14.
+
+			    Example for calculating r:
+			    r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
+			    r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
+			    r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
+			    r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+			*/
+			/* y = (y_buf[i] + 4096) >> 2 */
+			int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
+			y = vaddq_s16(y, c4096);
+			y = vshrq_n_s16(y, 2);
+			/* cb = cb_buf[i]; */
+			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
+			/* cr = cr_buf[i]; */
+			int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
+			/* (y + HIWORD(cr*22986)) >> 3 */
+			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
+			r = vshrq_n_s16(r, 3);
+			/* r_buf[i] = CLIP(r); */
+			r = vminq_s16(vmaxq_s16(r, zero), max);
+			vst1q_s16((INT16*)&r_buf[i], r);
+			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
+			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
+			g = vshrq_n_s16(g, 3);
+			/* g_buf[i] = CLIP(g); */
+			g = vminq_s16(vmaxq_s16(g, zero), max);
+			vst1q_s16((INT16*)&g_buf[i], g);
+			/* (y + HIWORD(cb*28999)) >> 3 */
+			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
+			b = vshrq_n_s16(b, 3);
+			/* b_buf[i] = CLIP(b); */
+			b = vminq_s16(vmaxq_s16(b, zero), max);
+			vst1q_s16((INT16*)&b_buf[i], b);
+		}
+
+		y_buf += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+		r_buf += dstbump;
+		g_buf += dstbump;
+		b_buf += dstbump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3],
+                                                UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+                                                UINT32 dstStep,
+                                                const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
+                                                uint8_t gPos, uint8_t bPos, uint8_t aPos)
+{
+	BYTE* pRGB = pDst;
+	const INT16* pY = pSrc[0];
+	const INT16* pCb = pSrc[1];
+	const INT16* pCr = pSrc[2];
+	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
+	const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
+	const size_t pad = roi->width % 8;
+	const int16x4_t c4096 = vdup_n_s16(4096);
+
+	for (UINT32 y = 0; y < roi->height; y++)
+	{
+		for (UINT32 x = 0; x < roi->width - pad; x += 8)
+		{
+			const int16x8_t Y = vld1q_s16(pY);
+			const int16x4_t Yh = vget_high_s16(Y);
+			const int16x4_t Yl = vget_low_s16(Y);
+			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
+			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
+			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
+			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
+			const int16x8_t Cr = vld1q_s16(pCr);
+			const int16x4_t Crh = vget_high_s16(Cr);
+			const int16x4_t Crl = vget_low_s16(Cr);
+			const int16x8_t Cb = vld1q_s16(pCb);
+			const int16x4_t Cbh = vget_high_s16(Cb);
+			const int16x4_t Cbl = vget_low_s16(Cb);
+			uint8x8x4_t bgrx;
+			{
+				/* R */
+				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
+				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
+				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
+				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
+				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
+				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
+				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
+				bgrx.val[rPos] = vqmovun_s16(Rs);
+			}
+			{
+				/* G */
+				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);            /* 0.343730 * 2^16 */
+				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);            /* 0.343730 * 2^16 */
+				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
+				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
+				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
+				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
+				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
+				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
+				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
+				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
+				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
+				const uint8x8_t G = vqmovun_s16(Gs);
+				bgrx.val[gPos] = G;
+			}
+			{
+				/* B */
+				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
+				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
+				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
+				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
+				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
+				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
+				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
+				const uint8x8_t B = vqmovun_s16(Bs);
+				bgrx.val[bPos] = B;
+			}
+			/* A */
+			{
+				bgrx.val[aPos] = vdup_n_u8(0xFF);
+			}
+			vst4_u8(pRGB, bgrx);
+			pY += 8;
+			pCb += 8;
+			pCr += 8;
+			pRGB += 32;
+		}
+
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			const INT32 divisor = 16;
+			const INT32 Y = ((*pY++) + 4096) << divisor;
+			const INT32 Cb = (*pCb++);
+			const INT32 Cr = (*pCr++);
+			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
+			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
+			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
+			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
+			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
+			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
+			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
+			BYTE bgrx[4];
+			bgrx[bPos] = CLIP(B);
+			bgrx[gPos] = CLIP(G);
+			bgrx[rPos] = CLIP(R);
+			bgrx[aPos] = 0xFF;
+			*pRGB++ = bgrx[0];
+			*pRGB++ = bgrx[1];
+			*pRGB++ = bgrx[2];
+			*pRGB++ = bgrx[3];
+		}
+
+		pY += srcPad;
+		pCb += srcPad;
+		pCr += srcPad;
+		pRGB += dstPad;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
+                                              UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
+                                              UINT32 dstStep, UINT32 DstFormat,
+                                              const prim_size_t* WINPR_RESTRICT roi)
+{
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRA32:
+		case PIXEL_FORMAT_BGRX32:
+			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+		case PIXEL_FORMAT_RGBA32:
+		case PIXEL_FORMAT_RGBX32:
+			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+		case PIXEL_FORMAT_ARGB32:
+		case PIXEL_FORMAT_XRGB32:
+			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+		case PIXEL_FORMAT_ABGR32:
+		case PIXEL_FORMAT_XBGR32:
+			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+		default:
+			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+	}
+}
+
+static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
+    const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+    UINT32 srcStep,                            /* bytes between rows in source data */
+    BYTE* WINPR_RESTRICT pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
+    UINT32 dstStep,                            /* bytes between rows in dest data */
+    const prim_size_t* WINPR_RESTRICT roi,     /* region of interest */
+    uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
+{
+	UINT32 pad = roi->width % 8;
+
+	for (UINT32 y = 0; y < roi->height; y++)
+	{
+		const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
+		const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
+		const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
+		BYTE* dst = pDst + y * dstStep;
+
+		for (UINT32 x = 0; x < roi->width - pad; x += 8)
+		{
+			int16x8_t r = vld1q_s16(pr);
+			int16x8_t g = vld1q_s16(pg);
+			int16x8_t b = vld1q_s16(pb);
+			uint8x8x4_t bgrx;
+			bgrx.val[aPos] = vdup_n_u8(0xFF);
+			bgrx.val[rPos] = vqmovun_s16(r);
+			bgrx.val[gPos] = vqmovun_s16(g);
+			bgrx.val[bPos] = vqmovun_s16(b);
+			vst4_u8(dst, bgrx);
+			pr += 8;
+			pg += 8;
+			pb += 8;
+			dst += 32;
+		}
+
+		for (UINT32 x = 0; x < pad; x++)
+		{
+			BYTE bgrx[4];
+			bgrx[bPos] = *pb++;
+			bgrx[gPos] = *pg++;
+			bgrx[rPos] = *pr++;
+			bgrx[aPos] = 0xFF;
+			*dst++ = bgrx[0];
+			*dst++ = bgrx[1];
+			*dst++ = bgrx[2];
+			*dst++ = bgrx[3];
+		}
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+static pstatus_t
+neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
+                           UINT32 srcStep,            /* bytes between rows in source data */
+                           BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
+                           UINT32 dstStep,            /* bytes between rows in dest data */
+                           UINT32 DstFormat,
+                           const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
+{
+	switch (DstFormat)
+	{
+		case PIXEL_FORMAT_BGRA32:
+		case PIXEL_FORMAT_BGRX32:
+			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
+
+		case PIXEL_FORMAT_RGBA32:
+		case PIXEL_FORMAT_RGBX32:
+			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
+
+		case PIXEL_FORMAT_ARGB32:
+		case PIXEL_FORMAT_XRGB32:
+			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
+
+		case PIXEL_FORMAT_ABGR32:
+		case PIXEL_FORMAT_XBGR32:
+			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
+
+		default:
+			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
+	}
+}
+#endif /* WITH_NEON */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_colors_neon(primitives_t* prims)
+{
+#if defined(WITH_NEON)
+	generic = primitives_get_generic();
+	primitives_init_colors(prims);
+
+	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+	{
+		prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
+		prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
+		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
+	}
+#else
+	WINPR_UNUSED(prims);
+#endif
+}
diff --git a/libfreerdp/primitives/prim_YUV_opencl.c b/libfreerdp/primitives/opencl/prim_YUV_opencl.c
similarity index 99%
rename from libfreerdp/primitives/prim_YUV_opencl.c
rename to libfreerdp/primitives/opencl/prim_YUV_opencl.c
index 2ca1b31d8..304d44d96 100644
--- a/libfreerdp/primitives/prim_YUV_opencl.c
+++ b/libfreerdp/primitives/opencl/prim_YUV_opencl.c
@@ -30,7 +30,6 @@
 #else
 #include <CL/cl.h>
 #endif
-#endif
 
 #include <freerdp/log.h>
 #define TAG FREERDP_TAG("primitives")
@@ -481,9 +480,11 @@ static pstatus_t opencl_YUV444ToRGB_8u_P3AC4R(const BYTE* const WINPR_RESTRICT p
 
 	return opencl_YUVToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi);
 }
+#endif
 
 BOOL primitives_init_opencl(primitives_t* prims)
 {
+#if defined(WITH_OPENCL)
 	primitives_t* p = primitives_get_by_type(PRIMITIVES_ONLY_CPU);
 	if (!prims || !p)
 		return FALSE;
@@ -496,5 +497,6 @@ BOOL primitives_init_opencl(primitives_t* prims)
 	prims->YUV444ToRGB_8u_P3AC4R = opencl_YUV444ToRGB_8u_P3AC4R;
 	prims->flags |= PRIM_FLAGS_HAVE_EXTGPU;
 	prims->uninit = primitives_uninit_opencl;
+#endif
 	return TRUE;
 }
diff --git a/libfreerdp/primitives/primitives.cl b/libfreerdp/primitives/opencl/primitives.cl
similarity index 100%
rename from libfreerdp/primitives/primitives.cl
rename to libfreerdp/primitives/opencl/primitives.cl
diff --git a/libfreerdp/primitives/prim_YCoCg.c b/libfreerdp/primitives/prim_YCoCg.c
index 7c1a429f9..4a36f1e9e 100644
--- a/libfreerdp/primitives/prim_YCoCg.c
+++ b/libfreerdp/primitives/prim_YCoCg.c
@@ -23,6 +23,7 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_YCoCg.h"
 
 /* helper function to convert raw 8 bit values to signed 16bit values.
  */
@@ -71,3 +72,9 @@ void primitives_init_YCoCg(primitives_t* prims)
 {
 	prims->YCoCgToRGB_8u_AC4R = general_YCoCgToRGB_8u_AC4R;
 }
+
+void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_YCoCg_ssse3(prims);
+	primitives_init_YCoCg_neon(prims);
+}
diff --git a/libfreerdp/primitives/prim_YCoCg.h b/libfreerdp/primitives/prim_YCoCg.h
new file mode 100644
index 000000000..bd878d5b6
--- /dev/null
+++ b/libfreerdp/primitives/prim_YCoCg.h
@@ -0,0 +1,31 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_YCoCg_H
+#define FREERDP_LIB_PRIM_YCoCg_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims);
+void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c
index ec021399a..e0d6d5812 100644
--- a/libfreerdp/primitives/prim_YUV.c
+++ b/libfreerdp/primitives/prim_YUV.c
@@ -29,6 +29,7 @@
 #include <freerdp/primitives.h>
 #include <freerdp/codec/color.h>
 #include "prim_internal.h"
+#include "prim_YUV.h"
 
 static pstatus_t general_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],
                                       const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],
@@ -1875,3 +1876,9 @@ void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims)
 	prims->RGBToAVC444YUV = general_RGBToAVC444YUV;
 	prims->RGBToAVC444YUVv2 = general_RGBToAVC444YUVv2;
 }
+
+void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_YUV_ssse3(prims);
+	primitives_init_YUV_neon(prims);
+}
diff --git a/libfreerdp/primitives/prim_YUV.h b/libfreerdp/primitives/prim_YUV.h
new file mode 100644
index 000000000..0f2f12a19
--- /dev/null
+++ b/libfreerdp/primitives/prim_YUV.h
@@ -0,0 +1,31 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_YUV_H
+#define FREERDP_LIB_PRIM_YUV_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_YUV_ssse3(primitives_t* prims);
+void primitives_init_YUV_neon(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c
index 6a9a9994d..768419c8f 100644
--- a/libfreerdp/primitives/prim_add.c
+++ b/libfreerdp/primitives/prim_add.c
@@ -22,6 +22,7 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_add.h"
 
 /* ----------------------------------------------------------------------------
  * 16-bit signed add with saturation (under and over).
@@ -74,3 +75,8 @@ void primitives_init_add(primitives_t* prims)
 	prims->add_16s = general_add_16s;
 	prims->add_16s_inplace = general_add_16s_inplace;
 }
+
+void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_add_sse3(prims);
+}
diff --git a/libfreerdp/primitives/prim_add.h b/libfreerdp/primitives/prim_add.h
new file mode 100644
index 000000000..1c151f0f1
--- /dev/null
+++ b/libfreerdp/primitives/prim_add.h
@@ -0,0 +1,30 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_ADD_H
+#define FREERDP_LIB_PRIM_ADD_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_add_sse3(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c
index ae4917d88..5657d70f2 100644
--- a/libfreerdp/primitives/prim_alphaComp.c
+++ b/libfreerdp/primitives/prim_alphaComp.c
@@ -25,6 +25,7 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_alphaComp.h"
 
 #define ALPHA(_k_) (((_k_)&0xFF000000U) >> 24)
 #define RED(_k_) (((_k_)&0x00FF0000U) >> 16)
@@ -91,3 +92,8 @@ void primitives_init_alphaComp(primitives_t* prims)
 {
 	prims->alphaComp_argb = general_alphaComp_argb;
 }
+
+void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_alphaComp_sse3(prims);
+}
diff --git a/libfreerdp/primitives/prim_alphaComp.h b/libfreerdp/primitives/prim_alphaComp.h
new file mode 100644
index 000000000..efd6a58f0
--- /dev/null
+++ b/libfreerdp/primitives/prim_alphaComp.h
@@ -0,0 +1,30 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_ALPHA_COMP_H
+#define FREERDP_LIB_PRIM_ALPHA_COMP_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_alphaComp_sse3(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_andor.c b/libfreerdp/primitives/prim_andor.c
index 92165464b..d4f8d821d 100644
--- a/libfreerdp/primitives/prim_andor.c
+++ b/libfreerdp/primitives/prim_andor.c
@@ -19,6 +19,7 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_andor.h"
 
 /* ----------------------------------------------------------------------------
  * 32-bit AND with a constant.
@@ -55,3 +56,8 @@ void primitives_init_andor(primitives_t* prims)
 	prims->andC_32u = general_andC_32u;
 	prims->orC_32u = general_orC_32u;
 }
+
+void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_andor_sse3(prims);
+}
diff --git a/libfreerdp/primitives/prim_andor.h b/libfreerdp/primitives/prim_andor.h
new file mode 100644
index 000000000..6ec37e4af
--- /dev/null
+++ b/libfreerdp/primitives/prim_andor.h
@@ -0,0 +1,30 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_ANDOR_H
+#define FREERDP_LIB_PRIM_ANDOR_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_andor_sse3(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c
index 4a23129a6..4448a3319 100644
--- a/libfreerdp/primitives/prim_colors.c
+++ b/libfreerdp/primitives/prim_colors.c
@@ -24,6 +24,7 @@
 #include <freerdp/codec/color.h>
 
 #include "prim_internal.h"
+#include "prim_colors.h"
 
 #ifndef MINMAX
 #define MINMAX(_v_, _l_, _h_) ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
@@ -507,3 +508,10 @@ void primitives_init_colors(primitives_t* prims)
 	prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
 	prims->RGBToRGB_16s8u_P3AC4R = general_RGBToRGB_16s8u_P3AC4R;
 }
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_colors_opt(primitives_t* prims)
+{
+	primitives_init_colors_sse2(prims);
+	primitives_init_colors_neon(prims);
+}
diff --git a/libfreerdp/primitives/prim_colors.h b/libfreerdp/primitives/prim_colors.h
new file mode 100644
index 000000000..65bbd43cf
--- /dev/null
+++ b/libfreerdp/primitives/prim_colors.h
@@ -0,0 +1,31 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives colors
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_COLORS_H
+#define FREERDP_LIB_PRIM_COLORS_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_colors_sse2(primitives_t* prims);
+void primitives_init_colors_neon(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_copy.c b/libfreerdp/primitives/prim_copy.c
index 9021399eb..195d289bd 100644
--- a/libfreerdp/primitives/prim_copy.c
+++ b/libfreerdp/primitives/prim_copy.c
@@ -392,22 +392,8 @@ void primitives_init_copy(primitives_t* prims)
 	prims->copy_no_overlap = generic_image_copy_no_overlap;
 }
 
-#if defined(WITH_SSE2) || defined(WITH_NEON)
 void primitives_init_copy_opt(primitives_t* prims)
 {
-	generic = primitives_get_generic();
-	primitives_init_copy(prims);
-	/* Pick tuned versions if possible. */
-	/* Performance with an SSE2 version with no prefetch seemed to be
-	 * all over the map vs. memcpy.
-	 * Sometimes it was significantly faster, sometimes dreadfully slower,
-	 * and it seemed to vary a lot depending on block size and processor.
-	 * Hence, no SSE version is used here unless once can be written that
-	 * is consistently faster than memcpy.
-	 */
-	/* This is just an alias with void* parameters */
-	prims->copy = (__copy_t)(prims->copy_8u);
-	primitives_init_copy_sse(prims);
+	primitives_init_copy_sse41(prims);
 	primitives_init_copy_avx2(prims);
 }
-#endif
diff --git a/libfreerdp/primitives/prim_copy.h b/libfreerdp/primitives/prim_copy.h
index 18b927d0d..ace7f925f 100644
--- a/libfreerdp/primitives/prim_copy.h
+++ b/libfreerdp/primitives/prim_copy.h
@@ -22,6 +22,7 @@
 #define FREERDP_LIB_PRIM_COPY_H
 
 #include <winpr/wtypes.h>
+#include <freerdp/config.h>
 #include <freerdp/primitives.h>
 
 pstatus_t generic_image_copy_no_overlap_convert(
@@ -37,6 +38,7 @@ pstatus_t generic_image_copy_no_overlap_memcpy(
     SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset,
     UINT32 flags);
 
-extern void primitives_init_copy_sse(primitives_t* prims);
-extern void primitives_init_copy_avx2(primitives_t* prims);
+void primitives_init_copy_sse41(primitives_t* prims);
+void primitives_init_copy_avx2(primitives_t* prims);
+
 #endif
diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h
index cf5c12467..b3a2a5cf0 100644
--- a/libfreerdp/primitives/prim_internal.h
+++ b/libfreerdp/primitives/prim_internal.h
@@ -275,7 +275,6 @@ FREERDP_LOCAL void primitives_init_colors(primitives_t* prims);
 FREERDP_LOCAL void primitives_init_YCoCg(primitives_t* prims);
 FREERDP_LOCAL void primitives_init_YUV(primitives_t* prims);
 
-#if defined(WITH_SSE2) || defined(WITH_NEON)
 FREERDP_LOCAL void primitives_init_copy_opt(primitives_t* prims);
 FREERDP_LOCAL void primitives_init_set_opt(primitives_t* prims);
 FREERDP_LOCAL void primitives_init_add_opt(primitives_t* prims);
@@ -286,7 +285,6 @@ FREERDP_LOCAL void primitives_init_alphaComp_opt(primitives_t* prims);
 FREERDP_LOCAL void primitives_init_colors_opt(primitives_t* prims);
 FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* prims);
 FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* prims);
-#endif
 
 #if defined(WITH_OPENCL)
 FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* prims);
diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c
index c4012e645..3735e4836 100644
--- a/libfreerdp/primitives/prim_set.c
+++ b/libfreerdp/primitives/prim_set.c
@@ -22,6 +22,7 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_set.h"
 
 /* ========================================================================= */
 static pstatus_t general_set_8u(BYTE val, BYTE* pDst, UINT32 len)
@@ -120,3 +121,8 @@ void primitives_init_set(primitives_t* prims)
 	prims->set_32u = general_set_32u;
 	prims->zero = general_zero;
 }
+
+void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_set_sse2(prims);
+}
diff --git a/libfreerdp/primitives/prim_set.h b/libfreerdp/primitives/prim_set.h
new file mode 100644
index 000000000..81f81243d
--- /dev/null
+++ b/libfreerdp/primitives/prim_set.h
@@ -0,0 +1,30 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_SET_H
+#define FREERDP_LIB_PRIM_SET_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_set_sse2(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c
index 3677fd113..9af34fde0 100644
--- a/libfreerdp/primitives/prim_shift.c
+++ b/libfreerdp/primitives/prim_shift.c
@@ -19,6 +19,8 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_shift.h"
+
 /* ------------------------------------------------------------------------- */
 static INLINE INT16 shift(INT16 val, UINT32 sh)
 {
@@ -133,3 +135,8 @@ void primitives_init_shift(primitives_t* prims)
 	prims->shiftC_16s = general_shiftC_16s;
 	prims->shiftC_16u = general_shiftC_16u;
 }
+
+void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_shift_sse3(prims);
+}
diff --git a/libfreerdp/primitives/prim_shift.h b/libfreerdp/primitives/prim_shift.h
new file mode 100644
index 000000000..d7cc32324
--- /dev/null
+++ b/libfreerdp/primitives/prim_shift.h
@@ -0,0 +1,30 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_SHIFT_H
+#define FREERDP_LIB_PRIM_SHIFT_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+extern void primitives_init_shift_sse3(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c
index d89dc474f..efa18850d 100644
--- a/libfreerdp/primitives/prim_sign.c
+++ b/libfreerdp/primitives/prim_sign.c
@@ -19,6 +19,7 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_sign.h"
 
 /* ----------------------------------------------------------------------------
  * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
@@ -40,3 +41,8 @@ void primitives_init_sign(primitives_t* prims)
 	/* Start with the default. */
 	prims->sign_16s = general_sign_16s;
 }
+
+void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
+{
+	primitives_init_sign_ssse3(prims);
+}
diff --git a/libfreerdp/primitives/prim_sign.h b/libfreerdp/primitives/prim_sign.h
new file mode 100644
index 000000000..61ac2dbb3
--- /dev/null
+++ b/libfreerdp/primitives/prim_sign.h
@@ -0,0 +1,30 @@
+/**
+ * FreeRDP: A Remote Desktop Protocol Implementation
+ * Primitives copy
+ *
+ * Copyright 2024 Armin Novak <anovak@thincast.com>
+ * Copyright 2024 Thincast Technologies GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef FREERDP_LIB_PRIM_SIGN_H
+#define FREERDP_LIB_PRIM_SIGN_H
+
+#include <winpr/wtypes.h>
+#include <freerdp/config.h>
+#include <freerdp/primitives.h>
+
+void primitives_init_sign_ssse3(primitives_t* prims);
+
+#endif
diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
similarity index 74%
rename from libfreerdp/primitives/prim_YCoCg_opt.c
rename to libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
index bba13fac3..8408d50ef 100644
--- a/libfreerdp/primitives/prim_YCoCg_opt.c
+++ b/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c
@@ -23,19 +23,19 @@
 #include <freerdp/primitives.h>
 #include <winpr/sysinfo.h>
 
+#include "prim_YCoCg.h"
+
 #ifdef WITH_SSE2
 #include <emmintrin.h>
 #include <tmmintrin.h>
-#elif defined(WITH_NEON)
-#include <arm_neon.h>
-#endif /* WITH_SSE2 else WITH_NEON */
+#endif
 
 #include "prim_internal.h"
 #include "prim_templates.h"
 
+#ifdef WITH_SSE2
 static primitives_t* generic = NULL;
 
-#ifdef WITH_SSE2
 /* ------------------------------------------------------------------------- */
 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSrc, UINT32 srcStep,
                                                   BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
@@ -411,9 +411,7 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
 
 	return PRIMITIVES_SUCCESS;
 }
-#endif /* WITH_SSE2 */
 
-#ifdef WITH_SSE2
 /* ------------------------------------------------------------------------- */
 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
                                            BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat,
@@ -437,153 +435,22 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT3
 			                                   height, shift, withAlpha);
 	}
 }
-#elif defined(WITH_NEON)
 
-static pstatus_t neon_YCoCgToRGB_8u_X(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
-                                      BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
-                                      UINT32 width, UINT32 height, UINT8 shift, BYTE bPos,
-                                      BYTE gPos, BYTE rPos, BYTE aPos, BOOL alpha)
-{
-	BYTE* dptr = pDst;
-	const BYTE* sptr = pSrc;
-	const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
-	const int8_t cll = shift - 1; /* -1 builds in the /2's */
-	const UINT32 srcPad = srcStep - (width * 4);
-	const UINT32 dstPad = dstStep - (width * formatSize);
-	const UINT32 pad = width % 8;
-	const uint8x8_t aVal = vdup_n_u8(0xFF);
-	const int8x8_t cllv = vdup_n_s8(cll);
-
-	for (UINT32 y = 0; y < height; y++)
-	{
-		for (UINT32 x = 0; x < width - pad; x += 8)
-		{
-			/* Note: shifts must be done before sign-conversion. */
-			const uint8x8x4_t raw = vld4_u8(sptr);
-			const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
-			const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
-			const int16x8_t Cg = vmovl_s8(CgRaw);
-			const int16x8_t Co = vmovl_s8(CoRaw);
-			const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */
-			const int16x8_t T = vsubq_s16(Y, Cg);
-			const int16x8_t R = vaddq_s16(T, Co);
-			const int16x8_t G = vaddq_s16(Y, Cg);
-			const int16x8_t B = vsubq_s16(T, Co);
-			uint8x8x4_t bgrx;
-			bgrx.val[bPos] = vqmovun_s16(B);
-			bgrx.val[gPos] = vqmovun_s16(G);
-			bgrx.val[rPos] = vqmovun_s16(R);
-
-			if (alpha)
-				bgrx.val[aPos] = raw.val[3];
-			else
-				bgrx.val[aPos] = aVal;
-
-			vst4_u8(dptr, bgrx);
-			sptr += sizeof(raw);
-			dptr += sizeof(bgrx);
-		}
-
-		for (UINT32 x = 0; x < pad; x++)
-		{
-			/* Note: shifts must be done before sign-conversion. */
-			const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
-			const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
-			const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */
-			const INT16 T = Y - Cg;
-			const INT16 R = T + Co;
-			const INT16 G = Y + Cg;
-			const INT16 B = T - Co;
-			BYTE bgra[4];
-			bgra[bPos] = CLIP(B);
-			bgra[gPos] = CLIP(G);
-			bgra[rPos] = CLIP(R);
-			bgra[aPos] = *sptr++;
-
-			if (!alpha)
-				bgra[aPos] = 0xFF;
-
-			*dptr++ = bgra[0];
-			*dptr++ = bgra[1];
-			*dptr++ = bgra[2];
-			*dptr++ = bgra[3];
-		}
-
-		sptr += srcPad;
-		dptr += dstPad;
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-
-static pstatus_t neon_YCoCgToRGB_8u_AC4R(const BYTE* WINPR_RESTRICT pSrc, INT32 srcStep,
-                                         BYTE* WINPR_RESTRICT pDst, UINT32 DstFormat, INT32 dstStep,
-                                         UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha)
-{
-	switch (DstFormat)
-	{
-		case PIXEL_FORMAT_BGRA32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 2, 1, 0, 3, withAlpha);
-
-		case PIXEL_FORMAT_BGRX32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 2, 1, 0, 3, withAlpha);
-
-		case PIXEL_FORMAT_RGBA32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 0, 1, 2, 3, withAlpha);
-
-		case PIXEL_FORMAT_RGBX32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 0, 1, 2, 3, withAlpha);
-
-		case PIXEL_FORMAT_ARGB32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 1, 2, 3, 0, withAlpha);
-
-		case PIXEL_FORMAT_XRGB32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 1, 2, 3, 0, withAlpha);
-
-		case PIXEL_FORMAT_ABGR32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 3, 2, 1, 0, withAlpha);
-
-		case PIXEL_FORMAT_XBGR32:
-			return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
-			                            shift, 3, 2, 1, 0, withAlpha);
-
-		default:
-			return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width,
-			                                   height, shift, withAlpha);
-	}
-}
 #endif /* WITH_SSE2 */
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_YCoCg(prims);
-	/* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
-	 * include any routines to work with it, especially with variable shift
-	 * width.
-	 */
-#if defined(WITH_SSE2)
 
 	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
 	{
 		prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
 	}
-
-#elif defined(WITH_NEON)
-
-	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
-	{
-		prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
-	}
-
-#endif /* WITH_SSE2 */
+#else
+	WINPR_UNUSED(prims);
+#endif
 }
diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/sse/prim_YUV_ssse3.c
similarity index 99%
rename from libfreerdp/primitives/prim_YUV_ssse3.c
rename to libfreerdp/primitives/sse/prim_YUV_ssse3.c
index 2fbef3e94..c204b74e5 100644
--- a/libfreerdp/primitives/prim_YUV_ssse3.c
+++ b/libfreerdp/primitives/sse/prim_YUV_ssse3.c
@@ -29,14 +29,12 @@
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
+#include "prim_YUV.h"
 
+#if defined(WITH_SSE2)
 #include <emmintrin.h>
 #include <tmmintrin.h>
 
-#if !defined(WITH_SSE2)
-#error "This file needs WITH_SSE2 enabled!"
-#endif
-
 static primitives_t* generic = NULL;
 
 /****************************************************************************/
@@ -1496,9 +1494,11 @@ static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type,
 			return -1;
 	}
 }
+#endif
 
-void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_YUV_ssse3(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_YUV(prims);
 
@@ -1512,4 +1512,7 @@ void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
 		prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R;
 		prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444;
 	}
+#else
+	WINPR_UNUSED(prims);
+#endif
 }
diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/sse/prim_add_sse3.c
similarity index 97%
rename from libfreerdp/primitives/prim_add_opt.c
rename to libfreerdp/primitives/sse/prim_add_sse3.c
index 7274683a4..0a97440a2 100644
--- a/libfreerdp/primitives/prim_add_opt.c
+++ b/libfreerdp/primitives/sse/prim_add_sse3.c
@@ -20,6 +20,8 @@
 #include <freerdp/primitives.h>
 #include <winpr/sysinfo.h>
 
+#include "prim_add.h"
+
 #ifdef WITH_SSE2
 #include <emmintrin.h>
 #include <pmmintrin.h>
@@ -28,9 +30,9 @@
 #include "prim_internal.h"
 #include "prim_templates.h"
 
+#ifdef WITH_SSE2
 static primitives_t* generic = NULL;
 
-#ifdef WITH_SSE2
 /* ------------------------------------------------------------------------- */
 SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                  generic->add_16s(sptr1++, sptr2++, dptr++, 1))
@@ -174,12 +176,12 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
 #endif
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_add(prims);
 
-#if defined(WITH_SSE2)
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
 	{
@@ -187,5 +189,7 @@ void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
 		prims->add_16s_inplace = sse3_add_16s_inplace;
 	}
 
+#else
+	WINPR_UNUSED(prims);
 #endif
 }
diff --git a/libfreerdp/primitives/prim_alphaComp_opt.c b/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
similarity index 97%
rename from libfreerdp/primitives/prim_alphaComp_opt.c
rename to libfreerdp/primitives/sse/prim_alphaComp_sse3.c
index 5e3ec6fc2..392f9d31b 100644
--- a/libfreerdp/primitives/prim_alphaComp_opt.c
+++ b/libfreerdp/primitives/sse/prim_alphaComp_sse3.c
@@ -26,6 +26,8 @@
 #include <freerdp/primitives.h>
 #include <winpr/sysinfo.h>
 
+#include "prim_alphaComp.h"
+
 #ifdef WITH_SSE2
 #include <emmintrin.h>
 #include <pmmintrin.h>
@@ -33,10 +35,9 @@
 
 #include "prim_internal.h"
 
-static primitives_t* generic = NULL;
-
 /* ------------------------------------------------------------------------- */
 #ifdef WITH_SSE2
+static primitives_t* generic = NULL;
 
 static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 src1Step,
                                      const BYTE* WINPR_RESTRICT pSrc2, UINT32 src2Step,
@@ -208,11 +209,11 @@ static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 sr
 #endif
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_alphaComp(prims);
-#if defined(WITH_SSE2)
 
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
@@ -220,5 +221,7 @@ void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
 		prims->alphaComp_argb = sse2_alphaComp_argb;
 	}
 
+#else
+	WINPR_UNUSED(prims);
 #endif
 }
diff --git a/libfreerdp/primitives/prim_andor_opt.c b/libfreerdp/primitives/sse/prim_andor_sse3.c
similarity index 93%
rename from libfreerdp/primitives/prim_andor_opt.c
rename to libfreerdp/primitives/sse/prim_andor_sse3.c
index 4d8b5916b..57809b2ad 100644
--- a/libfreerdp/primitives/prim_andor_opt.c
+++ b/libfreerdp/primitives/sse/prim_andor_sse3.c
@@ -19,6 +19,8 @@
 #include <freerdp/primitives.h>
 #include <winpr/sysinfo.h>
 
+#include "prim_andor.h"
+
 #ifdef WITH_SSE2
 #include <emmintrin.h>
 #include <pmmintrin.h>
@@ -27,9 +29,9 @@
 #include "prim_internal.h"
 #include "prim_templates.h"
 
+#ifdef WITH_SSE2
 static primitives_t* generic = NULL;
 
-#ifdef WITH_SSE2
 /* ------------------------------------------------------------------------- */
 SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic->andC_32u, _mm_and_si128,
                      *dptr++ = *sptr++ & val)
@@ -37,11 +39,11 @@ SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, generic->orC_32u, _mm_or_si128, *dptr
 #endif
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_andor(prims);
-#if defined(WITH_SSE2)
 
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
@@ -50,5 +52,7 @@ void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
 		prims->orC_32u = sse3_orC_32u;
 	}
 
+#else
+	WINPR_UNUSED(prims);
 #endif
 }
diff --git a/libfreerdp/primitives/prim_colors_opt.c b/libfreerdp/primitives/sse/prim_colors_sse2.c
similarity index 77%
rename from libfreerdp/primitives/prim_colors_opt.c
rename to libfreerdp/primitives/sse/prim_colors_sse2.c
index 60debc3c6..40eea9ddc 100644
--- a/libfreerdp/primitives/prim_colors_opt.c
+++ b/libfreerdp/primitives/sse/prim_colors_sse2.c
@@ -23,18 +23,17 @@
 #include <freerdp/primitives.h>
 #include <winpr/sysinfo.h>
 
+#include "prim_colors.h"
+
 #ifdef WITH_SSE2
 #include <emmintrin.h>
-#elif defined(WITH_NEON)
-#include <arm_neon.h>
-#endif /* WITH_SSE2 else WITH_NEON */
+#endif /* WITH_SSE2 */
 
 #include "prim_internal.h"
 #include "prim_templates.h"
 
-static primitives_t* generic = NULL;
-
 #ifdef WITH_SSE2
+static primitives_t* generic = NULL;
 
 #ifdef __GNUC__
 #define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1246,329 +1245,11 @@ sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit
 }
 #endif /* WITH_SSE2 */
 
-/*---------------------------------------------------------------------------*/
-#ifdef WITH_NEON
-static pstatus_t
-neon_yCbCrToRGB_16s16s_P3P3(const INT16* const WINPR_RESTRICT pSrc[3], INT32 srcStep,
-                            INT16* WINPR_RESTRICT pDst[3], INT32 dstStep,
-                            const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
-{
-	/* TODO: If necessary, check alignments and call the general version. */
-	int16x8_t zero = vdupq_n_s16(0);
-	int16x8_t max = vdupq_n_s16(255);
-	int16x8_t r_cr = vdupq_n_s16(22986);  //  1.403 << 14
-	int16x8_t g_cb = vdupq_n_s16(-5636);  // -0.344 << 14
-	int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
-	int16x8_t b_cb = vdupq_n_s16(28999);  //  1.770 << 14
-	int16x8_t c4096 = vdupq_n_s16(4096);
-	int16x8_t* y_buf = (int16x8_t*)pSrc[0];
-	int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
-	int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
-	int16x8_t* r_buf = (int16x8_t*)pDst[0];
-	int16x8_t* g_buf = (int16x8_t*)pDst[1];
-	int16x8_t* b_buf = (int16x8_t*)pDst[2];
-	int srcbump = srcStep / sizeof(int16x8_t);
-	int dstbump = dstStep / sizeof(int16x8_t);
-	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
-
-	for (int yp = 0; yp < roi->height; ++yp)
-	{
-		for (int i = 0; i < imax; i++)
-		{
-			/*
-			    In order to use NEON signed 16-bit integer multiplication we need to convert
-			    the floating point factors to signed int without loosing information.
-			    The result of this multiplication is 32 bit and we have a NEON instruction
-			    that returns the hi word of the saturated double.
-			    Thus we will multiply the factors by the highest possible 2^n, take the
-			    upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
-			    shift by 1 to reverse the doubling) and correct	this result by multiplying it
-			    by 2^(16-n).
-			    For the given factors in the conversion matrix the best possible n is 14.
-
-			    Example for calculating r:
-			    r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
-			    r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
-			    r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
-			    r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
-			*/
-			/* y = (y_buf[i] + 4096) >> 2 */
-			int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
-			y = vaddq_s16(y, c4096);
-			y = vshrq_n_s16(y, 2);
-			/* cb = cb_buf[i]; */
-			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
-			/* cr = cr_buf[i]; */
-			int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
-			/* (y + HIWORD(cr*22986)) >> 3 */
-			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
-			r = vshrq_n_s16(r, 3);
-			/* r_buf[i] = CLIP(r); */
-			r = vminq_s16(vmaxq_s16(r, zero), max);
-			vst1q_s16((INT16*)&r_buf[i], r);
-			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
-			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
-			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
-			g = vshrq_n_s16(g, 3);
-			/* g_buf[i] = CLIP(g); */
-			g = vminq_s16(vmaxq_s16(g, zero), max);
-			vst1q_s16((INT16*)&g_buf[i], g);
-			/* (y + HIWORD(cb*28999)) >> 3 */
-			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
-			b = vshrq_n_s16(b, 3);
-			/* b_buf[i] = CLIP(b); */
-			b = vminq_s16(vmaxq_s16(b, zero), max);
-			vst1q_s16((INT16*)&b_buf[i], b);
-		}
-
-		y_buf += srcbump;
-		cb_buf += srcbump;
-		cr_buf += srcbump;
-		r_buf += dstbump;
-		g_buf += dstbump;
-		b_buf += dstbump;
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-
-static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const WINPR_RESTRICT pSrc[3],
-                                                UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
-                                                UINT32 dstStep,
-                                                const prim_size_t* WINPR_RESTRICT roi, uint8_t rPos,
-                                                uint8_t gPos, uint8_t bPos, uint8_t aPos)
-{
-	BYTE* pRGB = pDst;
-	const INT16* pY = pSrc[0];
-	const INT16* pCb = pSrc[1];
-	const INT16* pCr = pSrc[2];
-	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
-	const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
-	const size_t pad = roi->width % 8;
-	const int16x4_t c4096 = vdup_n_s16(4096);
-
-	for (UINT32 y = 0; y < roi->height; y++)
-	{
-		for (UINT32 x = 0; x < roi->width - pad; x += 8)
-		{
-			const int16x8_t Y = vld1q_s16(pY);
-			const int16x4_t Yh = vget_high_s16(Y);
-			const int16x4_t Yl = vget_low_s16(Y);
-			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
-			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
-			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
-			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
-			const int16x8_t Cr = vld1q_s16(pCr);
-			const int16x4_t Crh = vget_high_s16(Cr);
-			const int16x4_t Crl = vget_low_s16(Cr);
-			const int16x8_t Cb = vld1q_s16(pCb);
-			const int16x4_t Cbh = vget_high_s16(Cb);
-			const int16x4_t Cbl = vget_low_s16(Cb);
-			uint8x8x4_t bgrx;
-			{
-				/* R */
-				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
-				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
-				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
-				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
-				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
-				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
-				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
-				bgrx.val[rPos] = vqmovun_s16(Rs);
-			}
-			{
-				/* G */
-				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);            /* 0.343730 * 2^16 */
-				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);            /* 0.343730 * 2^16 */
-				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
-				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
-				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
-				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
-				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
-				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
-				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
-				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
-				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
-				const uint8x8_t G = vqmovun_s16(Gs);
-				bgrx.val[gPos] = G;
-			}
-			{
-				/* B */
-				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
-				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
-				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
-				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
-				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
-				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
-				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
-				const uint8x8_t B = vqmovun_s16(Bs);
-				bgrx.val[bPos] = B;
-			}
-			/* A */
-			{
-				bgrx.val[aPos] = vdup_n_u8(0xFF);
-			}
-			vst4_u8(pRGB, bgrx);
-			pY += 8;
-			pCb += 8;
-			pCr += 8;
-			pRGB += 32;
-		}
-
-		for (UINT32 x = 0; x < pad; x++)
-		{
-			const INT32 divisor = 16;
-			const INT32 Y = ((*pY++) + 4096) << divisor;
-			const INT32 Cb = (*pCb++);
-			const INT32 Cr = (*pCr++);
-			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
-			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
-			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
-			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
-			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
-			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
-			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
-			BYTE bgrx[4];
-			bgrx[bPos] = CLIP(B);
-			bgrx[gPos] = CLIP(G);
-			bgrx[rPos] = CLIP(R);
-			bgrx[aPos] = 0xFF;
-			*pRGB++ = bgrx[0];
-			*pRGB++ = bgrx[1];
-			*pRGB++ = bgrx[2];
-			*pRGB++ = bgrx[3];
-		}
-
-		pY += srcPad;
-		pCb += srcPad;
-		pCr += srcPad;
-		pRGB += dstPad;
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-
-static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3],
-                                              UINT32 srcStep, BYTE* WINPR_RESTRICT pDst,
-                                              UINT32 dstStep, UINT32 DstFormat,
-                                              const prim_size_t* WINPR_RESTRICT roi)
-{
-	switch (DstFormat)
-	{
-		case PIXEL_FORMAT_BGRA32:
-		case PIXEL_FORMAT_BGRX32:
-			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
-
-		case PIXEL_FORMAT_RGBA32:
-		case PIXEL_FORMAT_RGBX32:
-			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
-
-		case PIXEL_FORMAT_ARGB32:
-		case PIXEL_FORMAT_XRGB32:
-			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
-
-		case PIXEL_FORMAT_ABGR32:
-		case PIXEL_FORMAT_XBGR32:
-			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
-
-		default:
-			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
-	}
-}
-
-static pstatus_t neon_RGBToRGB_16s8u_P3AC4R_X(
-    const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
-    UINT32 srcStep,                            /* bytes between rows in source data */
-    BYTE* WINPR_RESTRICT pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
-    UINT32 dstStep,                            /* bytes between rows in dest data */
-    const prim_size_t* WINPR_RESTRICT roi,     /* region of interest */
-    uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
-{
-	UINT32 pad = roi->width % 8;
-
-	for (UINT32 y = 0; y < roi->height; y++)
-	{
-		const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
-		const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
-		const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
-		BYTE* dst = pDst + y * dstStep;
-
-		for (UINT32 x = 0; x < roi->width - pad; x += 8)
-		{
-			int16x8_t r = vld1q_s16(pr);
-			int16x8_t g = vld1q_s16(pg);
-			int16x8_t b = vld1q_s16(pb);
-			uint8x8x4_t bgrx;
-			bgrx.val[aPos] = vdup_n_u8(0xFF);
-			bgrx.val[rPos] = vqmovun_s16(r);
-			bgrx.val[gPos] = vqmovun_s16(g);
-			bgrx.val[bPos] = vqmovun_s16(b);
-			vst4_u8(dst, bgrx);
-			pr += 8;
-			pg += 8;
-			pb += 8;
-			dst += 32;
-		}
-
-		for (UINT32 x = 0; x < pad; x++)
-		{
-			BYTE bgrx[4];
-			bgrx[bPos] = *pb++;
-			bgrx[gPos] = *pg++;
-			bgrx[rPos] = *pr++;
-			bgrx[aPos] = 0xFF;
-			*dst++ = bgrx[0];
-			*dst++ = bgrx[1];
-			*dst++ = bgrx[2];
-			*dst++ = bgrx[3];
-		}
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-
-static pstatus_t
-neon_RGBToRGB_16s8u_P3AC4R(const INT16* const WINPR_RESTRICT pSrc[3], /* 16-bit R,G, and B arrays */
-                           UINT32 srcStep,            /* bytes between rows in source data */
-                           BYTE* WINPR_RESTRICT pDst, /* 32-bit interleaved ARGB (ABGR?) data */
-                           UINT32 dstStep,            /* bytes between rows in dest data */
-                           UINT32 DstFormat,
-                           const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
-{
-	switch (DstFormat)
-	{
-		case PIXEL_FORMAT_BGRA32:
-		case PIXEL_FORMAT_BGRX32:
-			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
-
-		case PIXEL_FORMAT_RGBA32:
-		case PIXEL_FORMAT_RGBX32:
-			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
-
-		case PIXEL_FORMAT_ARGB32:
-		case PIXEL_FORMAT_XRGB32:
-			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
-
-		case PIXEL_FORMAT_ABGR32:
-		case PIXEL_FORMAT_XBGR32:
-			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
-
-		default:
-			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
-	}
-}
-#endif /* WITH_NEON */
-/* I don't see a direct IPP version of this, since the input is INT16
- * YCbCr.  It may be possible via  Deinterleave and then YCbCrToRGB_<mod>.
- * But that would likely be slower.
- */
-
-/* ------------------------------------------------------------------------- */
-void primitives_init_colors_opt(primitives_t* prims)
+void primitives_init_colors_sse2(primitives_t* prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_colors(prims);
-#if defined(WITH_SSE2)
 
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
 	{
@@ -1578,14 +1259,7 @@ void primitives_init_colors_opt(primitives_t* prims)
 		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
 	}
 
-#elif defined(WITH_NEON)
-
-	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
-	{
-		prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
-		prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
-		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
-	}
-
-#endif /* WITH_SSE2 */
+#else
+	WINPR_UNUSED(prims);
+#endif
 }
diff --git a/libfreerdp/primitives/prim_copy_avx2.c b/libfreerdp/primitives/sse/prim_copy_avx2.c
similarity index 100%
rename from libfreerdp/primitives/prim_copy_avx2.c
rename to libfreerdp/primitives/sse/prim_copy_avx2.c
diff --git a/libfreerdp/primitives/prim_copy_sse.c b/libfreerdp/primitives/sse/prim_copy_sse4_1.c
similarity index 99%
rename from libfreerdp/primitives/prim_copy_sse.c
rename to libfreerdp/primitives/sse/prim_copy_sse4_1.c
index c2d102e78..d073928a3 100644
--- a/libfreerdp/primitives/prim_copy_sse.c
+++ b/libfreerdp/primitives/sse/prim_copy_sse4_1.c
@@ -268,7 +268,7 @@ static pstatus_t sse_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD
 #endif
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_copy_sse(primitives_t* prims)
+void primitives_init_copy_sse41(primitives_t* prims)
 {
 #if defined(WITH_SSE2)
 	if (IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
diff --git a/libfreerdp/primitives/prim_set_opt.c b/libfreerdp/primitives/sse/prim_set_sse2.c
similarity index 97%
rename from libfreerdp/primitives/prim_set_opt.c
rename to libfreerdp/primitives/sse/prim_set_sse2.c
index f77cb310d..b4c1949f0 100644
--- a/libfreerdp/primitives/prim_set_opt.c
+++ b/libfreerdp/primitives/sse/prim_set_sse2.c
@@ -26,11 +26,12 @@
 #endif /* WITH_SSE2 */
 
 #include "prim_internal.h"
-
-static primitives_t* generic = NULL;
+#include "prim_set.h"
 
 /* ========================================================================= */
 #ifdef WITH_SSE2
+static primitives_t* generic = NULL;
+
 static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
 {
 	BYTE byte = 0;
@@ -216,14 +217,13 @@ static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
 #endif /* WITH_SSE2 */
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_set(prims);
 	/* Pick tuned versions if possible. */
 
-#if defined(WITH_SSE2)
-
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
 	{
 		prims->set_8u = sse2_set_8u;
@@ -231,5 +231,7 @@ void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
 		prims->set_32u = sse2_set_32u;
 	}
 
+#else
+	WINPR_UNUSED(prims);
 #endif
 }
diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/sse/prim_shift_sse3.c
similarity index 97%
rename from libfreerdp/primitives/prim_shift_opt.c
rename to libfreerdp/primitives/sse/prim_shift_sse3.c
index a4dd3c6f0..ea50eb6a4 100644
--- a/libfreerdp/primitives/prim_shift_opt.c
+++ b/libfreerdp/primitives/sse/prim_shift_sse3.c
@@ -19,6 +19,8 @@
 #include <freerdp/primitives.h>
 #include <winpr/sysinfo.h>
 
+#include "prim_shift.h"
+
 #ifdef WITH_SSE2
 #include <emmintrin.h>
 #include <pmmintrin.h>
@@ -27,9 +29,9 @@
 #include "prim_internal.h"
 #include "prim_templates.h"
 
+#ifdef WITH_SSE2
 static primitives_t* generic = NULL;
 
-#ifdef WITH_SSE2
 /* ------------------------------------------------------------------------- */
 SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16,
                  *dptr++ = (INT16)((UINT16)*sptr++ << val))
@@ -142,11 +144,11 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
  */
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_shift(prims);
-#if defined(WITH_SSE2)
 
 	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
@@ -158,5 +160,7 @@ void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
 		prims->rShiftC_16u = sse2_rShiftC_16u;
 	}
 
+#else
+	WINPR_UNUSED(prims);
 #endif
 }
diff --git a/libfreerdp/primitives/prim_sign_opt.c b/libfreerdp/primitives/sse/prim_sign_ssse3.c
similarity index 96%
rename from libfreerdp/primitives/prim_sign_opt.c
rename to libfreerdp/primitives/sse/prim_sign_ssse3.c
index dae76a6c9..c430c827d 100644
--- a/libfreerdp/primitives/prim_sign_opt.c
+++ b/libfreerdp/primitives/sse/prim_sign_ssse3.c
@@ -19,16 +19,18 @@
 #include <freerdp/primitives.h>
 #include <winpr/sysinfo.h>
 
-#ifdef WITH_SSE2
+#include "prim_sign.h"
+
+#if defined(WITH_SSE2)
 #include <emmintrin.h>
 #include <tmmintrin.h>
-#endif /* WITH_SSE2 */
+#endif
 
 #include "prim_internal.h"
 
+#if defined(WITH_SSE2)
 static primitives_t* generic = NULL;
 
-#ifdef WITH_SSE2
 /* ------------------------------------------------------------------------- */
 static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                 UINT32 len)
@@ -167,13 +169,13 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
 #endif /* WITH_SSE2 */
 
 /* ------------------------------------------------------------------------- */
-void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
+void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
 {
+#if defined(WITH_SSE2)
 	generic = primitives_get_generic();
 	primitives_init_sign(prims);
 	/* Pick tuned versions if possible. */
 	/* I didn't spot an IPP version of this. */
-#if defined(WITH_SSE2)
 
 	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
 	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
@@ -181,5 +183,7 @@ void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
 		prims->sign_16s = ssse3_sign_16s;
 	}
 
+#else
+	WINPR_UNUSED(prims);
 #endif
 }