mcst-linux-kernel/patches-2024.06.26/x264-0.164/0001-x264-164-e2k.patch

From bcac03930d8f5193665086201493e6096ec0cf93 Mon Sep 17 00:00:00 2001
From: Ilya Kurdyukov <jpegqs@gmail.com>
Date: Tue, 30 May 2023 23:24:57 +0700
Subject: [PATCH] x264-164 e2k support
---
Makefile | 10 +
common/common.h | 12 +
common/cpu.c | 9 +
common/dct.c | 43 ++
common/deblock.c | 11 +
common/e2k/dct.c | 829 ++++++++++++++++++++++++
common/e2k/dct.h | 69 ++
common/e2k/deblock.c | 236 +++++++
common/e2k/deblock.h | 32 +
common/e2k/e2kcommon.h | 253 ++++++++
common/e2k/mc.c | 1401 ++++++++++++++++++++++++++++++++++++++++
common/e2k/mc.h | 30 +
common/e2k/pixel.c | 1169 +++++++++++++++++++++++++++++++++
common/e2k/pixel.h | 30 +
common/e2k/predict.c | 187 ++++++
common/e2k/predict.h | 32 +
common/e2k/quant.c | 436 +++++++++++++
common/e2k/quant.h | 73 +++
common/mc.c | 7 +
common/pixel.c | 9 +
common/predict.c | 8 +
common/quant.c | 33 +
configure | 11 +-
tools/checkasm.c | 17 +-
x264.h | 3 +
27 files changed, 4952 insertions(+), 3 deletions(-)
create mode 100644 common/e2k/dct.c
create mode 100644 common/e2k/dct.h
create mode 100644 common/e2k/deblock.c
create mode 100644 common/e2k/deblock.h
create mode 100644 common/e2k/e2kcommon.h
create mode 100644 common/e2k/mc.c
create mode 100644 common/e2k/mc.h
create mode 100644 common/e2k/pixel.c
create mode 100644 common/e2k/pixel.h
create mode 100644 common/e2k/predict.c
create mode 100644 common/e2k/predict.h
create mode 100644 common/e2k/quant.c
create mode 100644 common/e2k/quant.h
diff --git a/Makefile b/Makefile
index 0ba46aa..f95021f 100644
--- a/Makefile
+++ b/Makefile
@@ -137,6 +137,16 @@ SRCS_X += common/ppc/dct.c \
common/ppc/quant.c
endif
+# Elbrus optims
+ifeq ($(SYS_ARCH),E2K)
+SRCS_X += common/e2k/dct.c \
+ common/e2k/deblock.c \
+ common/e2k/mc.c \
+ common/e2k/pixel.c \
+ common/e2k/predict.c \
+ common/e2k/quant.c
+endif
+
# NEON optims
ifeq ($(SYS_ARCH),ARM)
SRCASM_X = common/arm/bitstream-a.S \
diff --git a/common/common.h b/common/common.h
index ebc87ce..fe22ac7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -808,6 +808,18 @@ static ALWAYS_INLINE int x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[
#include "x86/util.h"
#endif
+#if ARCH_E2K
+#include <xmmintrin.h>
+
+#undef M128_ZERO
+#define M128_ZERO ((__m128){0,0,0,0})
+#define x264_union128_t x264_union128_sse_t
+typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+#if HAVE_VECTOREXT
+typedef uint32_t v4si __attribute__((vector_size (16)));
+#endif
+#endif
+
#include "rectangle.h"
#endif
diff --git a/common/cpu.c b/common/cpu.c
index 41a4c1d..dcd8731 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -89,6 +89,8 @@ const x264_cpu_name_t x264_cpu_names[] =
{"UnalignedStack", X264_CPU_STACK_MOD4},
#elif ARCH_PPC
{"Altivec", X264_CPU_ALTIVEC},
+#elif ARCH_E2K
+ {"Elbrus", X264_CPU_E2K},
#elif ARCH_ARM
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
@@ -360,6 +362,13 @@ uint32_t x264_cpu_detect( void )
}
#endif
+#elif ARCH_E2K
+
+uint32_t x264_cpu_detect( void )
+{
+ return X264_CPU_E2K;
+}
+
#elif HAVE_ARMV6
void x264_cpu_neon_test( void );
diff --git a/common/dct.c b/common/dct.c
index 90997ca..6a3a8d4 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -32,6 +32,9 @@
#if HAVE_ALTIVEC
# include "ppc/dct.h"
#endif
+#if ARCH_E2K
+# include "e2k/dct.h"
+#endif
#if HAVE_ARMV6
# include "arm/dct.h"
#endif
@@ -682,6 +685,30 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
}
#endif
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_e2k;
+ dctf->sub8x8_dct = x264_sub8x8_dct_e2k;
+ dctf->sub16x16_dct = x264_sub16x16_dct_e2k;
+
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_e2k;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_e2k;
+
+ dctf->add4x4_idct = x264_add4x4_idct_e2k;
+ dctf->add8x8_idct = x264_add8x8_idct_e2k;
+ dctf->add16x16_idct = x264_add16x16_idct_e2k;
+
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_e2k;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_e2k;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_e2k;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_e2k;
+
+ dctf->add8x8_idct8 = x264_add8x8_idct8_e2k;
+ dctf->add16x16_idct8= x264_add16x16_idct8_e2k;
+ }
+#endif
+
#if HAVE_ARMV6 || HAVE_AARCH64
if( cpu&X264_CPU_NEON )
{
@@ -996,6 +1023,14 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec;
}
#endif
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ {
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_e2k;
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_e2k;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_e2k;
+ }
+#endif
#if HAVE_ARMV6 || HAVE_AARCH64
if( cpu&X264_CPU_NEON )
{
@@ -1081,6 +1116,14 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26
}
#endif // HAVE_ALTIVEC
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_e2k;
+ }
+#endif // HAVE_E2K
+
#if HAVE_MSA
if( cpu&X264_CPU_MSA )
{
diff --git a/common/deblock.c b/common/deblock.c
index 2607b98..f8649ab 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -671,6 +671,9 @@ void x264_macroblock_deblock( x264_t *h )
#if HAVE_ALTIVEC
#include "ppc/deblock.h"
#endif
+#if ARCH_E2K
+#include "e2k/deblock.h"
+#endif
#if HAVE_ARMV6
#include "arm/deblock.h"
#endif
@@ -783,6 +786,14 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
}
#endif // HAVE_ALTIVEC
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ {
+ pf->deblock_luma[1] = x264_deblock_v_luma_e2k;
+ pf->deblock_luma[0] = x264_deblock_h_luma_e2k;
+ }
+#endif
+
#if HAVE_ARMV6 || HAVE_AARCH64
if( cpu&X264_CPU_NEON )
{
diff --git a/common/e2k/dct.c b/common/e2k/dct.c
new file mode 100644
index 0000000..e9709aa
--- /dev/null
+++ b/common/e2k/dct.c
@@ -0,0 +1,829 @@
+/*****************************************************************************
+ * dct.c: e2k transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ * Copyright (C) 2003-2017 x264 project
+ *
+ * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
+ * Eric Petit <eric.petit@lapsus.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "e2kcommon.h"
+#include "dct.h"
+
+#if !HIGH_BIT_DEPTH
+#define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \
+ b1 = _mm_add_epi16(a0, a3); \
+ b3 = _mm_add_epi16(a1, a2); \
+ b0 = _mm_add_epi16(b1, b3); \
+ b2 = _mm_sub_epi16(b1, b3); \
+ a0 = _mm_sub_epi16(a0, a3); \
+ a1 = _mm_sub_epi16(a1, a2); \
+ b1 = _mm_add_epi16(a0, a0); \
+ b1 = _mm_add_epi16(b1, a1); \
+ b3 = _mm_sub_epi16(a0, a1); \
+ b3 = _mm_sub_epi16(b3, a1)
+
+void x264_sub4x4_dct_e2k(int16_t dct[16], uint8_t *pix1, uint8_t *pix2) {
+ PREP_DIFF_8BYTEALIGNED;
+ vec_s16_t dct0v, dct1v, dct2v, dct3v;
+ vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v;
+
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v);
+ VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v);
+ VEC_TRANSPOSE_4(tmp0v, tmp1v, tmp2v, tmp3v,
+ dct0v, dct1v, dct2v, dct3v);
+ VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v);
+
+ VEC_ST(dct, _mm_unpacklo_epi64(tmp0v, tmp1v));
+ VEC_ST(dct + 8, _mm_unpacklo_epi64(tmp2v, tmp3v));
+}
+
+void x264_sub8x8_dct_e2k(int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2) {
+ PREP_DIFF_8BYTEALIGNED;
+ vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v;
+ vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v;
+
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v);
+ VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v);
+ VEC_DCT(dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v);
+ VEC_TRANSPOSE_8(tmp0v, tmp1v, tmp2v, tmp3v,
+ tmp4v, tmp5v, tmp6v, tmp7v,
+ dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v);
+
+ VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v);
+ VEC_DCT(dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v);
+
+ VEC_ST(&dct[0][0], _mm_unpacklo_epi64(tmp0v, tmp1v));
+ VEC_ST(&dct[0][8], _mm_unpacklo_epi64(tmp2v, tmp3v));
+ VEC_ST(&dct[1][0], _mm_unpacklo_epi64(tmp4v, tmp5v));
+ VEC_ST(&dct[1][8], _mm_unpacklo_epi64(tmp6v, tmp7v));
+ VEC_ST(&dct[2][0], _mm_unpackhi_epi64(tmp0v, tmp1v));
+ VEC_ST(&dct[2][8], _mm_unpackhi_epi64(tmp2v, tmp3v));
+ VEC_ST(&dct[3][0], _mm_unpackhi_epi64(tmp4v, tmp5v));
+ VEC_ST(&dct[3][8], _mm_unpackhi_epi64(tmp6v, tmp7v));
+}
+
+void x264_sub16x16_dct_e2k(int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2) {
+ x264_sub8x8_dct_e2k(&dct[0], &pix1[0], &pix2[0]);
+ x264_sub8x8_dct_e2k(&dct[4], &pix1[8], &pix2[8]);
+ x264_sub8x8_dct_e2k(&dct[8], &pix1[8 * FENC_STRIDE], &pix2[8 * FDEC_STRIDE]);
+ x264_sub8x8_dct_e2k(&dct[12], &pix1[8 * FENC_STRIDE + 8], &pix2[8 * FDEC_STRIDE + 8]);
+}
+
+/***************************************************************************
+ * 8x8 transform:
+ ***************************************************************************/
+
+#define PIX_DIFF_ITER(j) \
+ pix1v = vec_u8_to_s16(VEC_LD8(p1 + j * FENC_STRIDE)); \
+ pix2v = vec_u8_to_s16(VEC_LD8(p2 + j * FDEC_STRIDE)); \
+ tmp##j = _mm_sub_epi16(pix1v, pix2v)
+
+static ALWAYS_INLINE vec_s16_t pix_diff(uint8_t *p1, uint8_t *p2) {
+ vec_s16_t pix1v, pix2v, tmp0, tmp1, tmp2, tmp3;
+ LOAD_ZERO;
+
+ PIX_DIFF_ITER(0);
+ PIX_DIFF_ITER(1);
+ PIX_DIFF_ITER(2);
+ PIX_DIFF_ITER(3);
+
+ pix1v = _mm_add_epi16(tmp0, tmp1);
+ pix2v = _mm_add_epi16(tmp2, tmp3);
+ return _mm_add_epi16(pix1v, pix2v);
+}
+
+void x264_sub8x8_dct_dc_e2k(int16_t dct[4], uint8_t *pix1, uint8_t *pix2) {
+ __m128i v0, v1; __m64 h0, h1;
+
+ v0 = pix_diff(pix1, pix2);
+ v1 = pix_diff(pix1 + 4 * FENC_STRIDE, pix2 + 4 * FDEC_STRIDE);
+
+ v0 = _mm_hadds_epi16(v0, v1);
+ v0 = _mm_hadds_epi16(v0, v0);
+
+ /* 2x2 DC transform */
+ h1 = _mm_setr_pi16(1, -1, 1, -1);
+ h0 = _mm_movepi64_pi64(v0);
+ h0 = _mm_hadds_pi16(h0, _mm_sign_pi16(h0, h1));
+ h0 = _mm_hadds_pi16(h0, _mm_sign_pi16(h0, h1));
+ h0 = _mm_shuffle_pi16(h0, 0xd8);
+ *(__m64*)dct = h0;
+}
+
+void x264_sub8x16_dct_dc_e2k(int16_t dct[8], uint8_t *pix1, uint8_t *pix2) {
+ __m128i v0, v1, v2, v3;
+
+ v0 = pix_diff(pix1, pix2);
+ v1 = pix_diff(pix1 + 4 * FENC_STRIDE, pix2 + 4 * FDEC_STRIDE);
+ v2 = pix_diff(pix1 + 8 * FENC_STRIDE, pix2 + 8 * FDEC_STRIDE);
+ v3 = pix_diff(pix1 + 12 * FENC_STRIDE, pix2 + 12 * FDEC_STRIDE);
+
+ v0 = _mm_hadds_epi16(v0, v1);
+ v2 = _mm_hadds_epi16(v2, v3);
+ v0 = _mm_hadds_epi16(v0, v2);
+
+ /* 2x4 DC transform */
+ v1 = _mm_set1_epi32(-1 << 16 | 1);
+ v0 = _mm_hadd_epi16(v0, _mm_sign_epi16(v0, v1));
+ v0 = _mm_hadd_epi16(v0, _mm_sign_epi16(v0, v1));
+ v0 = _mm_hadd_epi16(v0, _mm_sign_epi16(v0, v1));
+ VEC_ST(dct, _mm_shuffle_epi32(v0, 0x78));
+}
+
+/* DCT8_1D unrolled by 8 */
+#define DCT8_1D_E2K(dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v) { \
+ /* int s07 = SRC(0) + SRC(7); */ \
+ /* int s16 = SRC(1) + SRC(6); */ \
+ /* int s25 = SRC(2) + SRC(5); */ \
+ /* int s34 = SRC(3) + SRC(4); */ \
+ vec_s16_t s07v = _mm_add_epi16(dct0v, dct7v); \
+ vec_s16_t s16v = _mm_add_epi16(dct1v, dct6v); \
+ vec_s16_t s25v = _mm_add_epi16(dct2v, dct5v); \
+ vec_s16_t s34v = _mm_add_epi16(dct3v, dct4v); \
+\
+ /* int a0 = s07 + s34; */ \
+ /* int a1 = s16 + s25; */ \
+ /* int a2 = s07 - s34; */ \
+ /* int a3 = s16 - s25; */ \
+ vec_s16_t a0v = _mm_add_epi16(s07v, s34v); \
+ vec_s16_t a1v = _mm_add_epi16(s16v, s25v); \
+ vec_s16_t a2v = _mm_sub_epi16(s07v, s34v); \
+ vec_s16_t a3v = _mm_sub_epi16(s16v, s25v); \
+\
+ /* int d07 = SRC(0) - SRC(7); */ \
+ /* int d16 = SRC(1) - SRC(6); */ \
+ /* int d25 = SRC(2) - SRC(5); */ \
+ /* int d34 = SRC(3) - SRC(4); */ \
+ vec_s16_t d07v = _mm_sub_epi16(dct0v, dct7v); \
+ vec_s16_t d16v = _mm_sub_epi16(dct1v, dct6v); \
+ vec_s16_t d25v = _mm_sub_epi16(dct2v, dct5v); \
+ vec_s16_t d34v = _mm_sub_epi16(dct3v, dct4v); \
+\
+ /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \
+ /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \
+ /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \
+ /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \
+ vec_s16_t a4v = _mm_add_epi16(_mm_add_epi16(d16v, d25v), _mm_add_epi16(d07v, _mm_srai_epi16(d07v, 1))); \
+ vec_s16_t a5v = _mm_sub_epi16(_mm_sub_epi16(d07v, d34v), _mm_add_epi16(d25v, _mm_srai_epi16(d25v, 1))); \
+ vec_s16_t a6v = _mm_sub_epi16(_mm_add_epi16(d07v, d34v), _mm_add_epi16(d16v, _mm_srai_epi16(d16v, 1))); \
+ vec_s16_t a7v = _mm_add_epi16(_mm_sub_epi16(d16v, d25v), _mm_add_epi16(d34v, _mm_srai_epi16(d34v, 1))); \
+\
+ /* DST(0) = a0 + a1; */ \
+ /* DST(1) = a4 + (a7>>2); */ \
+ /* DST(2) = a2 + (a3>>1); */ \
+ /* DST(3) = a5 + (a6>>2); */ \
+ dct0v = _mm_add_epi16(a0v, a1v); \
+ dct1v = _mm_add_epi16(a4v, _mm_srai_epi16(a7v, 2)); \
+ dct2v = _mm_add_epi16(a2v, _mm_srai_epi16(a3v, 1)); \
+ dct3v = _mm_add_epi16(a5v, _mm_srai_epi16(a6v, 2)); \
+ /* DST(4) = a0 - a1; */ \
+ /* DST(5) = a6 - (a5>>2); */ \
+ /* DST(6) = (a2>>1) - a3; */ \
+ /* DST(7) = (a4>>2) - a7; */ \
+ dct4v = _mm_sub_epi16(a0v, a1v); \
+ dct5v = _mm_sub_epi16(a6v, _mm_srai_epi16(a5v, 2)); \
+ dct6v = _mm_sub_epi16(_mm_srai_epi16(a2v, 1), a3v); \
+ dct7v = _mm_sub_epi16(_mm_srai_epi16(a4v, 2), a7v); \
+}
+
+
+void x264_sub8x8_dct8_e2k(int16_t dct[64], uint8_t *pix1, uint8_t *pix2) {
+ PREP_DIFF_8BYTEALIGNED;
+
+ vec_s16_t dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v;
+
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v);
+
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v);
+ VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v);
+
+ DCT8_1D_E2K(dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v);
+
+ vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;
+
+ VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v,
+ dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v);
+
+ DCT8_1D_E2K(dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v);
+
+ VEC_ST(dct, dct_tr0v);
+ VEC_ST(dct + 8, dct_tr1v);
+ VEC_ST(dct + 16, dct_tr2v);
+ VEC_ST(dct + 24, dct_tr3v);
+ VEC_ST(dct + 32, dct_tr4v);
+ VEC_ST(dct + 40, dct_tr5v);
+ VEC_ST(dct + 48, dct_tr6v);
+ VEC_ST(dct + 56, dct_tr7v);
+}
+
+void x264_sub16x16_dct8_e2k(int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2) {
+ x264_sub8x8_dct8_e2k(dct[0], &pix1[0], &pix2[0]);
+ x264_sub8x8_dct8_e2k(dct[1], &pix1[8], &pix2[8]);
+ x264_sub8x8_dct8_e2k(dct[2], &pix1[8 * FENC_STRIDE], &pix2[8 * FDEC_STRIDE]);
+ x264_sub8x8_dct8_e2k(dct[3], &pix1[8 * FENC_STRIDE + 8], &pix2[8 * FDEC_STRIDE + 8]);
+}
+
+
+/****************************************************************************
+ * IDCT transform:
+ ****************************************************************************/
+
+#define E2K_DC_SUM8_CLIP(i, c0) { \
+ /* unaligned load */ \
+ t0 = VEC_LD8(&p_dst[i * FDEC_STRIDE]); \
+ t1 = VEC_LD8(&p_dst[(i + 1) * FDEC_STRIDE]); \
+ t0 = _mm_adds_epi16(c0, _mm_unpacklo_epi8(t0, zerov)); \
+ t1 = _mm_adds_epi16(c0, _mm_unpacklo_epi8(t1, zerov)); \
+ v##i = _mm_packus_epi16(t0, t1); \
+}
+
+void x264_add8x8_idct_dc_e2k(uint8_t *p_dst, int16_t dct[4]) {
+ LOAD_ZERO;
+ __m128i c0, c1, v0, v2, v4, v6, t0, t1;
+ __m64 h0 = *(__m64*)dct;
+ h0 = _mm_srai_pi16(_mm_add_pi16(h0, _mm_set1_pi16(32)), 6);
+ t0 = _mm_movpi64_epi64(h0);
+ t0 = _mm_unpacklo_epi16(t0, t0);
+ c0 = _mm_unpacklo_epi16(t0, t0);
+ c1 = _mm_unpackhi_epi16(t0, t0);
+
+ E2K_DC_SUM8_CLIP(0, c0);
+ E2K_DC_SUM8_CLIP(2, c0);
+ E2K_DC_SUM8_CLIP(4, c1);
+ E2K_DC_SUM8_CLIP(6, c1);
+
+ VEC_STL(&p_dst[0 * FDEC_STRIDE], v0);
+ VEC_STH(&p_dst[1 * FDEC_STRIDE], v0);
+ VEC_STL(&p_dst[2 * FDEC_STRIDE], v2);
+ VEC_STH(&p_dst[3 * FDEC_STRIDE], v2);
+ VEC_STL(&p_dst[4 * FDEC_STRIDE], v4);
+ VEC_STH(&p_dst[5 * FDEC_STRIDE], v4);
+ VEC_STL(&p_dst[6 * FDEC_STRIDE], v6);
+ VEC_STH(&p_dst[7 * FDEC_STRIDE], v6);
+}
+
+#define E2K_DC_SUM16_CLIP(i) { \
+ /* unaligned load */ \
+ t1 = VEC_LD(&p_dst[i * FDEC_STRIDE]); \
+ t0 = _mm_adds_epi16(c0, _mm_unpacklo_epi8(t1, zerov)); \
+ t1 = _mm_adds_epi16(c1, _mm_unpackhi_epi8(t1, zerov)); \
+ v##i = _mm_packus_epi16(t0, t1); \
+}
+
+void x264_add16x16_idct_dc_e2k(uint8_t *p_dst, int16_t dct[16]) {
+ int i;
+ LOAD_ZERO;
+
+ for (i = 0; i < 4; i++, dct += 4, p_dst += 4 * FDEC_STRIDE) {
+ __m128i c0, c1, v0, v1, v2, v3, t0, t1;
+ __m64 h0 = *(__m64*)dct;
+ h0 = _mm_srai_pi16(_mm_add_pi16(h0, _mm_set1_pi16(32)), 6);
+ t0 = _mm_movpi64_epi64(h0);
+ t0 = _mm_unpacklo_epi16(t0, t0);
+ c0 = _mm_unpacklo_epi16(t0, t0);
+ c1 = _mm_unpackhi_epi16(t0, t0);
+ E2K_DC_SUM16_CLIP(0);
+ E2K_DC_SUM16_CLIP(1);
+ E2K_DC_SUM16_CLIP(2);
+ E2K_DC_SUM16_CLIP(3);
+ VEC_ST(&p_dst[0 * FDEC_STRIDE], v0);
+ VEC_ST(&p_dst[1 * FDEC_STRIDE], v1);
+ VEC_ST(&p_dst[2 * FDEC_STRIDE], v2);
+ VEC_ST(&p_dst[3 * FDEC_STRIDE], v3);
+ }
+}
+
+#define IDCT_1D_E2K(s0, s1, s2, s3, d0, d1, d2, d3) { \
+ /* a0 = SRC(0) + SRC(2); */ \
+ /* a1 = SRC(0) - SRC(2); */ \
+ /* a2 = (SRC(1)>>1) - SRC(3); */ \
+ /* a3 = (SRC(3)>>1) + SRC(1); */ \
+ vec_s16_t a0v = _mm_add_epi16(s0, s2); \
+ vec_s16_t a1v = _mm_sub_epi16(s0, s2); \
+ vec_s16_t a2v = _mm_sub_epi16(_mm_srai_epi16(s1, 1), s3); \
+ vec_s16_t a3v = _mm_add_epi16(_mm_srai_epi16(s3, 1), s1); \
+ /* DST(0, a0 + a3); */ \
+ /* DST(1, a1 + a2); */ \
+ /* DST(2, a1 - a2); */ \
+ /* DST(3, a0 - a3); */ \
+ d0 = _mm_add_epi16(a0v, a3v); \
+ d1 = _mm_add_epi16(a1v, a2v); \
+ d2 = _mm_sub_epi16(a1v, a2v); \
+ d3 = _mm_sub_epi16(a0v, a3v); \
+}
+
+void x264_add4x4_idct_e2k(uint8_t *dst, int16_t dct[16]) {
+ dct[0] += 32; // rounding for the >>6 at the end
+
+ vec_s16_t s0, s1, s2, s3;
+ s0 = VEC_LD(dct);
+ s1 = _mm_unpackhi_epi64(s0, s0);
+ s2 = VEC_LD(dct + 8);
+ s3 = _mm_unpackhi_epi64(s2, s2);
+
+ vec_s16_t d0, d1, d2, d3, t0, t1, t2, t3;
+ IDCT_1D_E2K(s0, s1, s2, s3, d0, d1, d2, d3);
+ VEC_TRANSPOSE_4(d0, d1, d2, d3, t0, t1, t2, t3);
+ IDCT_1D_E2K(t0, t1, t2, t3, d0, d1, d2, d3);
+
+ LOAD_ZERO;
+ t2 = _mm_setr_epi32(*(uint32_t*)(dst + 0 * FDEC_STRIDE),
+ *(uint32_t*)(dst + 1 * FDEC_STRIDE),
+ *(uint32_t*)(dst + 2 * FDEC_STRIDE),
+ *(uint32_t*)(dst + 3 * FDEC_STRIDE));
+
+ d0 = _mm_srai_epi16(_mm_unpacklo_epi64(d0, d1), 6);
+ d2 = _mm_srai_epi16(_mm_unpacklo_epi64(d2, d3), 6);
+ t0 = _mm_adds_epi16(d0, _mm_unpacklo_epi8(t2, zerov));
+ t1 = _mm_adds_epi16(d2, _mm_unpackhi_epi8(t2, zerov));
+ t0 = _mm_packus_epi16(t0, t1);
+ *(uint32_t*)(dst + 0 * FDEC_STRIDE) = _mm_extract_epi32(t0, 0);
+ *(uint32_t*)(dst + 1 * FDEC_STRIDE) = _mm_extract_epi32(t0, 1);
+ *(uint32_t*)(dst + 2 * FDEC_STRIDE) = _mm_extract_epi32(t0, 2);
+ *(uint32_t*)(dst + 3 * FDEC_STRIDE) = _mm_extract_epi32(t0, 3);
+}
+
+#define VEC_TRANSPOSE_8X4(a0, a1, a2, a3, b0, b1, b2, b3) \
+ b0 = _mm_unpacklo_epi16(a0, a2); \
+ b1 = _mm_unpacklo_epi16(a1, a3); \
+ b2 = _mm_unpackhi_epi16(a0, a2); \
+ b3 = _mm_unpackhi_epi16(a1, a3); \
+ a0 = _mm_unpacklo_epi16(b0, b1); \
+ a1 = _mm_unpackhi_epi16(b0, b1); \
+ a2 = _mm_unpacklo_epi16(b2, b3); \
+ a3 = _mm_unpackhi_epi16(b2, b3); \
+ b0 = _mm_unpacklo_epi64(a0, a2); \
+ b1 = _mm_unpackhi_epi64(a0, a2); \
+ b2 = _mm_unpacklo_epi64(a1, a3); \
+ b3 = _mm_unpackhi_epi64(a1, a3)
+
+static ALWAYS_INLINE void x264_add8x4_idct_e2k(uint8_t *dst, int16_t *dct) {
+ vec_s16_t d0, d1, d2, d3, t0, t1, t2, t3;
+ d0 = VEC_LD(dct);
+ d1 = VEC_LD(dct + 16);
+ d2 = VEC_LD(dct + 8);
+ d3 = VEC_LD(dct + 24);
+ t0 = _mm_unpacklo_epi64(d0, d1);
+ // rounding for the >>6 at the end
+ t0 = _mm_add_epi16(t0, _mm_setr_epi32(32, 0, 32, 0));
+ t1 = _mm_unpackhi_epi64(d0, d1);
+ t2 = _mm_unpacklo_epi64(d2, d3);
+ t3 = _mm_unpackhi_epi64(d2, d3);
+
+ IDCT_1D_E2K(t0, t1, t2, t3, d0, d1, d2, d3);
+ VEC_TRANSPOSE_8X4(d0, d1, d2, d3, t0, t1, t2, t3);
+ IDCT_1D_E2K(t0, t1, t2, t3, d0, d1, d2, d3);
+
+ LOAD_ZERO;
+ t0 = _mm_unpacklo_epi8(VEC_LD8(dst + 0 * FDEC_STRIDE), zerov);
+ t1 = _mm_unpacklo_epi8(VEC_LD8(dst + 1 * FDEC_STRIDE), zerov);
+ t2 = _mm_unpacklo_epi8(VEC_LD8(dst + 2 * FDEC_STRIDE), zerov);
+ t3 = _mm_unpacklo_epi8(VEC_LD8(dst + 3 * FDEC_STRIDE), zerov);
+ d0 = _mm_adds_epi16(_mm_srai_epi16(d0, 6), t0);
+ d1 = _mm_adds_epi16(_mm_srai_epi16(d1, 6), t1);
+ d2 = _mm_adds_epi16(_mm_srai_epi16(d2, 6), t2);
+ d3 = _mm_adds_epi16(_mm_srai_epi16(d3, 6), t3);
+ t0 = _mm_packus_epi16(d0, d1);
+ t1 = _mm_packus_epi16(d2, d3);
+ VEC_STL(dst + 0 * FDEC_STRIDE, t0);
+ VEC_STH(dst + 1 * FDEC_STRIDE, t0);
+ VEC_STL(dst + 2 * FDEC_STRIDE, t1);
+ VEC_STH(dst + 3 * FDEC_STRIDE, t1);
+}
+
+void x264_add8x8_idct_e2k(uint8_t *p_dst, int16_t dct[4][16]) {
+ x264_add8x4_idct_e2k(p_dst, dct[0]);
+ x264_add8x4_idct_e2k(p_dst + 4 * FDEC_STRIDE, dct[2]);
+}
+
+void x264_add16x16_idct_e2k(uint8_t *p_dst, int16_t dct[16][16]) {
+ x264_add8x8_idct_e2k(&p_dst[0], &dct[0]);
+ x264_add8x8_idct_e2k(&p_dst[8], &dct[4]);
+ x264_add8x8_idct_e2k(&p_dst[8 * FDEC_STRIDE], &dct[8]);
+ x264_add8x8_idct_e2k(&p_dst[8 * FDEC_STRIDE + 8], &dct[12]);
+}
+
+#define IDCT8_1D_E2K(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) { \
+ /* a0 = SRC(0) + SRC(4); */ \
+ /* a2 = SRC(0) - SRC(4); */ \
+ /* a4 = (SRC(2)>>1) - SRC(6); */ \
+ /* a6 = (SRC(6)>>1) + SRC(2); */ \
+ vec_s16_t a0v = _mm_add_epi16(s0, s4); \
+ vec_s16_t a2v = _mm_sub_epi16(s0, s4); \
+ vec_s16_t a4v = _mm_sub_epi16(_mm_srai_epi16(s2, 1), s6); \
+ vec_s16_t a6v = _mm_add_epi16(_mm_srai_epi16(s6, 1), s2); \
+ /* b0 = a0 + a6; */ \
+ /* b2 = a2 + a4; */ \
+ /* b4 = a2 - a4; */ \
+ /* b6 = a0 - a6; */ \
+ vec_s16_t b0v = _mm_add_epi16(a0v, a6v); \
+ vec_s16_t b2v = _mm_add_epi16(a2v, a4v); \
+ vec_s16_t b4v = _mm_sub_epi16(a2v, a4v); \
+ vec_s16_t b6v = _mm_sub_epi16(a0v, a6v); \
+ /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
+ /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
+ /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
+ /* a7 = SRC(5) + SRC(3) + SRC(1) + (SRC(1)>>1); */ \
+ vec_s16_t a1v = _mm_sub_epi16(_mm_sub_epi16(s5, s3), _mm_add_epi16(s7, _mm_srai_epi16(s7, 1))); \
+ vec_s16_t a3v = _mm_sub_epi16(_mm_add_epi16(s7, s1), _mm_add_epi16(s3, _mm_srai_epi16(s3, 1))); \
+ vec_s16_t a5v = _mm_add_epi16(_mm_sub_epi16(s7, s1), _mm_add_epi16(s5, _mm_srai_epi16(s5, 1))); \
+ vec_s16_t a7v = _mm_add_epi16(_mm_add_epi16(s5, s3), _mm_add_epi16(s1, _mm_srai_epi16(s1, 1))); \
+ /* b1 = (a7>>2) + a1; */ \
+ /* b3 = a3 + (a5>>2); */ \
+ /* b5 = (a3>>2) - a5; */ \
+ /* b7 = a7 - (a1>>2); */ \
+ vec_s16_t b1v = _mm_add_epi16(_mm_srai_epi16(a7v, 2), a1v); \
+ vec_s16_t b3v = _mm_add_epi16(a3v, _mm_srai_epi16(a5v, 2)); \
+ vec_s16_t b5v = _mm_sub_epi16(_mm_srai_epi16(a3v, 2), a5v); \
+ vec_s16_t b7v = _mm_sub_epi16(a7v, _mm_srai_epi16(a1v, 2)); \
+ /* DST(0, b0 + b7); */ \
+ /* DST(1, b2 + b5); */ \
+ /* DST(2, b4 + b3); */ \
+ /* DST(3, b6 + b1); */ \
+ d0 = _mm_add_epi16(b0v, b7v); \
+ d1 = _mm_add_epi16(b2v, b5v); \
+ d2 = _mm_add_epi16(b4v, b3v); \
+ d3 = _mm_add_epi16(b6v, b1v); \
+ /* DST(4, b6 - b1); */ \
+ /* DST(5, b4 - b3); */ \
+ /* DST(6, b2 - b5); */ \
+ /* DST(7, b0 - b7); */ \
+ d4 = _mm_sub_epi16(b6v, b1v); \
+ d5 = _mm_sub_epi16(b4v, b3v); \
+ d6 = _mm_sub_epi16(b2v, b5v); \
+ d7 = _mm_sub_epi16(b0v, b7v); \
+}
+
+#define E2K_STORE_SUM_CLIP(i, d0, d1) { \
+ uint8_t *ptr = dst + i * FDEC_STRIDE; \
+ /* unaligned load */ \
+ t0 = _mm_unpacklo_epi8(VEC_LD8(ptr), zerov); \
+ t1 = _mm_unpacklo_epi8(VEC_LD8(ptr + FDEC_STRIDE), zerov); \
+ d0 = _mm_adds_epi16(_mm_srai_epi16(d0, 6), t0); \
+ d1 = _mm_adds_epi16(_mm_srai_epi16(d1, 6), t1); \
+ d0 = _mm_packus_epi16(d0, d1); \
+ /* unaligned store */ \
+ VEC_STL(ptr, d0); \
+ VEC_STH(ptr + FDEC_STRIDE, d0); \
+}
+
+void x264_add8x8_idct8_e2k(uint8_t *dst, int16_t dct[64]) {
+ dct[0] += 32; // rounding for the >>6 at the end
+
+ vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
+ vec_s16_t t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = VEC_LD(dct + 8 * 0);
+ t1 = VEC_LD(dct + 8 * 1);
+ t2 = VEC_LD(dct + 8 * 2);
+ t3 = VEC_LD(dct + 8 * 3);
+ t4 = VEC_LD(dct + 8 * 4);
+ t5 = VEC_LD(dct + 8 * 5);
+ t6 = VEC_LD(dct + 8 * 6);
+ t7 = VEC_LD(dct + 8 * 7);
+
+ IDCT8_1D_E2K(t0, t1, t2, t3, t4, t5, t6, t7,
+ d0, d1, d2, d3, d4, d5, d6, d7);
+ VEC_TRANSPOSE_8(d0, d1, d2, d3, d4, d5, d6, d7,
+ t0, t1, t2, t3, t4, t5, t6, t7);
+ IDCT8_1D_E2K(t0, t1, t2, t3, t4, t5, t6, t7,
+ d0, d1, d2, d3, d4, d5, d6, d7);
+ LOAD_ZERO;
+ E2K_STORE_SUM_CLIP(0, d0, d1);
+ E2K_STORE_SUM_CLIP(2, d2, d3);
+ E2K_STORE_SUM_CLIP(4, d4, d5);
+ E2K_STORE_SUM_CLIP(6, d6, d7);
+}
+
+void x264_add16x16_idct8_e2k(uint8_t *dst, int16_t dct[4][64]) {
+ x264_add8x8_idct8_e2k(&dst[0], dct[0]);
+ x264_add8x8_idct8_e2k(&dst[8], dct[1]);
+ x264_add8x8_idct8_e2k(&dst[8 * FDEC_STRIDE], dct[2]);
+ x264_add8x8_idct8_e2k(&dst[8 * FDEC_STRIDE + 8], dct[3]);
+}
+
+#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \
+ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1)
+
+#define SHUF16X8(a, b, c, d, e, f, g, h) _mm_setr_epi8( \
+ a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1, \
+ e * 2, e * 2 + 1, f * 2, f * 2 + 1, g * 2, g * 2 + 1, h * 2, h * 2 + 1)
+
+void x264_zigzag_scan_4x4_frame_e2k(int16_t level[16], int16_t dct[16]) {
+#if defined(__iset__) && __iset__ >= 5
+ __m128i v0, v1, v2, v3;
+
+ v0 = VEC_LD(dct);
+ v1 = VEC_LD(dct + 8);
+ v2 = _mm_shuffle2_epi8(v0, v1, SHUF16X8(0, 4, 1, 2, 5, 8, 12, 9));
+ v3 = _mm_shuffle2_epi8(v0, v1, SHUF16X8(6, 3, 7, 10, 13, 14, 11, 15));
+ VEC_ST(level, v2);
+ VEC_ST(level + 8, v3);
+#else
+ __m64 v0, v1, v2, v3, v4, v5, v6, v7;
+
+ v0 = *(__m64*)dct;
+ v1 = *(__m64*)(dct + 4);
+ v2 = *(__m64*)(dct + 8);
+ v3 = *(__m64*)(dct + 12);
+
+ __m64 mask = _mm_setr_pi16(-1, 0, 0, 0);
+ __m64 i5637 = SHUF16X4(5, 6, 3, 7);
+
+ v4 = _mm_shuffle2_pi8(v0, v1, SHUF16X4(0, 4, 1, 2));
+ v5 = _mm_shuffle2_pi8(v0, v1, i5637);
+ v6 = _mm_shuffle2_pi8(v2, v3, SHUF16X4(2, 0, 4, 1));
+ v7 = _mm_shuffle2_pi8(v2, v3, i5637);
+
+ v1 = _mm_blendv_pi8(v6, v5, mask);
+ v2 = _mm_alignr_pi8(v6, v5, 2);
+
+ *(__m64*)level = v4;
+ *(__m64*)(level + 4) = v1;
+ *(__m64*)(level + 8) = v2;
+ *(__m64*)(level + 12) = v7;
+#endif
+}
+
+void x264_zigzag_scan_4x4_field_e2k(int16_t level[16], int16_t dct[16]) {
+ vec_s16_t dct0v, dct1v, tmp0v, tmp1v;
+
+ dct0v = VEC_LD(dct);
+ dct1v = VEC_LD(dct + 8);
+
+ vec_u8_t sel0 = SHUF16X8(0, 1, 4, 2, 3, 5, 6, 7);
+
+ tmp0v = _mm_shuffle_epi8(dct0v, sel0);
+ tmp1v = dct1v;
+
+ VEC_ST(level, tmp0v);
+ VEC_ST(level + 8, tmp1v);
+}
+
+#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \
+ _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c)))
+
+void x264_zigzag_scan_8x8_frame_e2k(int16_t level[64], int16_t dct[64]) {
+#if defined(__iset__) && __iset__ >= 5
+ vec_s16_t tmpv0, tmpv1, tmpv2, tmpv3, tmpv4, tmpv5;
+ vec_s16_t dct0v = VEC_LD(dct + 8 * 0);
+ vec_s16_t dct1v = VEC_LD(dct + 8 * 1);
+ vec_s16_t dct2v = VEC_LD(dct + 8 * 2);
+ vec_s16_t dct3v = VEC_LD(dct + 8 * 3);
+ vec_s16_t dct4v = VEC_LD(dct + 8 * 4);
+ vec_s16_t dct5v = VEC_LD(dct + 8 * 5);
+ vec_s16_t dct6v = VEC_LD(dct + 8 * 6);
+ vec_s16_t dct7v = VEC_LD(dct + 8 * 7);
+
+ tmpv0 = _mm_unpacklo_epi16(dct0v, dct1v);
+ tmpv1 = _mm_unpacklo_epi16(dct2v, dct3v);
+ tmpv2 = _mm_unpacklo_epi32(tmpv0, tmpv1);
+ tmpv3 = _mm_shuffle2_epi8(tmpv2, dct0v, SHUF16X8( 0, 1, 9, 10, 5, 2, 3, 6));
+ VEC_ST(level + 8 * 0, tmpv3);
+
+ tmpv4 = _mm_unpacklo_epi16(dct4v, dct5v);
+ tmpv3 = _mm_shuffle2_epi8(tmpv0, tmpv4, SHUF16X8( 5, 6, 0, 7, 0, 0, 8, 9));
+ tmpv3 = _mm_shuffle2_epi8(tmpv3, dct0v, SHUF16X8( 0, 1, 12, 3, 4, 5, 6, 7));
+ tmpv3 = _mm_shuffle2_epi8(tmpv3, tmpv1, SHUF16X8( 0, 1, 2, 3, 12, 11, 6, 7));
+ VEC_ST(level + 8 * 1, tmpv3);
+
+ tmpv3 = _mm_unpackhi_epi16(dct0v, dct1v);
+ tmpv1 = _mm_unpackhi_epi16(tmpv1, dct2v);
+ tmpv5 = _mm_shuffle2_epi8(tmpv3, tmpv1, SHUF16X8( 0, 10, 12, 1, 2, 4, 3, 9));
+ tmpv5 = _mm_shuffle2_epi8(tmpv5, dct4v, SHUF16X8( 9, 1, 2, 3, 4, 5, 6, 7));
+ VEC_ST(level + 8 * 2, tmpv5);
+
+ tmpv2 = _mm_unpacklo_epi16(dct5v, dct6v);
+ tmpv5 = _mm_unpacklo_epi16(tmpv2, dct7v);
+ tmpv4 = _mm_unpackhi_epi16(tmpv4, tmpv1);
+ tmpv0 = _mm_shuffle2_epi8(tmpv5, tmpv4, SHUF16X8(13, 8, 4, 2, 1, 6, 10, 12));
+ VEC_ST(level + 8 * 3, tmpv0);
+
+ tmpv1 = _mm_unpackhi_epi16(dct2v, dct3v);
+ tmpv0 = _mm_unpackhi_epi16(dct4v, dct5v);
+ tmpv4 = _mm_shuffle2_epi8(tmpv1, tmpv0, SHUF16X8( 1, 2, 3, 4, 8, 9, 10, 5));
+ tmpv3 = _mm_shuffle2_epi8(tmpv4, tmpv3, SHUF16X8( 0, 1, 13, 14, 15, 3, 2, 4));
+ VEC_ST(level + 8 * 4, tmpv3);
+
+ tmpv3 = _mm_unpacklo_epi16(dct6v, dct7v);
+ tmpv2 = _mm_unpackhi_epi16(dct3v, dct4v);
+ tmpv2 = _mm_shuffle2_epi8(tmpv2, dct5v, SHUF16X8( 0, 11, 12, 3, 4, 5, 6, 13));
+ tmpv3 = _mm_shuffle2_epi8(tmpv2, tmpv3, SHUF16X8( 1, 12, 11, 13, 14, 2, 3, 4));
+ VEC_ST(level + 8 * 5, tmpv3);
+
+ tmpv1 = _mm_unpackhi_epi16(tmpv1, tmpv2);
+ tmpv2 = _mm_unpackhi_epi16(dct6v, dct7v);
+ tmpv1 = _mm_shuffle2_epi8(tmpv1, tmpv2, SHUF16X8( 4, 5, 3, 7, 8, 0, 9, 10));
+ tmpv1 = _mm_shuffle2_epi8(tmpv1, dct7v, SHUF16X8( 0, 1, 2, 3, 4, 11, 6, 7));
+ VEC_ST(level + 8 * 6, tmpv1);
+
+ tmpv2 = _mm_shuffle2_epi8(tmpv2, tmpv0, SHUF16X8(13, 14, 15, 4, 3, 5, 6, 7));
+ VEC_ST(level + 8 * 7, tmpv2);
+#else
+ __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h;
+ __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h;
+ __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6);
+
+ d0l = *(__m64*)(dct + 8 * 0); d0h = *(__m64*)(dct + 8 * 0 + 4);
+ d1l = *(__m64*)(dct + 8 * 1); d1h = *(__m64*)(dct + 8 * 1 + 4);
+ d2l = *(__m64*)(dct + 8 * 2); d2h = *(__m64*)(dct + 8 * 2 + 4);
+ d3l = *(__m64*)(dct + 8 * 3); d3h = *(__m64*)(dct + 8 * 3 + 4);
+ d4l = *(__m64*)(dct + 8 * 4); d4h = *(__m64*)(dct + 8 * 4 + 4);
+ d5l = *(__m64*)(dct + 8 * 5); d5h = *(__m64*)(dct + 8 * 5 + 4);
+ d6l = *(__m64*)(dct + 8 * 6); d6h = *(__m64*)(dct + 8 * 6 + 4);
+ d7l = *(__m64*)(dct + 8 * 7); d7h = *(__m64*)(dct + 8 * 7 + 4);
+
+ // d0l[0] d1l[0] d0l[1] d0l[2]
+ // d1l[1] d2l[0] d3l[0] d2l[1]
+ h0 = _mm_unpacklo_pi16(d0l, d1l);
+ h2 = _mm_unpacklo_pi32(d1l, d2l);
+ h3 = _mm_unpacklo_pi16(d2l, d3l);
+ r0 = INSFI_M64(d0l, -16, h0, 48);
+ r1 = _mm_shuffle2_pi8(h2, h3, c1256);
+ *(__m64*)(level + 8 * 0) = r0;
+ *(__m64*)(level + 8 * 0 + 4) = r1;
+
+ // d1l[2] d0l[3] d0h[0] d1l[3]
+ // d2l[2] d3l[1] d4l[0] d5l[0]
+ h0 = INSFI_M64(d0l, 0, d1l, 48);
+ h1 = INSFI_M64(d1l, 32, d0h, 16);
+ h2 = _mm_unpacklo_pi16(d4l, d5l);
+ h3 = INSFI_M64(d3l, -32, d2l, 48);
+ r0 = _mm_alignr_pi8(h1, h0, 4);
+ r1 = _mm_alignr_pi8(h2, h3, 4);
+ *(__m64*)(level + 8 * 1) = r0;
+ *(__m64*)(level + 8 * 1 + 4) = r1;
+
+ // d4l[1] d3l[2] d2l[3] d1h[0]
+ // d0h[1] d0h[2] d1h[1] d2h[0]
+ h0 = INSFI_M64(d3l, 0, d4l, 32);
+ h1 = _mm_alignr_pi8(d1h, d2l, 4);
+ h2 = _mm_unpacklo_pi32(d1h, d2h);
+ r0 = _mm_shuffle2_pi8(h0, h1, c1256);
+ r1 = _mm_shuffle2_pi8(d0h, h2, c1256);
+ *(__m64*)(level + 8 * 2) = r0;
+ *(__m64*)(level + 8 * 2 + 4) = r1;
+
+ // d3l[3] d4l[2] d5l[1] d6l[0]
+ // d7l[0] d6l[1] d5l[2] d4l[3]
+ h0 = _mm_unpacklo_pi32(d5l, d6l);
+ h1 = _mm_unpackhi_pi32(d3l, d4l);
+ h2 = _mm_unpackhi_pi16(d4l, d5l);
+ h3 = INSFI_M64(d6l, 0, d7l, 16);
+ r0 = _mm_shuffle2_pi8(h1, h0, c1256);
+ r1 = INSFI_M64(h2, -16, h3, 32);
+ *(__m64*)(level + 8 * 3) = r0;
+ *(__m64*)(level + 8 * 3 + 4) = r1;
+
+ // d3h[0] d2h[1] d1h[2] d0h[3]
+ // d1h[3] d2h[2] d3h[1] d4h[0]
+ h0 = _mm_unpackhi_pi16(d0h, d1h);
+ h1 = INSFI_M64(d2h, 0, d3h, 16);
+ h2 = _mm_alignr_pi8(d3h, d1h, 4);
+ h3 = _mm_alignr_pi8(d4h, d2h, 4);
+// h2 = _mm_unpackhi_pi32(d1h, d2h);
+// h3 = _mm_unpacklo_pi32(d3h, d4h);
+ r0 = INSFI_M64(h0, -16, h1, 32);
+ r1 = _mm_shuffle2_pi8(h2, h3, SHUF16X4(1, 4, 3, 6));
+// r1 = _mm_shuffle2_pi8(h2, h3, c1256);
+ *(__m64*)(level + 8 * 4) = r0;
+ *(__m64*)(level + 8 * 4 + 4) = r1;
+
+ // d5l[3] d6l[2] d7l[1] d7l[2]
+ // d6l[3] d5h[0] d4h[1] d3h[2]
+ h0 = _mm_unpackhi_pi32(d5l, d6l);
+ h1 = INSFI_M64(d3h, 0, d4h, 32);
+ h2 = _mm_alignr_pi8(d5h, d6l, 6);
+ r0 = _mm_shuffle2_pi8(h0, d7l, c1256);
+ r1 = INSFI_M64(h1, -16, h2, 32);
+ *(__m64*)(level + 8 * 5) = r0;
+ *(__m64*)(level + 8 * 5 + 4) = r1;
+
+ // d2h[3] d3h[3] d4h[2] d5h[1]
+ // d6h[0] d7l[3] d7h[0] d6h[1]
+ h0 = _mm_unpackhi_pi16(d2h, d3h);
+ h1 = INSFI_M64(d5h, -32, d4h, 48);
+ h2 = _mm_unpacklo_pi16(d6h, d7h);
+ r0 = _mm_unpackhi_pi32(h0, h1);
+ r1 = _mm_shuffle2_pi8(h2, d7l, SHUF16X4(0, 7, 1, 2));
+ *(__m64*)(level + 8 * 6) = r0;
+ *(__m64*)(level + 8 * 6 + 4) = r1;
+
+ // d5h[2] d4h[3] d5h[3] d6h[2]
+ // d7h[1] d7h[2] d6h[3] d7h[3]
+ h0 = _mm_srli_si64(_mm_unpackhi_pi16(d4h, d5h), 16);
+ r0 = INSFI_M64(d6h, -16, h0, 48);
+ r1 = _mm_shuffle2_pi8(d7h, d6h, SHUF16X4(1, 2, 7, 3));
+ *(__m64*)(level + 8 * 7) = r0;
+ *(__m64*)(level + 8 * 7 + 4) = r1;
+#endif
+}
+
+void x264_zigzag_interleave_8x8_cavlc_e2k(int16_t *dst, int16_t *src, uint8_t *nnz) {
+ vec_s16_t merge0, merge1;
+ vec_s16_t permv0, permv1, permv2;
+ vec_s16_t src0v = VEC_LD(src + 8 * 0);
+ vec_s16_t src1v = VEC_LD(src + 8 * 1);
+ vec_s16_t src2v = VEC_LD(src + 8 * 2);
+ vec_s16_t src3v = VEC_LD(src + 8 * 3);
+ vec_s16_t src4v = VEC_LD(src + 8 * 4);
+ vec_s16_t src5v = VEC_LD(src + 8 * 5);
+ vec_s16_t src6v = VEC_LD(src + 8 * 6);
+ vec_s16_t src7v = VEC_LD(src + 8 * 7);
+
+ vec_s16_t tmpv0 = _mm_unpacklo_epi16(src0v, src1v);
+ vec_s16_t tmpv1 = _mm_unpackhi_epi16(src0v, src1v);
+ vec_s16_t tmpv2 = _mm_unpacklo_epi16(src2v, src3v);
+ vec_s16_t tmpv3 = _mm_unpackhi_epi16(src2v, src3v);
+ vec_s16_t tmpv4 = _mm_unpacklo_epi16(src4v, src5v);
+ vec_s16_t tmpv5 = _mm_unpackhi_epi16(src4v, src5v);
+ vec_s16_t tmpv6 = _mm_unpacklo_epi16(src6v, src7v);
+ vec_s16_t tmpv7 = _mm_unpackhi_epi16(src6v, src7v);
+
+ merge0 = _mm_unpacklo_epi16(tmpv0, tmpv1);
+ merge1 = _mm_unpacklo_epi16(tmpv2, tmpv3);
+ permv0 = _mm_unpacklo_epi64(merge0, merge1);
+ permv1 = _mm_unpackhi_epi64(merge0, merge1);
+ VEC_ST(dst + 8 * 0, permv0);
+
+ merge0 = _mm_unpacklo_epi16(tmpv4, tmpv5);
+ merge1 = _mm_unpacklo_epi16(tmpv6, tmpv7);
+ permv0 = _mm_unpacklo_epi64(merge0, merge1);
+ permv2 = _mm_unpackhi_epi64(merge0, merge1);
+ VEC_ST(dst + 8 * 1, permv0);
+ VEC_ST(dst + 8 * 2, permv1);
+ VEC_ST(dst + 8 * 3, permv2);
+
+ merge0 = _mm_unpackhi_epi16(tmpv0, tmpv1);
+ merge1 = _mm_unpackhi_epi16(tmpv2, tmpv3);
+ permv0 = _mm_unpacklo_epi64(merge0, merge1);
+ permv1 = _mm_unpackhi_epi64(merge0, merge1);
+ VEC_ST(dst + 8 * 4, permv0);
+
+ merge0 = _mm_unpackhi_epi16(tmpv4, tmpv5);
+ merge1 = _mm_unpackhi_epi16(tmpv6, tmpv7);
+ permv0 = _mm_unpacklo_epi64(merge0, merge1);
+ permv2 = _mm_unpackhi_epi64(merge0, merge1);
+ VEC_ST(dst + 8 * 5, permv0);
+ VEC_ST(dst + 8 * 6, permv1);
+ VEC_ST(dst + 8 * 7, permv2);
+
+ permv0 = _mm_or_si128(_mm_or_si128(src0v, src1v), _mm_or_si128(src2v, src3v));
+ permv1 = _mm_or_si128(_mm_or_si128(src4v, src5v), _mm_or_si128(src6v, src7v));
+ permv0 = _mm_or_si128(permv0, permv1);
+
+ __m64 h0, h1;
+ h0 = _mm_movepi64_pi64(permv0);
+ h1 = _mm_movepi64_pi64(_mm_unpackhi_epi64(permv0, permv0));
+ h0 = _mm_or_si64(h0, h1);
+
+ h0 = _mm_packs_pi16(h0, h0);
+ h0 = _mm_cmpeq_pi8(h0, _mm_setzero_si64());
+ h0 = _mm_sub_pi8(h0, _mm_set1_pi8(-1)); /* ff 00 -> 00 01 */
+ *(uint16_t*)nnz = _mm_extract_pi16(h0, 0);
+ *(uint16_t*)(nnz + 8) = _mm_extract_pi16(h0, 1);
+}
+
+#endif // !HIGH_BIT_DEPTH
+
diff --git a/common/e2k/dct.h b/common/e2k/dct.h
new file mode 100644
index 0000000..bdf682a
--- /dev/null
+++ b/common/e2k/dct.h
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * dct.h: e2k transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_E2K_DCT_H
+#define X264_E2K_DCT_H
+
+#define x264_sub4x4_dct_e2k x264_template(sub4x4_dct_e2k)
+void x264_sub4x4_dct_e2k(int16_t dct[16], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub8x8_dct_e2k x264_template(sub8x8_dct_e2k)
+void x264_sub8x8_dct_e2k(int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub16x16_dct_e2k x264_template(sub16x16_dct_e2k)
+void x264_sub16x16_dct_e2k(int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2);
+
+#define x264_add8x8_idct_dc_e2k x264_template(add8x8_idct_dc_e2k)
+void x264_add8x8_idct_dc_e2k(uint8_t *p_dst, int16_t dct[4]);
+#define x264_add16x16_idct_dc_e2k x264_template(add16x16_idct_dc_e2k)
+void x264_add16x16_idct_dc_e2k(uint8_t *p_dst, int16_t dct[16]);
+
+#define x264_add4x4_idct_e2k x264_template(add4x4_idct_e2k)
+void x264_add4x4_idct_e2k(uint8_t *p_dst, int16_t dct[16]);
+#define x264_add8x8_idct_e2k x264_template(add8x8_idct_e2k)
+void x264_add8x8_idct_e2k(uint8_t *p_dst, int16_t dct[4][16]);
+#define x264_add16x16_idct_e2k x264_template(add16x16_idct_e2k)
+void x264_add16x16_idct_e2k(uint8_t *p_dst, int16_t dct[16][16]);
+
+#define x264_sub8x8_dct_dc_e2k x264_template(sub8x8_dct_dc_e2k)
+void x264_sub8x8_dct_dc_e2k(int16_t dct[4], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub8x16_dct_dc_e2k x264_template(sub8x16_dct_dc_e2k)
+void x264_sub8x16_dct_dc_e2k(int16_t dct[8], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub8x8_dct8_e2k x264_template(sub8x8_dct8_e2k)
+void x264_sub8x8_dct8_e2k(int16_t dct[64], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub16x16_dct8_e2k x264_template(sub16x16_dct8_e2k)
+void x264_sub16x16_dct8_e2k(int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2);
+
+#define x264_add8x8_idct8_e2k x264_template(add8x8_idct8_e2k)
+void x264_add8x8_idct8_e2k(uint8_t *dst, int16_t dct[64]);
+#define x264_add16x16_idct8_e2k x264_template(add16x16_idct8_e2k)
+void x264_add16x16_idct8_e2k(uint8_t *dst, int16_t dct[4][64]);
+
+#define x264_zigzag_scan_4x4_frame_e2k x264_template(zigzag_scan_4x4_frame_e2k)
+void x264_zigzag_scan_4x4_frame_e2k(int16_t level[16], int16_t dct[16]);
+#define x264_zigzag_scan_4x4_field_e2k x264_template(zigzag_scan_4x4_field_e2k)
+void x264_zigzag_scan_4x4_field_e2k(int16_t level[16], int16_t dct[16]);
+#define x264_zigzag_scan_8x8_frame_e2k x264_template(zigzag_scan_8x8_frame_e2k)
+void x264_zigzag_scan_8x8_frame_e2k(int16_t level[64], int16_t dct[64]);
+#define x264_zigzag_interleave_8x8_cavlc_e2k x264_template(zigzag_interleave_8x8_cavlc_e2k)
+void x264_zigzag_interleave_8x8_cavlc_e2k(int16_t *dst, int16_t *src, uint8_t *nnz);
+
+#endif
diff --git a/common/e2k/deblock.c b/common/e2k/deblock.c
new file mode 100644
index 0000000..349578f
--- /dev/null
+++ b/common/e2k/deblock.c
@@ -0,0 +1,236 @@
+/*****************************************************************************
+ * deblock.c: e2k deblocking
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ * Copyright (C) 2007-2017 x264 project
+ *
+ * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "e2kcommon.h"
+#include "deblock.h"
+
+#if !HIGH_BIT_DEPTH
+#define transpose4x16(r0, r1, r2, r3) { \
+ vec_u8_t r4, r5, r6, r7; \
+ \
+ r4 = _mm_unpacklo_epi8(r0, r2); /*0, 2 set 0*/ \
+ r5 = _mm_unpackhi_epi8(r0, r2); /*0, 2 set 1*/ \
+ r6 = _mm_unpacklo_epi8(r1, r3); /*1, 3 set 0*/ \
+ r7 = _mm_unpackhi_epi8(r1, r3); /*1, 3 set 1*/ \
+ \
+ r0 = _mm_unpacklo_epi8(r4, r6); /*all set 0*/ \
+ r1 = _mm_unpackhi_epi8(r4, r6); /*all set 1*/ \
+ r2 = _mm_unpacklo_epi8(r5, r7); /*all set 2*/ \
+ r3 = _mm_unpackhi_epi8(r5, r7); /*all set 3*/ \
+}
+
+#define WRITE4(i, j) ((uint32_t*)dst)[(i * 4 + j) * (dst_stride >> 2)] = _mm_extract_epi32(r##i, j);
+
+static ALWAYS_INLINE void write16x4(uint8_t *dst, int dst_stride,
+ vec_u8_t r0, vec_u8_t r1,
+ vec_u8_t r2, vec_u8_t r3) {
+
+ WRITE4(0, 0) WRITE4(0, 1) WRITE4(0, 2) WRITE4(0, 3)
+ WRITE4(1, 0) WRITE4(1, 1) WRITE4(1, 2) WRITE4(1, 3)
+ WRITE4(2, 0) WRITE4(2, 1) WRITE4(2, 2) WRITE4(2, 3)
+ WRITE4(3, 0) WRITE4(3, 1) WRITE4(3, 2) WRITE4(3, 3)
+}
+
+/** \brief reads 16 rows from src and transposes the first six columns into the vectors r8..r13 */
+#define read_and_transpose16x6(src, st, r8, r9, r10, r11, r12, r13) { \
+ vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15; \
+ r0 = VEC_LD8(src); \
+ r1 = VEC_LD8(src + st); \
+ r2 = VEC_LD8(src + st * 2); \
+ r3 = VEC_LD8(src + st * 3); \
+ r4 = VEC_LD8(src + st * 4); \
+ r5 = VEC_LD8(src + st * 5); \
+ r6 = VEC_LD8(src + st * 6); \
+ r7 = VEC_LD8(src + st * 7); \
+ r8 = VEC_LD8(src + st * 8); \
+ r9 = VEC_LD8(src + st * 9); \
+ r10 = VEC_LD8(src + st * 10); \
+ r11 = VEC_LD8(src + st * 11); \
+ r12 = VEC_LD8(src + st * 12); \
+ r13 = VEC_LD8(src + st * 13); \
+ r14 = VEC_LD8(src + st * 14); \
+ r15 = VEC_LD8(src + st * 15); \
+ \
+ /*Merge first pairs*/ \
+ r0 = _mm_unpacklo_epi8(r0, r4); /* 0, 4 */ \
+ r1 = _mm_unpacklo_epi8(r1, r5); /* 1, 5 */ \
+ r2 = _mm_unpacklo_epi8(r2, r6); /* 2, 6 */ \
+ r3 = _mm_unpacklo_epi8(r3, r7); /* 3, 7 */ \
+ r4 = _mm_unpacklo_epi8(r8, r12); /* 8,12 */ \
+ r5 = _mm_unpacklo_epi8(r9, r13); /* 9,13 */ \
+ r6 = _mm_unpacklo_epi8(r10, r14); /* 10,14 */ \
+ r7 = _mm_unpacklo_epi8(r11, r15); /* 11,15 */ \
+ \
+ /*Merge second pairs*/ \
+ r8 = _mm_unpacklo_epi8(r0, r2); /* 0, 2, 4, 6 set 0 */ \
+ r9 = _mm_unpackhi_epi8(r0, r2); /* 0, 2, 4, 6 set 1 */ \
+ r10 = _mm_unpacklo_epi8(r1, r3); /* 1, 3, 5, 7 set 0 */ \
+ r11 = _mm_unpackhi_epi8(r1, r3); /* 1, 3, 5, 7 set 1 */ \
+ r12 = _mm_unpacklo_epi8(r4, r6); /* 8,10,12,14 set 0 */ \
+ r13 = _mm_unpackhi_epi8(r4, r6); /* 8,10,12,14 set 1 */ \
+ r14 = _mm_unpacklo_epi8(r5, r7); /* 9,11,13,15 set 0 */ \
+ r15 = _mm_unpackhi_epi8(r5, r7); /* 9,11,13,15 set 1 */ \
+ \
+ /*Third merge*/ \
+ r0 = _mm_unpacklo_epi8(r8, r10); /* 0..7 set 0 */ \
+ r1 = _mm_unpackhi_epi8(r8, r10); /* 0..7 set 1 */ \
+ r2 = _mm_unpacklo_epi8(r9, r11); /* 0..7 set 2 */ \
+ r4 = _mm_unpacklo_epi8(r12, r14); /* 8..15 set 0 */ \
+ r5 = _mm_unpackhi_epi8(r12, r14); /* 8..15 set 1 */ \
+ r6 = _mm_unpacklo_epi8(r13, r15); /* 8..15 set 2 */ \
+ /* Don't need to compute 3 and 7*/ \
+ \
+ /*Final merge*/ \
+ r8 = _mm_unpacklo_epi64(r0, r4); /* all set 0 */ \
+ r9 = _mm_unpackhi_epi64(r0, r4); /* all set 1 */ \
+ r10 = _mm_unpacklo_epi64(r1, r5); /* all set 2 */ \
+ r11 = _mm_unpackhi_epi64(r1, r5); /* all set 3 */ \
+ r12 = _mm_unpacklo_epi64(r2, r6); /* all set 4 */ \
+ r13 = _mm_unpackhi_epi64(r2, r6); /* all set 5 */ \
+ /* Don't need to compute 14 and 15*/ \
+}
+
+// out: o = |x-y| < a
+static ALWAYS_INLINE vec_u8_t diff_lt_e2k(vec_u8_t x, vec_u8_t y, vec_u8_t a) {
+ vec_u8_t o = _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x)); /* |x-y| */
+ return _mm_cmpgt_epi8(a, _mm_xor_si128(o, _mm_set1_epi8(-128)));
+}
+
+static ALWAYS_INLINE vec_u8_t h264_deblock_mask(vec_u8_t p0, vec_u8_t p1, vec_u8_t q0,
+ vec_u8_t q1, vec_u8_t alpha, vec_u8_t beta) {
+ vec_u8_t mask, tempmask;
+
+ mask = diff_lt_e2k(p0, q0, alpha);
+ tempmask = diff_lt_e2k(p1, p0, beta);
+ mask = _mm_and_si128(mask, tempmask);
+ tempmask = diff_lt_e2k(q1, q0, beta);
+ mask = _mm_and_si128(mask, tempmask);
+
+ return mask;
+}
+
+// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
+static ALWAYS_INLINE vec_u8_t h264_deblock_q1(vec_u8_t p0, vec_u8_t p1, vec_u8_t p2,
+ vec_u8_t q0, vec_u8_t tc0) {
+
+ vec_u8_t average = _mm_avg_epu8(p0, q0);
+ vec_u8_t temp, unclipped;
+ vec_u8_t ones = _mm_set1_epi8(1), maxv, minv;
+
+ temp = _mm_xor_si128(average, p2);
+ average = _mm_avg_epu8(average, p2); /* avg(p2, avg(p0, q0)) */
+ temp = _mm_and_si128(temp, ones); /* (p2^avg(p0, q0)) & 1 */
+ unclipped = _mm_subs_epu8(average, temp); /* (p2+((p0+q0+1)>>1))>>1 */
+ maxv = _mm_adds_epu8(p1, tc0);
+ minv = _mm_subs_epu8(p1, tc0);
+ return _mm_min_epu8(maxv, _mm_max_epu8(minv, unclipped));
+}
+
+#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
+ vec_u8_t pq0bit = _mm_xor_si128(p0, q0); \
+ vec_u8_t q1minus, p0minus, stage1, stage2, c127 = _mm_set1_epi8(127); \
+ vec_u8_t vec160 = _mm_set1_epi8(160), delta, deltaneg, notv = _mm_set1_epi8(-1); \
+ \
+ q1minus = _mm_xor_si128(q1, notv); /* 255 - q1 */ \
+ stage1 = _mm_avg_epu8(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
+ stage2 = _mm_srli_epi16(stage1, 1); \
+ stage2 = _mm_and_si128(stage2, c127); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
+ p0minus = _mm_xor_si128(p0, notv); /* 255 - p0 */ \
+ stage1 = _mm_avg_epu8(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \
+ pq0bit = _mm_and_si128(pq0bit, _mm_set1_epi8(1)); \
+ stage2 = _mm_avg_epu8(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
+ stage2 = _mm_adds_epu8(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
+ deltaneg = _mm_subs_epu8(vec160, stage2); /* -d */ \
+ delta = _mm_subs_epu8(stage2, vec160); /* d */ \
+ deltaneg = _mm_min_epu8(tc0masked, deltaneg); \
+ delta = _mm_min_epu8(tc0masked, delta); \
+ p0 = _mm_subs_epu8(p0, deltaneg); \
+ q0 = _mm_subs_epu8(q0, delta); \
+ p0 = _mm_adds_epu8(p0, delta); \
+ q0 = _mm_adds_epu8(q0, deltaneg); \
+}
+
+#define h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
+ vec_u8_t alphavec, betavec, mask, p1mask, q1mask; \
+ vec_s8_t tc0vec; \
+ vec_u8_t finaltc0, tc0masked, newp1, newq1; \
+ \
+ betavec = _mm_set1_epi8(beta - 128); \
+ alphavec = _mm_set1_epi8(alpha - 128); \
+ mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /* if in block */ \
+ \
+ tc0vec = _mm_cvtsi32_si128(*(uint32_t*)tc0); \
+ tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); \
+ tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); \
+ mask = _mm_blendv_epi8(mask, _mm_setzero_si128(), tc0vec); \
+ finaltc0 = _mm_and_si128(tc0vec, mask); /* if (tc0[i] >= 0) tc = tc0 */ \
+ \
+ p1mask = diff_lt_e2k(p2, p0, betavec); \
+ p1mask = _mm_and_si128(p1mask, mask); /* if(|p2 - p0| < beta) */ \
+ tc0masked = _mm_and_si128(p1mask, tc0vec); \
+ finaltc0 = _mm_sub_epi8(finaltc0, p1mask); /* tc++ */ \
+ newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
+ /*end if*/ \
+ \
+ q1mask = diff_lt_e2k(q2, q0, betavec); \
+ q1mask = _mm_and_si128(q1mask, mask); /* if(|q2 - q0| < beta) */ \
+ tc0masked = _mm_and_si128(q1mask, tc0vec); \
+ finaltc0 = _mm_sub_epi8(finaltc0, q1mask); /* tc++ */ \
+ newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
+ /*end if*/ \
+ \
+ h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
+ p1 = newp1; \
+ q1 = newq1; \
+}
+
+void x264_deblock_v_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0) {
+ if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
+ vec_u8_t p2, p1, p0, q0, q1, q2;
+ p2 = VEC_LD(pix - 3 * stride);
+ p1 = VEC_LD(pix - 2 * stride);
+ p0 = VEC_LD(pix - stride);
+ q0 = VEC_LD(pix);
+ q1 = VEC_LD(pix + stride);
+ q2 = VEC_LD(pix + 2 * stride);
+ h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
+ VEC_ST(pix - 2 * stride, p1);
+ VEC_ST(pix - 1 * stride, p0);
+ VEC_ST(pix, q0);
+ VEC_ST(pix + stride, q1);
+ }
+}
+
+void x264_deblock_h_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0) {
+ vec_u8_t line0, line1, line2, line3, line4, line5;
+ if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return;
+ read_and_transpose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
+ h264_loop_filter_luma_e2k(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
+ transpose4x16(line1, line2, line3, line4);
+ write16x4(pix-2, stride, line1, line2, line3, line4);
+}
+#endif // !HIGH_BIT_DEPTH
diff --git a/common/e2k/deblock.h b/common/e2k/deblock.h
new file mode 100644
index 0000000..87302e2
--- /dev/null
+++ b/common/e2k/deblock.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * deblock.h: e2k deblocking
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_E2K_DEBLOCK_H
+#define X264_E2K_DEBLOCK_H
+
+#define x264_deblock_v_luma_e2k x264_template(deblock_v_luma_e2k)
+void x264_deblock_v_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0);
+#define x264_deblock_h_luma_e2k x264_template(deblock_h_luma_e2k)
+void x264_deblock_h_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0);
+
+#endif
diff --git a/common/e2k/e2kcommon.h b/common/e2k/e2kcommon.h
new file mode 100644
index 0000000..83d0a6c
--- /dev/null
+++ b/common/e2k/e2kcommon.h
@@ -0,0 +1,253 @@
+/*****************************************************************************
+ * e2kcommon.h: e2k utility macros
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ * Copyright (C) 2003-2017 x264 project
+ *
+ * Authors: Eric Petit <eric.petit@lapsus.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_E2K_E2KCOMMON_H
+#define X264_E2K_E2KCOMMON_H
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#ifdef ALWAYS_INLINE
+#undef ALWAYS_INLINE
+#endif
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
+
+#ifdef __e2k__
+#define PRAGMA_E2K _Pragma
+#define _mm_shuffle2_pi8(a, b, c) \
+ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
+#define _mm_blendv_pi8(a, b, c) \
+ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
+#else
+#define PRAGMA_E2K(x)
+#define _mm_shuffle2_pi8(a, b, c) \
+ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
+ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
+ _mm_slli_epi16(c, 3))
+#define _mm_blendv_pi8(a, b, c) \
+ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
+ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
+
+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
+ int n = b & 63;
+ a = a >> n | a << (64 - n);
+ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
+}
+#endif
+
+#define _mm_extract_pi32(a, b) _mm_extract_epi32(_mm_movpi64_epi64(a), b)
+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1))
+
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8_t __m128i
+#define vec_s8_t __m128i
+#define vec_u16_t __m128i
+#define vec_s16_t __m128i
+#define vec_u32_t __m128i
+#define vec_s32_t __m128i
+#define vec_u64_t __m128i
+#define vec_s64_t __m128i
+
+/***********************************************************************
+ * Null vector
+ **********************************************************************/
+#define LOAD_ZERO const vec_u8_t zerov = _mm_setzero_si128()
+
+/***********************************************************************
+ * 8 <-> 16 bits conversions
+ **********************************************************************/
+#define vec_u8_to_u16_h(v) _mm_unpacklo_epi8(v, zerov)
+#define vec_u8_to_u16_l(v) _mm_unpackhi_epi8(v, zerov)
+#define vec_u8_to_s16_h(v) _mm_unpacklo_epi8(v, zerov)
+#define vec_u8_to_s16_l(v) _mm_unpackhi_epi8(v, zerov)
+
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
+
+/***********************************************************************
+ * 16 <-> 32 bits conversions
+ **********************************************************************/
+#define vec_u16_to_u32_h(v) _mm_unpacklo_epi16(v, zerov)
+#define vec_u16_to_u32_l(v) _mm_unpackhi_epi16(v, zerov)
+#define vec_u16_to_s32_h(v) _mm_unpacklo_epi16(v, zerov)
+#define vec_u16_to_s32_l(v) _mm_unpackhi_epi16(v, zerov)
+
+#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
+#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
+
+/***********************************************************************
+ * VEC_TRANSPOSE_8
+ ***********************************************************************
+ * Transposes an 8x8 matrix of s16 vectors
+ **********************************************************************/
+#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
+ b0 = _mm_unpacklo_epi16(a0, a2); \
+ b1 = _mm_unpackhi_epi16(a0, a2); \
+ b2 = _mm_unpacklo_epi16(a1, a3); \
+ b3 = _mm_unpackhi_epi16(a1, a3); \
+ b4 = _mm_unpacklo_epi16(a4, a6); \
+ b5 = _mm_unpackhi_epi16(a4, a6); \
+ b6 = _mm_unpacklo_epi16(a5, a7); \
+ b7 = _mm_unpackhi_epi16(a5, a7); \
+ a0 = _mm_unpacklo_epi16(b0, b2); \
+ a1 = _mm_unpackhi_epi16(b0, b2); \
+ a2 = _mm_unpacklo_epi16(b1, b3); \
+ a3 = _mm_unpackhi_epi16(b1, b3); \
+ a4 = _mm_unpacklo_epi16(b4, b6); \
+ a5 = _mm_unpackhi_epi16(b4, b6); \
+ a6 = _mm_unpacklo_epi16(b5, b7); \
+ a7 = _mm_unpackhi_epi16(b5, b7); \
+ b0 = _mm_unpacklo_epi64(a0, a4); \
+ b1 = _mm_unpackhi_epi64(a0, a4); \
+ b2 = _mm_unpacklo_epi64(a1, a5); \
+ b3 = _mm_unpackhi_epi64(a1, a5); \
+ b4 = _mm_unpacklo_epi64(a2, a6); \
+ b5 = _mm_unpackhi_epi64(a2, a6); \
+ b6 = _mm_unpacklo_epi64(a3, a7); \
+ b7 = _mm_unpackhi_epi64(a3, a7)
+
+/***********************************************************************
+ * VEC_TRANSPOSE_4
+ ***********************************************************************
+ * Transposes a 4x4 matrix of s16 vectors.
+ * Actually source and destination are 8x4: the high elements of the
+ * source are discarded, and each destination vector carries its
+ * transposed row duplicated in both halves.
+ **********************************************************************/
+#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
+ b0 = _mm_unpacklo_epi16(a0, a2); \
+ b1 = _mm_unpacklo_epi16(a1, a3); \
+ a0 = _mm_unpacklo_epi16(b0, b1); \
+ a1 = _mm_unpackhi_epi16(b0, b1); \
+ b0 = _mm_unpacklo_epi64(a0, a0); \
+ b1 = _mm_unpackhi_epi64(a0, a0); \
+ b2 = _mm_unpacklo_epi64(a1, a1); \
+ b3 = _mm_unpackhi_epi64(a1, a1)
+
+/***********************************************************************
+ * VEC_DIFF_HL
+ ***********************************************************************
+ * p1, p2: u8 *
+ * i1, i2: int
+ * dh, dl: s16v
+ *
+ * Loads 16 bytes from p1 and p2, puts the diff of the high elements into
+ * dh and the diff of the low elements into dl, then increments p1 and p2
+ * by i1 and i2
+ **********************************************************************/
+#define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl) \
+ pix1v = VEC_LD(p1); \
+ pix2v = VEC_LD(p2); \
+ temp0v = vec_u8_to_s16_h(pix1v); \
+ temp1v = vec_u8_to_s16_l(pix1v); \
+ temp2v = vec_u8_to_s16_h(pix2v); \
+ temp3v = vec_u8_to_s16_l(pix2v); \
+ dh = _mm_sub_epi16(temp0v, temp2v); \
+ dl = _mm_sub_epi16(temp1v, temp3v); \
+ p1 += i1; \
+ p2 += i2
+
+/***********************************************************************
+* VEC_DIFF_H_8BYTE_ALIGNED
+***********************************************************************
+* p1, p2: u8 *
+* i1, i2, n: int
+* d: s16v
+*
+* Loads n bytes from p1 and p2, puts the diff of the high elements into
+* d, then increments p1 and p2 by i1 and i2.
+* Slightly faster when we know we are loading/diffing 8 bytes that are
+* 8-byte aligned (the AltiVec original needed two loads and two
+* vec_lvsl()'s for the unaligned case).
+**********************************************************************/
+#define PREP_DIFF_8BYTEALIGNED \
+LOAD_ZERO; \
+vec_s16_t pix1v, pix2v; \
+vec_u8_t pix1v8, pix2v8;
+
+#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d) \
+pix1v8 = VEC_LD8(p1); \
+pix2v8 = VEC_LD8(p2); \
+pix1v = vec_u8_to_s16(pix1v8); \
+pix2v = vec_u8_to_s16(pix2v8); \
+d = _mm_sub_epi16(pix1v, pix2v); \
+p1 += i1; \
+p2 += i2;
+
+#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a))
+#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b)
+#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a))
+#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b)
+#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b));
+
+#define VEC_SPLAT16(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8))
+
+#if !defined(__iset__) || __iset__ < 5
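+/* Helpers for loading 8/16 bytes from addresses that may be misaligned, for
+ * targets where unaligned 8-byte loads are unavailable or slow: the source is
+ * read as aligned 8-byte words and the wanted bytes are extracted with a
+ * two-source byte shuffle whose index vector is built from the low three bits
+ * of the pointer. */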
+#define NEED_ALIGN8
+#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, __attribute__((unused)) src_tmp1;
+#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index;
+#define ALIGN8_START(ptr, src) \
+ src_shr = (intptr_t)(ptr - 1) & 7; \
+ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \
+ src##_next = src##_ptr[src_shr == 7]; \
+ src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \
+ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8));
+#define ALIGN8_READ8(v0, src, i) \
+ src_tmp0 = src##_ptr[i + 1]; \
+ v0 = _mm_shuffle2_pi8(src##_next, src_tmp0, src##_index); \
+ src##_next = src_tmp0;
+#define ALIGN8_READ16(v0, src, i) \
+ src_tmp1 = src##_ptr[i * 2 + 1]; \
+ src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \
+ src##_next = src##_ptr[i * 2 + 2]; \
+ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \
+ v0 = _mm_setr_epi64(src_tmp0, src_tmp1);
+
+#define ALIGN8A_COMMON __m64 src_tmp0, src_tmp1;
+#define ALIGN8A_VARS(src) uint64_t src##_shr; __m64 *src##_ptr, src##_index;
+#define ALIGN8A_START(ptr, src) \
+ src##_shr = (intptr_t)(ptr - 1) & 7; \
+ src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \
+ src##_index = _mm_add_pi8(_mm_set1_pi8(src##_shr), \
+ _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); \
+ src##_shr = src##_shr == 7;
+#define ALIGN8A_READ8(v0, src) \
+ src_tmp0 = src##_ptr[src##_shr]; \
+ src_tmp1 = src##_ptr[1]; \
+ v0 = _mm_shuffle2_pi8(src_tmp0, src_tmp1, src##_index);
+#define ALIGN8A_READ16(v0, src) \
+ src_tmp0 = src##_ptr[src##_shr]; \
+ src_tmp1 = src##_ptr[1]; \
+ src_tmp0 = _mm_shuffle2_pi8(src_tmp0, src_tmp1, src##_index); \
+ src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_ptr[2], src##_index); \
+ v0 = _mm_setr_epi64(src_tmp0, src_tmp1);
+#endif
+
+#endif // X264_E2K_E2KCOMMON_H
diff --git a/common/e2k/mc.c b/common/e2k/mc.c
new file mode 100644
index 0000000..430b705
--- /dev/null
+++ b/common/e2k/mc.c
@@ -0,0 +1,1401 @@
+/*****************************************************************************
+ * mc.c: e2k motion compensation
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ * Copyright (C) 2003-2017 x264 project
+ *
+ * Authors: Eric Petit <eric.petit@lapsus.org>
+ * Guillaume Poirier <gpoirier@mplayerhq.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "e2kcommon.h"
+#include "mc.h"
+
+#if !HIGH_BIT_DEPTH
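+/* pixel_avg2_*: rounding average of two sources, (a + b + 1) >> 1, used to
+ * average two half-pel planes for quarter-pel positions. */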
+static ALWAYS_INLINE void pixel_avg2_w4_e2k(uint8_t * restrict dst, intptr_t i_dst,
+ uint8_t * restrict src1, intptr_t i_src1,
+ uint8_t * restrict src2, intptr_t i_src2, int i_height) {
+ int y;
+ __m64 v0, v1;
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst) {
+ v0 = _mm_cvtsi32_si64(*(uint32_t*)src1); src1 += i_src1;
+ v1 = _mm_cvtsi32_si64(*(uint32_t*)src2); src2 += i_src2;
+ *(uint32_t*)dst = _mm_cvtsi64_si32(_mm_avg_pu8(v0, v1));
+ }
+}
+
+static ALWAYS_INLINE void pixel_avg2_w8_e2k(uint8_t * restrict dst, intptr_t i_dst,
+ uint8_t * restrict src1, intptr_t i_src1,
+ uint8_t * restrict src2, intptr_t i_src2, int i_height) {
+ int y;
+ __m64 v0, v1;
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst) {
+ v0 = *(__m64*)src1; src1 += i_src1;
+ v1 = *(__m64*)src2; src2 += i_src2;
+ *(__m64*)dst = _mm_avg_pu8(v0, v1);
+ }
+}
+
+static ALWAYS_INLINE void pixel_avg2_w16_e2k(uint8_t * restrict dst, intptr_t i_dst,
+ uint8_t * restrict src1, intptr_t i_src1,
+ uint8_t * restrict src2, intptr_t i_src2, int i_height) {
+ int y;
+ __m128i v0, v1;
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst) {
+ v0 = VEC_LD(src1); src1 += i_src1;
+ v1 = VEC_LD(src2); src2 += i_src2;
+ VEC_ST(dst, _mm_avg_epu8(v0, v1));
+ }
+}
+
+static ALWAYS_INLINE void pixel_avg2_w20_e2k(uint8_t *dst, intptr_t i_dst,
+ uint8_t *src1, intptr_t i_src1,
+ uint8_t *src2, intptr_t i_src2, int i_height) {
+ int y;
+ __m128i v0, v1; __m64 h0, h1;
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst) {
+ v0 = VEC_LD(src1); h0 = _mm_cvtsi32_si64(*(uint32_t*)(src1 + 16));
+ v1 = VEC_LD(src2); h1 = _mm_cvtsi32_si64(*(uint32_t*)(src2 + 16));
+ src1 += i_src1; src2 += i_src2;
+
+ VEC_ST(dst, _mm_avg_epu8(v0, v1));
+ *(uint32_t*)(dst + 16) = _mm_cvtsi64_si32(_mm_avg_pu8(h0, h1));
+ }
+}
+
+#define PIXEL_AVG_FN(wxh) \
+static void pixel_avg_##wxh##_e2k(uint8_t * restrict dst, intptr_t i_dst, \
+ uint8_t * restrict src1, intptr_t i_src1, \
+ uint8_t * restrict src2, intptr_t i_src2, int weight)
+
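+/* Weighted bi-prediction: dst = (src1*weight + src2*(64 - weight) + 32) >> 6,
+ * clipped to 8 bits and computed with maddubs on byte-interleaved pairs. */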
+#define INIT_AVG_4X2_E2K \
+ __m64 v0, t0, t1, t2, t3, c0, c1; \
+ c0 = _mm_set1_pi16((weight & 255) | (64 - weight) << 8); \
+ c1 = _mm_set1_pi16(32);
+
+#define PIXEL_AVG_2X2_E2K(i) \
+ t0 = _mm_cvtsi32_si64(*(uint16_t*)src1); \
+ t1 = _mm_cvtsi32_si64(*(uint16_t*)(src1 + i_src1)); \
+ t2 = _mm_cvtsi32_si64(*(uint16_t*)src2); \
+ t3 = _mm_cvtsi32_si64(*(uint16_t*)(src2 + i_src2)); \
+ src1 += i_src1 * 2; src2 += i_src2 * 2; \
+ t0 = _mm_unpacklo_pi32(_mm_unpacklo_pi8(t0, t2), _mm_unpacklo_pi8(t1, t3)); \
+ v##i = _mm_srai_pi16(_mm_add_pi16(_mm_maddubs_pi16(t0, c0), c1), 6); \
+
+#define STORE_2X4_E2K(v0, v1) \
+ v0 = _mm_packs_pu16(v0, v1); \
+ *(uint16_t*)dst = _mm_cvtsi64_si32(v0); \
+ *(uint16_t*)(dst + i_dst) = _mm_extract_pi16(v0, 1); dst += i_dst * 2; \
+ *(uint16_t*)dst = _mm_extract_pi16(v0, 2); \
+ *(uint16_t*)(dst + i_dst) = _mm_extract_pi16(v0, 3); dst += i_dst * 2;
+
+PIXEL_AVG_FN(2x2) {
+ INIT_AVG_4X2_E2K
+ PIXEL_AVG_2X2_E2K(0)
+ v0 = _mm_packs_pu16(v0, v0);
+ *(uint16_t*)dst = _mm_cvtsi64_si32(v0);
+ *(uint16_t*)(dst + i_dst) = _mm_extract_pi16(v0, 1);
+}
+
+PIXEL_AVG_FN(2x4) {
+ __m64 v1;
+ INIT_AVG_4X2_E2K
+ PIXEL_AVG_2X2_E2K(0)
+ PIXEL_AVG_2X2_E2K(1)
+ STORE_2X4_E2K(v0, v1)
+}
+
+PIXEL_AVG_FN(2x8) {
+ __m64 v1, v2, v3;
+ INIT_AVG_4X2_E2K
+ PIXEL_AVG_2X2_E2K(0)
+ PIXEL_AVG_2X2_E2K(1)
+ PIXEL_AVG_2X2_E2K(2)
+ PIXEL_AVG_2X2_E2K(3)
+ STORE_2X4_E2K(v0, v1)
+ STORE_2X4_E2K(v2, v3)
+}
+
+#define PIXEL_AVG_4X2_E2K \
+ t0 = _mm_cvtsi32_si64(*(uint32_t*)src1); \
+ t1 = _mm_cvtsi32_si64(*(uint32_t*)(src1 + i_src1)); \
+ t2 = _mm_cvtsi32_si64(*(uint32_t*)src2); \
+ t3 = _mm_cvtsi32_si64(*(uint32_t*)(src2 + i_src2)); \
+ src1 += i_src1 * 2; src2 += i_src2 * 2; \
+ t0 = _mm_maddubs_pi16(_mm_unpacklo_pi8(t0, t2), c0); \
+ t1 = _mm_maddubs_pi16(_mm_unpacklo_pi8(t1, t3), c0); \
+ t0 = _mm_srai_pi16(_mm_add_pi16(t0, c1), 6); \
+ t1 = _mm_srai_pi16(_mm_add_pi16(t1, c1), 6); \
+ v0 = _mm_packs_pu16(t0, t1); \
+ *(uint32_t*)dst = _mm_cvtsi64_si32(v0); \
+ *(uint32_t*)(dst + i_dst) = _mm_extract_pi32(v0, 1); \
+ dst += i_dst * 2;
+
+PIXEL_AVG_FN(4x2) {
+ INIT_AVG_4X2_E2K
+ PIXEL_AVG_4X2_E2K
+}
+
+PIXEL_AVG_FN(4x4) {
+ INIT_AVG_4X2_E2K
+ PIXEL_AVG_4X2_E2K
+ PIXEL_AVG_4X2_E2K
+}
+
+PIXEL_AVG_FN(4x8) {
+ INIT_AVG_4X2_E2K
+ PIXEL_AVG_4X2_E2K
+ PIXEL_AVG_4X2_E2K
+ PIXEL_AVG_4X2_E2K
+ PIXEL_AVG_4X2_E2K
+}
+
+PIXEL_AVG_FN(4x16) {
+ int y;
+ INIT_AVG_4X2_E2K
+ PRAGMA_E2K("unroll(8)")
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < 8; y++) {
+ PIXEL_AVG_4X2_E2K
+ }
+}
+
+#define PIXEL_AVG_8X2_E2K(i) \
+ t0 = VEC_LD8(src1); t1 = VEC_LD8(src1 + i_src1); src1 += i_src1 * 2; \
+ t2 = VEC_LD8(src2); t3 = VEC_LD8(src2 + i_src2); src2 += i_src2 * 2; \
+ t0 = _mm_maddubs_epi16(_mm_unpacklo_epi8(t0, t2), c0); \
+ t1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(t1, t3), c0); \
+ t0 = _mm_srai_epi16(_mm_add_epi16(t0, c1), 6); \
+ t1 = _mm_srai_epi16(_mm_add_epi16(t1, c1), 6); \
+ v##i = _mm_packus_epi16(t0, t1);
+
+static ALWAYS_INLINE void pixel_avg_w8_e2k(uint8_t * restrict dst, intptr_t i_dst,
+ uint8_t * restrict src1, intptr_t i_src1,
+ uint8_t * restrict src2, intptr_t i_src2, int i_height, int weight) {
+ int y;
+ __m128i v0, v1, t0, t1, t2, t3, c0, c1;
+ c0 = _mm_set1_epi16((weight & 255) | (64 - weight) << 8);
+ c1 = _mm_set1_epi16(32);
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y += 4) {
+ PIXEL_AVG_8X2_E2K(0)
+ PIXEL_AVG_8X2_E2K(1)
+ VEC_STL(dst, v0); VEC_STH(dst + i_dst, v0); dst += i_dst * 2;
+ VEC_STL(dst, v1); VEC_STH(dst + i_dst, v1); dst += i_dst * 2;
+ }
+}
+
+static ALWAYS_INLINE void pixel_avg_w16_e2k(uint8_t * restrict dst, intptr_t i_dst,
+ uint8_t * restrict src1, intptr_t i_src1,
+ uint8_t * restrict src2, intptr_t i_src2, int i_height, int weight) {
+ int y;
+ __m128i v0, v1, v4, v5, c0, c1;
+ c0 = _mm_set1_epi16((weight & 255) | (64 - weight) << 8);
+ c1 = _mm_set1_epi16(32);
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst) {
+ v0 = VEC_LD(src1); src1 += i_src1;
+ v1 = VEC_LD(src2); src2 += i_src2;
+ v4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(v0, v1), c0);
+ v5 = _mm_maddubs_epi16(_mm_unpackhi_epi8(v0, v1), c0);
+
+ v4 = _mm_srai_epi16(_mm_add_epi16(v4, c1), 6);
+ v5 = _mm_srai_epi16(_mm_add_epi16(v5, c1), 6);
+ VEC_ST(dst, _mm_packus_epi16(v4, v5));
+ }
+}
+
+#define PIXEL_AVG_E2K(w, h) \
+PIXEL_AVG_FN(w##x##h) { \
+ if (weight == 32) pixel_avg2_w##w##_e2k(dst, i_dst, src1, i_src1, src2, i_src2, h); \
+ else pixel_avg_w##w##_e2k(dst, i_dst, src1, i_src1, src2, i_src2, h, weight); \
+}
+
+PIXEL_AVG_E2K(16, 16)
+PIXEL_AVG_E2K(16, 8)
+PIXEL_AVG_E2K(8, 16)
+PIXEL_AVG_E2K(8, 8)
+PIXEL_AVG_E2K(8, 4)
+
+/* mc_copy: plain c */
+
+#define MC_COPY(name, a, T) \
+static void name(uint8_t *dst, intptr_t i_dst, \
+ uint8_t *src, intptr_t i_src, int i_height) { \
+ int y; \
+ PRAGMA_E2K("ivdep") \
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { \
+ *(T*)dst = *(T*)src; \
+ } \
+}
+MC_COPY(mc_copy_w4_e2k, 4, uint32_t)
+MC_COPY(mc_copy_w8_e2k, 8, uint64_t)
+
+static void mc_copy_w16_e2k(uint8_t *dst, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src, int i_height) {
+ int y;
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ __m128i v0 = VEC_LD(src);
+ VEC_ST(dst, v0);
+ }
+}
+
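+/* Same body as mc_copy_w16_e2k: VEC_LD/VEC_ST are unaligned loads/stores,
+ * so the aligned variant needs no special handling. */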
+static void mc_copy_w16_aligned_e2k(uint8_t *dst, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src, int i_height) {
+ int y;
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ __m128i v0 = VEC_LD(src);
+ VEC_ST(dst, v0);
+ }
+}
+
+static void x264_plane_copy_swap_core_e2k(uint8_t *dst, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src, int w, int h) {
+ int x, y;
+ __m128i mask = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6,
+ 9, 8, 11, 10, 13, 12, 15, 14);
+
+ for (y = 0; y < h; y++, dst += i_dst, src += i_src)
+ PRAGMA_E2K("ivdep")
+ for (x = 0; x < 2 * w; x += 16) {
+ __m128i v0 = VEC_LD(src + x);
+ v0 = _mm_shuffle_epi8(v0, mask);
+ VEC_ST(dst + x, v0);
+ }
+}
+
+static void x264_plane_copy_interleave_core_e2k(uint8_t *dst, intptr_t i_dst,
+ uint8_t *srcu, intptr_t i_srcu,
+ uint8_t *srcv, intptr_t i_srcv, int w, int h) {
+ int x, y;
+ __m128i v0, v1, v2, v3;
+
+ if (w <= 0) return;
+
+ for (y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv)
+ PRAGMA_E2K("ivdep")
+ for (x = 0; x < w; x += 16) {
+ v0 = VEC_LD(srcu + x);
+ v1 = VEC_LD(srcv + x);
+ v2 = _mm_unpacklo_epi8(v0, v1);
+ v3 = _mm_unpackhi_epi8(v0, v1);
+ VEC_ST(dst + 2 * x, v2);
+ VEC_ST(dst + 2 * x + 16, v3);
+ }
+}
+
+static void store_interleave_chroma_e2k(uint8_t *dst, intptr_t i_dst,
+ uint8_t *srcu, uint8_t *srcv, int height) {
+ int y;
+ __m128i v0, v1;
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < height; y++, dst += i_dst, srcu += FDEC_STRIDE, srcv += FDEC_STRIDE) {
+ v0 = VEC_LD8(srcu);
+ v1 = VEC_LD8(srcv);
+ v0 = _mm_unpacklo_epi8(v0, v1);
+ VEC_ST(dst, v0);
+ }
+}
+
+static void load_deinterleave_chroma_fenc_e2k(uint8_t *dst, uint8_t *src, intptr_t i_src, int height) {
+ __m128i index = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+ __m128i v0, v1;
+
+ int y;
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < height; y += 2, dst += 2 * FENC_STRIDE, src += i_src * 2) {
+ v0 = VEC_LD(src);
+ v1 = VEC_LD(src + i_src);
+ v0 = _mm_shuffle_epi8(v0, index);
+ v1 = _mm_shuffle_epi8(v1, index);
+#if FENC_STRIDE == 16
+ VEC_ST(dst + 0 * FENC_STRIDE, v0);
+ VEC_ST(dst + 1 * FENC_STRIDE, v1);
+#else
+ VEC_STL(dst + 0 * FENC_STRIDE, v0);
+ VEC_STH(dst + 0 * FENC_STRIDE + FENC_STRIDE / 2, v0);
+ VEC_STL(dst + 1 * FENC_STRIDE, v1);
+ VEC_STH(dst + 1 * FENC_STRIDE + FENC_STRIDE / 2, v1);
+#endif
+ }
+}
+
+static void load_deinterleave_chroma_fdec_e2k(uint8_t *dst, uint8_t *src, intptr_t i_src, int height) {
+ __m128i index = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+ __m128i v0, v1;
+
+ int y;
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < height; y += 2, dst += 2 * FDEC_STRIDE, src += i_src * 2) {
+ v0 = VEC_LD(src);
+ v1 = VEC_LD(src + i_src);
+ v0 = _mm_shuffle_epi8(v0, index);
+ v1 = _mm_shuffle_epi8(v1, index);
+ VEC_STL(dst + 0 * FDEC_STRIDE, v0);
+ VEC_STH(dst + 0 * FDEC_STRIDE + FDEC_STRIDE / 2, v0);
+ VEC_STL(dst + 1 * FDEC_STRIDE, v1);
+ VEC_STH(dst + 1 * FDEC_STRIDE + FDEC_STRIDE / 2, v1);
+ }
+}
+
+#define DEINTERLEAVE_INDEX(x) 0 + x, 2 + x, 4 + x, 6 + x, 8 + x, 10 + x, 12 + x, 14 + x
+
+#if defined(__iset__) && __iset__ >= 5
+
+static void plane_copy_deinterleave_e2k(uint8_t *dstu, intptr_t i_dstu,
+ uint8_t *dstv, intptr_t i_dstv,
+ uint8_t *src, intptr_t i_src, int w, int h) {
+ int x, y;
+ __m128i v0, v1, v2, v3;
+ __m128i index0 = _mm_setr_epi8(DEINTERLEAVE_INDEX(0), DEINTERLEAVE_INDEX(16));
+ __m128i index1 = _mm_setr_epi8(DEINTERLEAVE_INDEX(1), DEINTERLEAVE_INDEX(17));
+
+ if (w <= 0) return;
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src) {
+ for (x = 0; x < w; x += 16) {
+ v0 = VEC_LD(src + 2 * x);
+ v1 = VEC_LD(src + 2 * x + 16);
+ v2 = _mm_shuffle2_epi8(v0, v1, index0);
+ v3 = _mm_shuffle2_epi8(v0, v1, index1);
+ VEC_ST(dstu + x, v2);
+ VEC_ST(dstv + x, v3);
+ }
+ }
+}
+
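+/* Deinterleave packed RGB (pw == 3) or RGBX (pw == 4) into three planes:
+ * the shuffle indices gather every 3rd or 4th byte starting at offsets
+ * 0, 1 and 2. */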
+#define RGB_INDEX3(x) 0 + x, 3 + x, 6 + x, 9 + x, 12 + x, 15 + x, 18 + x, 21 + x
+#define RGB_INDEX4(x) 0 + x, 4 + x, 8 + x, 12 + x, 16 + x, 20 + x, 24 + x, 28 + x
+
+static void plane_copy_deinterleave_rgb_e2k(uint8_t *dsta, intptr_t i_dsta,
+ uint8_t *dstb, intptr_t i_dstb,
+ uint8_t *dstc, intptr_t i_dstc,
+ uint8_t *src, intptr_t i_src,
+ int pw, int w, int h) {
+ int x, y;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ if (h <= 0 || w <= 0) return;
+ if (pw == 3) {
+ __m128i index0 = _mm_setr_epi8(RGB_INDEX3(0), RGB_INDEX3(1));
+ __m128i index1 = _mm_setr_epi8(RGB_INDEX3(2), RGB_INDEX3(3));
+ __m128i index2 = _mm_setr_epi8(RGB_INDEX3(8), RGB_INDEX3(9));
+ __m128i index3 = _mm_setr_epi8(RGB_INDEX3(10), RGB_INDEX3(11));
+
+ for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) {
+ PRAGMA_E2K("ivdep")
+ for (x = 0; x < w; x += 16) {
+ v0 = VEC_LD(src + 3 * x);
+ v1 = VEC_LD(src + 3 * x + 16);
+ v2 = VEC_LD(src + 3 * x + 32);
+ v4 = _mm_shuffle2_epi8(v0, v1, index0); // rg0
+ v5 = _mm_shuffle2_epi8(v0, v1, index1); // bx0
+ v6 = _mm_shuffle2_epi8(v1, v2, index2); // rg1
+ v7 = _mm_shuffle2_epi8(v1, v2, index3); // bx1
+
+ VEC_ST(dsta + x, _mm_unpacklo_epi64(v4, v6));
+ VEC_ST(dstb + x, _mm_unpackhi_epi64(v4, v6));
+ VEC_ST(dstc + x, _mm_unpacklo_epi64(v5, v7));
+ }
+ }
+ } else {
+ __m128i index0 = _mm_setr_epi8(RGB_INDEX4(0), RGB_INDEX4(1));
+ __m128i index1 = _mm_setr_epi8(RGB_INDEX4(2), RGB_INDEX4(3));
+
+ for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) {
+ PRAGMA_E2K("ivdep")
+ for (x = 0; x < w; x += 16) {
+ v0 = VEC_LD(src + 4 * x);
+ v1 = VEC_LD(src + 4 * x + 16);
+ v2 = VEC_LD(src + 4 * x + 32);
+ v3 = VEC_LD(src + 4 * x + 48);
+ v4 = _mm_shuffle2_epi8(v0, v1, index0); // rg0
+ v5 = _mm_shuffle2_epi8(v0, v1, index1); // bx0
+ v6 = _mm_shuffle2_epi8(v2, v3, index0); // rg1
+ v7 = _mm_shuffle2_epi8(v2, v3, index1); // bx1
+
+ VEC_ST(dsta + x, _mm_unpacklo_epi64(v4, v6));
+ VEC_ST(dstb + x, _mm_unpackhi_epi64(v4, v6));
+ VEC_ST(dstc + x, _mm_unpacklo_epi64(v5, v7));
+ }
+ }
+ }
+}
+#else
+static void plane_copy_deinterleave_e2k(uint8_t *dstu, intptr_t i_dstu,
+ uint8_t *dstv, intptr_t i_dstv,
+ uint8_t *src, intptr_t i_src, int w, int h) {
+ int x, y;
+ __m64 v0, v1, v2, v3, v4, v5;
+ __m64 index0 = _mm_setr_pi8(DEINTERLEAVE_INDEX(0));
+ __m64 index1 = _mm_setr_pi8(DEINTERLEAVE_INDEX(1));
+
+ if (w <= 0) return;
+ for (y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src) {
+ PRAGMA_E2K("ivdep")
+ for (x = 0; x < w; x += 16) {
+ v0 = *(__m64*)(src + 2 * x);
+ v1 = *(__m64*)(src + 2 * x + 8);
+ v2 = _mm_shuffle2_pi8(v0, v1, index0);
+ v3 = _mm_shuffle2_pi8(v0, v1, index1);
+ v0 = *(__m64*)(src + 2 * x + 16);
+ v1 = *(__m64*)(src + 2 * x + 24);
+ v4 = _mm_shuffle2_pi8(v0, v1, index0);
+ v5 = _mm_shuffle2_pi8(v0, v1, index1);
+ *(__m64*)(dstu + x) = v2;
+ *(__m64*)(dstv + x) = v3;
+ *(__m64*)(dstu + x + 8) = v4;
+ *(__m64*)(dstv + x + 8) = v5;
+ }
+ }
+}
+
+#define RGB_INDEX3(x) 0 + x, 3 + x, 6 + x, 9 + x
+#define RGB_INDEX4(x) 0 + x, 4 + x, 8 + x, 12 + x
+
+static void plane_copy_deinterleave_rgb_e2k(uint8_t *dsta, intptr_t i_dsta,
+ uint8_t *dstb, intptr_t i_dstb,
+ uint8_t *dstc, intptr_t i_dstc,
+ uint8_t *src, intptr_t i_src,
+ int pw, int w, int h) {
+ int x, y;
+ __m64 v0, v1, v2, v3, v4, v5, v6, v7;
+ ALIGN8_COMMON
+ ALIGN8_VARS(src)
+
+ if (h <= 0 || w <= 0) return;
+ if (pw == 3) {
+ __m64 index0 = _mm_setr_pi8(RGB_INDEX3(0), RGB_INDEX3(1));
+ __m64 index1 = _mm_setr_pi8(RGB_INDEX3(2), RGB_INDEX3(3));
+ __m64 index2 = _mm_setr_pi8(RGB_INDEX3(4), RGB_INDEX3(5));
+ __m64 index3 = _mm_setr_pi8(RGB_INDEX3(6), RGB_INDEX3(7));
+ for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) {
+ ALIGN8_START(src, src)
+ PRAGMA_E2K("ivdep")
+ PRAGMA_E2K("unroll(2)")
+ for (x = 0; x < w; x += 8) {
+ ALIGN8_READ8(v0, src, 0)
+ ALIGN8_READ8(v1, src, 1)
+ ALIGN8_READ8(v2, src, 2)
+ src_ptr += 3;
+ v4 = _mm_shuffle2_pi8(v0, v1, index0); // rg0
+ v5 = _mm_shuffle2_pi8(v0, v1, index1); // bx0
+ v6 = _mm_shuffle2_pi8(v1, v2, index2); // rg1
+ v7 = _mm_shuffle2_pi8(v1, v2, index3); // bx1
+ *(__m64*)(dsta + x) = _mm_unpacklo_pi32(v4, v6);
+ *(__m64*)(dstb + x) = _mm_unpackhi_pi32(v4, v6);
+ *(__m64*)(dstc + x) = _mm_unpacklo_pi32(v5, v7);
+ }
+ }
+ } else {
+ __m64 index0 = _mm_setr_pi8(RGB_INDEX4(0), RGB_INDEX4(1));
+ __m64 index1 = _mm_setr_pi8(RGB_INDEX4(2), RGB_INDEX4(3));
+
+ for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) {
+ ALIGN8_START(src, src)
+ PRAGMA_E2K("ivdep")
+ PRAGMA_E2K("unroll(2)")
+ for (x = 0; x < w; x += 8) {
+ ALIGN8_READ8(v0, src, 0)
+ ALIGN8_READ8(v1, src, 1)
+ ALIGN8_READ8(v2, src, 2)
+ ALIGN8_READ8(v3, src, 3)
+ src_ptr += 4;
+ v4 = _mm_shuffle2_pi8(v0, v1, index0); // rg0
+ v5 = _mm_shuffle2_pi8(v0, v1, index1); // bx0
+ v6 = _mm_shuffle2_pi8(v2, v3, index0); // rg1
+ v7 = _mm_shuffle2_pi8(v2, v3, index1); // bx1
+ *(__m64*)(dsta + x) = _mm_unpacklo_pi32(v4, v6);
+ *(__m64*)(dstb + x) = _mm_unpackhi_pi32(v4, v6);
+ *(__m64*)(dstc + x) = _mm_unpacklo_pi32(v5, v7);
+ }
+ }
+ }
+}
+#endif
+
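+/* Luma motion compensation: quarter-pel positions are the average of two
+ * half-pel planes selected via x264_hpel_ref0/ref1; full- and half-pel
+ * positions copy the selected plane directly. Weighted prediction is applied
+ * afterwards if requested. */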
+static void mc_luma_e2k(uint8_t *dst, intptr_t i_dst,
+ uint8_t *src[4], intptr_t i_src,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight) {
+ int qpel_idx = ((mvy & 3) << 2) + (mvx & 3);
+ intptr_t offset = (mvy >> 2) * i_src + (mvx >> 2);
+ uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy & 3) == 3) * i_src;
+ if (qpel_idx & 5) { /* qpel interpolation needed */
+ uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx & 3) == 3);
+
+ switch (i_width) {
+ case 4:
+ pixel_avg2_w4_e2k(dst, i_dst, src1, i_src, src2, i_src, i_height);
+ break;
+ case 8:
+ pixel_avg2_w8_e2k(dst, i_dst, src1, i_src, src2, i_src, i_height);
+ break;
+ case 16:
+ default:
+ pixel_avg2_w16_e2k(dst, i_dst, src1, i_src, src2, i_src, i_height);
+ }
+ if (weight->weightfn)
+ weight->weightfn[i_width>>2](dst, i_dst, dst, i_dst, weight, i_height);
+ }
+ else if (weight->weightfn)
+ weight->weightfn[i_width>>2](dst, i_dst, src1, i_src, weight, i_height);
+ else {
+ switch (i_width) {
+ case 4:
+ mc_copy_w4_e2k(dst, i_dst, src1, i_src, i_height);
+ break;
+ case 8:
+ mc_copy_w8_e2k(dst, i_dst, src1, i_src, i_height);
+ break;
+ case 16:
+ mc_copy_w16_e2k(dst, i_dst, src1, i_src, i_height);
+ break;
+ }
+ }
+}
+
+static uint8_t *get_ref_e2k(uint8_t *dst, intptr_t *i_dst,
+ uint8_t *src[4], intptr_t i_src,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight) {
+ int qpel_idx = ((mvy & 3) << 2) + (mvx & 3);
+ intptr_t offset = (mvy >> 2) * i_src + (mvx >> 2);
+ uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy & 3) == 3) * i_src;
+ if (qpel_idx & 5) { /* qpel interpolation needed */
+ uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx & 3) == 3);
+ switch (i_width) {
+ case 4:
+ pixel_avg2_w4_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height);
+ break;
+ case 8:
+ pixel_avg2_w8_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height);
+ break;
+ case 12:
+ case 16:
+ default:
+ pixel_avg2_w16_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height);
+ break;
+ case 20:
+ pixel_avg2_w20_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height);
+ break;
+ }
+ if (weight->weightfn)
+ weight->weightfn[i_width >> 2](dst, *i_dst, dst, *i_dst, weight, i_height);
+ return dst;
+ }
+ else if (weight->weightfn) {
+ weight->weightfn[i_width >> 2](dst, *i_dst, src1, i_src, weight, i_height);
+ return dst;
+ } else {
+ *i_dst = i_src;
+ return src1;
+ }
+}
+
+#define VSLD(a, b, n) _mm_alignr_epi8(b, a, n)
+
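+/* Chroma motion compensation on interleaved U/V input: bilinear 1/8-pel
+ * interpolation with weights (8-dx)*(8-dy), dx*(8-dy), (8-dx)*dy and dx*dy,
+ * rounded as (sum + 32) >> 6. */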
+static void mc_chroma_2xh_e2k(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src,
+ int mvx, int mvy, int i_height) {
+ int y, d8x = mvx & 7, d8y = mvy & 7;
+ __m128i t0, t1, c01, c23, k32v = _mm_set1_epi16(32);
+ __m64 index = _mm_setr_pi8(0, 2, 2, 4, 1, 3, 3, 5);
+ __m64 h1, h2, h3;
+
+ c01 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * (8 - d8y));
+ c23 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * d8y);
+
+ src += (mvy >> 3) * i_src + (mvx >> 3) * 2;
+ h3 = *(__m64*)src; src += i_src;
+ h3 = _mm_shuffle_pi8(h3, index);
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y += 2) {
+ h1 = h3;
+ h2 = *(__m64*)src; src += i_src;
+ h3 = *(__m64*)src; src += i_src;
+ h2 = _mm_shuffle_pi8(h2, index);
+ h3 = _mm_shuffle_pi8(h3, index);
+
+ t0 = _mm_unpacklo_epi64(_mm_movpi64_epi64(h1), _mm_movpi64_epi64(h2));
+ t1 = _mm_unpacklo_epi64(_mm_movpi64_epi64(h2), _mm_movpi64_epi64(h3));
+ t0 = _mm_maddubs_epi16(t0, c01);
+ t1 = _mm_maddubs_epi16(t1, c23);
+ t0 = _mm_add_epi16(_mm_add_epi16(t0, t1), k32v);
+ t0 = _mm_srli_epi16(t0, 6);
+
+ t0 = _mm_packus_epi16(t0, t0);
+ *(uint16_t*)dstu = _mm_extract_epi16(t0, 0); dstu += i_dst;
+ *(uint16_t*)dstv = _mm_extract_epi16(t0, 1); dstv += i_dst;
+ *(uint16_t*)dstu = _mm_extract_epi16(t0, 2); dstu += i_dst;
+ *(uint16_t*)dstv = _mm_extract_epi16(t0, 3); dstv += i_dst;
+ }
+}
+
+static void mc_chroma_4xh_e2k(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src,
+ int mvx, int mvy, int i_height) {
+ int y, d8x = mvx & 7, d8y = mvy & 7;
+ __m128i c01, c23, v01, v23, v2, dstuv;
+ __m128i d0, d1, k32v = _mm_set1_epi16(32);
+ __m128i perm0v = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15,
+ 1, 5, 9, 13, 3, 7, 11, 15);
+
+ c01 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * (8 - d8y));
+ c23 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * d8y);
+
+ src += (mvy >> 3) * i_src + (mvx >> 3) * 2;
+ v2 = VEC_LD(src); src += i_src;
+ v23 = _mm_unpacklo_epi8(v2, _mm_bsrli_si128(v2, 2));
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y += 2) {
+ v01 = v23;
+ v2 = VEC_LD(src); src += i_src;
+ v23 = _mm_unpacklo_epi8(v2, _mm_bsrli_si128(v2, 2));
+
+ d0 = _mm_maddubs_epi16(v01, c01);
+ d1 = _mm_maddubs_epi16(v23, c23);
+ d0 = _mm_add_epi16(_mm_add_epi16(d0, d1), k32v);
+ d0 = _mm_slli_epi16(d0, 2);
+
+ dstuv = _mm_shuffle_epi8(d0, perm0v);
+ *(uint32_t*)dstu = _mm_extract_epi32(dstuv, 0); dstu += i_dst;
+ *(uint32_t*)dstv = _mm_extract_epi32(dstuv, 1); dstv += i_dst;
+
+ v01 = v23;
+ v2 = VEC_LD(src); src += i_src;
+ v23 = _mm_unpacklo_epi8(v2, _mm_bsrli_si128(v2, 2));
+
+ d0 = _mm_maddubs_epi16(v01, c01);
+ d1 = _mm_maddubs_epi16(v23, c23);
+ d0 = _mm_add_epi16(_mm_add_epi16(d0, d1), k32v);
+ d0 = _mm_slli_epi16(d0, 2);
+
+ dstuv = _mm_shuffle_epi8(d0, perm0v);
+ *(uint32_t*)dstu = _mm_extract_epi32(dstuv, 0); dstu += i_dst;
+ *(uint32_t*)dstv = _mm_extract_epi32(dstuv, 1); dstv += i_dst;
+ }
+}
+
+static void mc_chroma_8xh_e2k(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src,
+ int mvx, int mvy, int i_height) {
+ int y, d8x = mvx & 7, d8y = mvy & 7;
+ __m128i c01, c23, v01l, v23l, v01h, v23h, v2, v3;
+ __m128i d0, d1, d2, d3, k32v = _mm_set1_epi16(32);
+ __m128i index = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 3, 5, 7, 9, 11, 13, 15);
+
+ c01 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * (8 - d8y));
+ c23 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * d8y);
+
+ src += (mvy >> 3) * i_src + (mvx >> 3) * 2;
+ v2 = VEC_LD(src); v3 = VEC_LD8(src + 16); src += i_src;
+ v3 = _mm_alignr_epi8(v3, v2, 2);
+ v23l = _mm_unpacklo_epi8(v2, v3);
+ v23h = _mm_unpackhi_epi8(v2, v3);
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y += 2) {
+ v01l = v23l; v01h = v23h;
+ v2 = VEC_LD(src); v3 = VEC_LD8(src + 16); src += i_src;
+ v3 = _mm_alignr_epi8(v3, v2, 2);
+ v23l = _mm_unpacklo_epi8(v2, v3);
+ v23h = _mm_unpackhi_epi8(v2, v3);
+
+ d0 = _mm_maddubs_epi16(v01l, c01);
+ d1 = _mm_maddubs_epi16(v01h, c01);
+ d2 = _mm_maddubs_epi16(v23l, c23);
+ d3 = _mm_maddubs_epi16(v23h, c23);
+ d0 = _mm_add_epi16(_mm_add_epi16(d0, d2), k32v);
+ d1 = _mm_add_epi16(_mm_add_epi16(d1, d3), k32v);
+ d0 = _mm_srli_epi16(d0, 6);
+ d1 = _mm_srli_epi16(d1, 6);
+ d0 = _mm_packus_epi16(d0, d1);
+ d0 = _mm_shuffle_epi8(d0, index);
+
+ VEC_STL(dstu, d0); dstu += i_dst;
+ VEC_STH(dstv, d0); dstv += i_dst;
+
+ v01l = v23l; v01h = v23h;
+ v2 = VEC_LD(src); v3 = VEC_LD8(src + 16); src += i_src;
+ v3 = _mm_alignr_epi8(v3, v2, 2);
+ v23l = _mm_unpacklo_epi8(v2, v3);
+ v23h = _mm_unpackhi_epi8(v2, v3);
+
+ d0 = _mm_maddubs_epi16(v01l, c01);
+ d1 = _mm_maddubs_epi16(v01h, c01);
+ d2 = _mm_maddubs_epi16(v23l, c23);
+ d3 = _mm_maddubs_epi16(v23h, c23);
+ d0 = _mm_add_epi16(_mm_add_epi16(d0, d2), k32v);
+ d1 = _mm_add_epi16(_mm_add_epi16(d1, d3), k32v);
+ d0 = _mm_srli_epi16(d0, 6);
+ d1 = _mm_srli_epi16(d1, 6);
+ d0 = _mm_packus_epi16(d0, d1);
+ d0 = _mm_shuffle_epi8(d0, index);
+
+ VEC_STL(dstu, d0); dstu += i_dst;
+ VEC_STH(dstv, d0); dstv += i_dst;
+ }
+}
+
+static void mc_chroma_e2k(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src,
+ int mvx, int mvy, int i_width, int i_height) {
+
+ if (i_width == 8)
+ mc_chroma_8xh_e2k(dstu, dstv, i_dst, src, i_src, mvx, mvy, i_height);
+ else if (i_width == 4)
+ mc_chroma_4xh_e2k(dstu, dstv, i_dst, src, i_src, mvx, mvy, i_height);
+ else
+ mc_chroma_2xh_e2k(dstu, dstv, i_dst, src, i_src, mvx, mvy, i_height);
+}
+
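+/* Half-pel filter: the H.264 6-tap kernel (1,-5,20,20,-5,1). HPEL_FILTER_1
+ * produces the full 16-bit tap sum (rounded with (x+16)>>5 for the h and v
+ * planes); HPEL_FILTER_2 filters the 16-bit vertical intermediates again for
+ * the centre plane, scaling the sum by 1/16 with shifts before the final
+ * (x+32)>>6 rounding. */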
+#if 1 /* 3 unpack + 3 mul + 2 add/sub */
+#define HPEL_FILTER_1_INIT __m128i \
+ madd1v = _mm_set1_epi8(-1), madd2v = _mm_set1_epi16(20 - 5 * 256);
+
+#define HPEL_FILTER_1(lo, dest) { \
+ temp1v = _mm_unpack##lo##_epi8(src1v, src6v); \
+ temp2v = _mm_unpack##lo##_epi8(src3v, src2v); \
+ temp3v = _mm_unpack##lo##_epi8(src4v, src5v); \
+ /* a-5*b+20*c */ \
+ temp1v = _mm_maddubs_epi16(temp1v, madd1v); \
+ temp2v = _mm_maddubs_epi16(temp2v, madd2v); \
+ temp3v = _mm_maddubs_epi16(temp3v, madd2v); \
+ temp2v = _mm_sub_epi16(temp2v, temp1v); \
+ dest = _mm_add_epi16(temp2v, temp3v); \
+}
+#else /* 6 unpack + 7 add/sub + 2 shl */
+#define HPEL_FILTER_1_INIT __m128i zerov = _mm_setzero_si128();
+
+#define HPEL_FILTER_1(lo, dest) { \
+ temp1v = _mm_unpack##lo##_epi8(src1v, zerov); \
+ temp2v = _mm_unpack##lo##_epi8(src2v, zerov); \
+ temp3v = _mm_unpack##lo##_epi8(src3v, zerov); \
+ temp4v = _mm_unpack##lo##_epi8(src4v, zerov); \
+ temp5v = _mm_unpack##lo##_epi8(src5v, zerov); \
+ temp6v = _mm_unpack##lo##_epi8(src6v, zerov); \
+ \
+ temp1v = _mm_add_epi16(temp1v, temp6v); \
+ temp2v = _mm_add_epi16(temp2v, temp5v); \
+ temp3v = _mm_add_epi16(temp3v, temp4v); \
+ \
+ temp1v = _mm_sub_epi16(temp1v, temp2v); /* (a-b) */ \
+ temp2v = _mm_sub_epi16(temp2v, temp3v); /* (b-c) */ \
+ temp2v = _mm_slli_epi16(temp2v, 2); /* (b-c)*4 */ \
+ temp1v = _mm_sub_epi16(temp1v, temp2v); /* a-5*b+4*c */ \
+ temp3v = _mm_slli_epi16(temp3v, 4); /* 16*c */ \
+ dest = _mm_add_epi16(temp1v, temp3v); /* a-5*b+20*c */ \
+}
+#endif
+
+#define HPEL_FILTER_2(t1v, t2v, t3v, t4v, t5v, t6v) { \
+ t1v = _mm_add_epi16(t1v, t6v); \
+ t2v = _mm_add_epi16(t2v, t5v); \
+ t3v = _mm_add_epi16(t3v, t4v); \
+ \
+ t1v = _mm_sub_epi16(t1v, t2v); /* (a-b) */ \
+ t1v = _mm_srai_epi16(t1v, 2); /* (a-b)/4 */ \
+ t1v = _mm_sub_epi16(t1v, t2v); /* (a-b)/4-b */ \
+ t1v = _mm_add_epi16(t1v, t3v); /* (a-b)/4-b+c */ \
+ t1v = _mm_srai_epi16(t1v, 2); /* ((a-b)/4-b+c)/4 */ \
+ t1v = _mm_add_epi16(t1v, t3v); /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
+}
+
+#define HPEL_FILTER_HORIZONTAL() { \
+ src2v = VEC_LD8(src + x + i_stride * y - 8); \
+ src2v = _mm_unpacklo_epi64(src2v, src2v); \
+ src3v = VEC_LD(src + x + i_stride * y); \
+ src6v = VEC_LD8(src + x + i_stride * y + 16); \
+ \
+ src1v = VSLD(src2v, src3v, 14); \
+ src2v = VSLD(src2v, src3v, 15); \
+ src4v = VSLD(src3v, src6v, 1); \
+ src5v = VSLD(src3v, src6v, 2); \
+ src6v = VSLD(src3v, src6v, 3); \
+ \
+ HPEL_FILTER_1(lo, temp1v); \
+ \
+ dest1v = _mm_add_epi16(temp1v, sixteenv); \
+ dest1v = _mm_srai_epi16(dest1v, 5); \
+ \
+ HPEL_FILTER_1(hi, temp1v); \
+ \
+ dest2v = _mm_add_epi16(temp1v, sixteenv); \
+ dest2v = _mm_srai_epi16(dest2v, 5); \
+ \
+ destv = _mm_packus_epi16(dest1v, dest2v); \
+ VEC_ST(dsth + x + i_stride * y, destv); \
+}
+
+#define HPEL_FILTER_VERTICAL() { \
+ src1v = VEC_LD(src + x + i_stride * (y - 2)); \
+ src2v = VEC_LD(src + x + i_stride * (y - 1)); \
+ /* src3v = VEC_LD(src + x + i_stride * y); */ \
+ src4v = VEC_LD(src + x + i_stride * (y + 1)); \
+ src5v = VEC_LD(src + x + i_stride * (y + 2)); \
+ src6v = VEC_LD(src + x + i_stride * (y + 3)); \
+ \
+ HPEL_FILTER_1(lo, temp7v); \
+ \
+ dest1v = _mm_add_epi16(temp7v, sixteenv); \
+ dest1v = _mm_srai_epi16(dest1v, 5); \
+ \
+ HPEL_FILTER_1(hi, temp1v); \
+ \
+ dest2v = _mm_add_epi16(temp1v, sixteenv); \
+ dest2v = _mm_srai_epi16(dest2v, 5); \
+ \
+ destv = _mm_packus_epi16(dest1v, dest2v); \
+ VEC_ST(dstv + x + i_stride * y, destv); \
+}
+
+#define HPEL_FILTER_CENTRAL() { \
+ temp1v = VSLD(tempav, tempbv, 12); \
+ temp2v = VSLD(tempav, tempbv, 14); \
+ temp3v = tempbv; \
+ temp4v = VSLD(tempbv, tempcv, 2); \
+ temp5v = VSLD(tempbv, tempcv, 4); \
+ temp6v = VSLD(tempbv, tempcv, 6); \
+ \
+ HPEL_FILTER_2(temp1v, temp2v, temp3v, \
+ temp4v, temp5v, temp6v); \
+ \
+ dest1v = _mm_add_epi16(temp1v, thirtytwov); \
+ dest1v = _mm_srai_epi16(dest1v, 6); \
+ \
+ temp1v = VSLD(tempbv, tempcv, 12); \
+ temp2v = VSLD(tempbv, tempcv, 14); \
+ temp3v = tempcv; \
+ temp4v = VSLD(tempcv, tempdv, 2); \
+ temp5v = VSLD(tempcv, tempdv, 4); \
+ temp6v = VSLD(tempcv, tempdv, 6); \
+ \
+ HPEL_FILTER_2(temp1v, temp2v, temp3v, \
+ temp4v, temp5v, temp6v); \
+ \
+ dest2v = _mm_add_epi16(temp1v, thirtytwov); \
+ dest2v = _mm_srai_epi16(dest2v, 6); \
+ \
+ destv = _mm_packus_epi16(dest1v, dest2v); \
+ VEC_ST(dstc + x - 16 + i_stride * y, destv); \
+}
+
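+/* The centre plane is produced one 16-byte column behind the h and v planes:
+ * tempav..tempev carry vertical-filter intermediates from previous columns so
+ * that HPEL_FILTER_CENTRAL can take taps across the column boundary. */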
+static void hpel_filter_e2k(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ intptr_t i_stride, int i_width, int i_height, int16_t *buf) {
+ int y;
+ vec_u8_t destv;
+ vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
+ vec_s16_t dest1v, dest2v;
+ vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v;
+ vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
+ HPEL_FILTER_1_INIT;
+
+ vec_s16_t sixteenv, thirtytwov;
+ sixteenv = _mm_set1_epi16(16);
+ thirtytwov = _mm_set1_epi16(32);
+
+ for (y = 0; y < i_height; y++) {
+ int x = 0;
+
+ /* horizontal_filter */
+ HPEL_FILTER_HORIZONTAL();
+
+ /* vertical_filter */
+ HPEL_FILTER_VERTICAL();
+
+ /* central_filter */
+ tempav = tempcv;
+ tempbv = tempdv;
+ tempcv = VEC_SPLAT16(temp1v, 0); /* first only */
+ tempdv = temp7v;
+ tempev = temp1v;
+
+ PRAGMA_E2K("ivdep")
+ for (x = 16; x < i_width; x += 16) {
+ /* horizontal_filter */
+ HPEL_FILTER_HORIZONTAL();
+
+ /* vertical_filter */
+ HPEL_FILTER_VERTICAL();
+
+ /* central_filter */
+ tempav = tempcv;
+ tempbv = tempdv;
+ tempcv = tempev;
+ tempdv = temp7v;
+ tempev = temp1v;
+
+ HPEL_FILTER_CENTRAL();
+ }
+
+ /* Partial vertical filter */
+ src1v = VEC_LD(src + x + i_stride * (y - 2));
+ src2v = VEC_LD(src + x + i_stride * (y - 1));
+ src3v = VEC_LD(src + x + i_stride * y);
+ src4v = VEC_LD(src + x + i_stride * (y + 1));
+ src5v = VEC_LD(src + x + i_stride * (y + 2));
+ src6v = VEC_LD(src + x + i_stride * (y + 3));
+
+ HPEL_FILTER_1(lo, temp1v);
+
+ /* central_filter */
+ tempav = tempcv;
+ tempbv = tempdv;
+ tempcv = tempev;
+ tempdv = temp1v;
+ /* tempev is not used */
+
+ HPEL_FILTER_CENTRAL();
+ }
+}
+
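+/* Half-resolution planes for the lookahead: each output pixel averages a 2x2
+ * input neighbourhood; dsth, dstv and dstc are offset from dst0 by half an
+ * output pixel horizontally, vertically and diagonally. */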
+static void frame_init_lowres_core_e2k(uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+ intptr_t src_stride, intptr_t dst_stride, int width, int height) {
+ int x, y, w = width >> 4, end = (width & 15);
+ vec_u8_t src0v, src1v, src2v;
+ vec_u8_t lv, hv, src1p1v;
+ vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
+ __m128i mask_even = _mm_set1_epi16(0xFF);
+
+ for (y = 0; y < height; y++) {
+ uint8_t *src1 = src0 + src_stride;
+ uint8_t *src2 = src1 + src_stride;
+
+ src0v = VEC_LD(src0);
+ src1v = VEC_LD(src1);
+ src2v = VEC_LD(src2);
+
+ avg0v = _mm_avg_epu8(src0v, src1v);
+ avg1v = _mm_avg_epu8(src1v, src2v);
+
+ PRAGMA_E2K("ivdep")
+ for (x = 0; x < w; x++) {
+ lv = VEC_LD(src0 + 16 * (x * 2 + 1));
+ src1v = VEC_LD(src1 + 16 * (x * 2 + 1));
+ avghv = _mm_avg_epu8(lv, src1v);
+
+ lv = VEC_LD(src0 + 16 * (x * 2 + 2));
+ src1p1v = VEC_LD(src1 + 16 * (x * 2 + 2));
+ avghp1v = _mm_avg_epu8(lv, src1p1v);
+
+ avgleftv = _mm_avg_epu8(VSLD(avg0v, avghv, 1), avg0v);
+ avgrightv = _mm_avg_epu8(VSLD(avghv, avghp1v, 1), avghv);
+
+ lv = _mm_packus_epi16(_mm_and_si128(avgleftv, mask_even), _mm_and_si128(avgrightv, mask_even));
+ hv = _mm_packus_epi16(_mm_srli_epi16(avgleftv, 8), _mm_srli_epi16(avgrightv, 8));
+ VEC_ST(dst0 + 16 * x, lv);
+ VEC_ST(dsth + 16 * x, hv);
+
+ avg0v = avghp1v;
+
+ hv = VEC_LD(src2 + 16 * (x * 2 + 1));
+ avghv = _mm_avg_epu8(src1v, hv);
+
+ hv = VEC_LD(src2 + 16 * (x * 2 + 2));
+ avghp1v = _mm_avg_epu8(src1p1v, hv);
+
+ avgleftv = _mm_avg_epu8(VSLD(avg1v, avghv, 1), avg1v);
+ avgrightv = _mm_avg_epu8(VSLD(avghv, avghp1v, 1), avghv);
+
+ lv = _mm_packus_epi16(_mm_and_si128(avgleftv, mask_even), _mm_and_si128(avgrightv, mask_even));
+ hv = _mm_packus_epi16(_mm_srli_epi16(avgleftv, 8), _mm_srli_epi16(avgrightv, 8));
+ VEC_ST(dstv + 16 * x, lv);
+ VEC_ST(dstc + 16 * x, hv);
+
+ avg1v = avghp1v;
+
+ }
+ if (end) {
+ lv = VEC_LD(src0 + 16 * (x * 2 + 1));
+ src1v = VEC_LD(src1 + 16 * (x * 2 + 1));
+ avghv = _mm_avg_epu8(lv, src1v);
+
+ lv = VEC_LD(src2 + 16 * (x * 2 + 1));
+ avghp1v = _mm_avg_epu8(src1v, lv);
+
+ avgleftv = _mm_avg_epu8(VSLD(avg0v, avghv, 1), avg0v);
+ avgrightv = _mm_avg_epu8(VSLD(avg1v, avghp1v, 1), avg1v);
+
+ lv = _mm_packus_epi16(_mm_and_si128(avgleftv, mask_even), _mm_and_si128(avgrightv, mask_even));
+ hv = _mm_packus_epi16(_mm_srli_epi16(avgleftv, 8), _mm_srli_epi16(avgrightv, 8));
+ VEC_STL(dst0 + 16 * x, lv);
+ VEC_STL(dsth + 16 * x, hv);
+
+ VEC_STH(dstv + 16 * x, lv);
+ VEC_STH(dstc + 16 * x, hv);
+ }
+
+ src0 += src_stride*2;
+ dst0 += dst_stride;
+ dsth += dst_stride;
+ dstv += dst_stride;
+ dstc += dst_stride;
+ }
+}
+
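+/* Explicit weighted prediction:
+ * dst = clip(((src * i_scale + (1 << (i_denom - 1))) >> i_denom) + i_offset),
+ * with the rounding and shift skipped when i_denom == 0. */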
+static void mc_weight_w2_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
+ const x264_weight_t *weight, int i_height) {
+ LOAD_ZERO;
+ vec_u8_t srcv;
+ vec_s16_t weightv, scalev, offsetv, roundv;
+
+ int y, denom = weight->i_denom;
+ scalev = _mm_set1_epi16(weight->i_scale);
+ offsetv = _mm_set1_epi16(weight->i_offset);
+
+ if (denom >= 1) {
+ roundv = _mm_set1_epi16(1 << (denom - 1));
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD8(src);
+ weightv = vec_u8_to_s16(srcv);
+
+ weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), roundv);
+ weightv = _mm_srai_epi16(weightv, denom);
+ weightv = _mm_add_epi16(weightv, offsetv);
+
+ srcv = _mm_packus_epi16(weightv, weightv);
+ *(uint16_t*)dst = _mm_extract_epi16(srcv, 0);
+ }
+ } else {
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD8(src);
+ weightv = vec_u8_to_s16(srcv);
+
+ weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), offsetv);
+
+ srcv = _mm_packus_epi16(weightv, weightv);
+ *(uint16_t*)dst = _mm_extract_epi16(srcv, 0);
+ }
+ }
+}
+
+static void mc_weight_w4_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
+ const x264_weight_t *weight, int i_height) {
+ LOAD_ZERO;
+ vec_u8_t srcv;
+ vec_s16_t weightv, scalev, offsetv, roundv;
+
+ int y, denom = weight->i_denom;
+ scalev = _mm_set1_epi16(weight->i_scale);
+ offsetv = _mm_set1_epi16(weight->i_offset);
+
+ if (denom >= 1) {
+ roundv = _mm_set1_epi16(1 << (denom - 1));
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD8(src);
+ weightv = vec_u8_to_s16(srcv);
+
+ weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), roundv);
+ weightv = _mm_srai_epi16(weightv, denom);
+ weightv = _mm_add_epi16(weightv, offsetv);
+
+ srcv = _mm_packus_epi16(weightv, weightv);
+ *(uint32_t*)dst = _mm_extract_epi32(srcv, 0);
+ }
+ } else {
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD8(src);
+ weightv = vec_u8_to_s16(srcv);
+
+ weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), offsetv);
+
+ srcv = _mm_packus_epi16(weightv, weightv);
+ *(uint32_t*)dst = _mm_extract_epi32(srcv, 0);
+ }
+ }
+}
+
+static void mc_weight_w8_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
+ const x264_weight_t *weight, int i_height) {
+ LOAD_ZERO;
+ vec_u8_t srcv;
+ vec_s16_t weightv, scalev, offsetv, roundv;
+
+ int y, denom = weight->i_denom;
+ scalev = _mm_set1_epi16(weight->i_scale);
+ offsetv = _mm_set1_epi16(weight->i_offset);
+
+ if (denom >= 1) {
+ roundv = _mm_set1_epi16(1 << (denom - 1));
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD(src);
+ weightv = vec_u8_to_s16(srcv);
+
+ weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), roundv);
+ weightv = _mm_srai_epi16(weightv, denom);
+ weightv = _mm_add_epi16(weightv, offsetv);
+
+ srcv = _mm_packus_epi16(weightv, weightv);
+ VEC_STL(dst, srcv);
+ }
+ } else {
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD(src);
+ weightv = vec_u8_to_s16(srcv);
+
+ weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), offsetv);
+
+ srcv = _mm_packus_epi16(weightv, weightv);
+ VEC_STL(dst, srcv);
+ }
+ }
+}
+
+static void mc_weight_w16_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
+ const x264_weight_t *weight, int i_height) {
+ LOAD_ZERO;
+ vec_u8_t srcv;
+ vec_s16_t weight_lv, weight_hv, scalev, offsetv, roundv;
+
+ int y, denom = weight->i_denom;
+ scalev = _mm_set1_epi16(weight->i_scale);
+ offsetv = _mm_set1_epi16(weight->i_offset);
+
+ if (denom >= 1) {
+ roundv = _mm_set1_epi16(1 << (denom - 1));
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD(src);
+ weight_hv = vec_u8_to_s16_h(srcv);
+ weight_lv = vec_u8_to_s16_l(srcv);
+
+ weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), roundv);
+ weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), roundv);
+ weight_hv = _mm_srai_epi16(weight_hv, denom);
+ weight_lv = _mm_srai_epi16(weight_lv, denom);
+ weight_hv = _mm_add_epi16(weight_hv, offsetv);
+ weight_lv = _mm_add_epi16(weight_lv, offsetv);
+
+ srcv = _mm_packus_epi16(weight_hv, weight_lv);
+ VEC_ST(dst, srcv);
+ }
+ } else {
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD(src);
+ weight_hv = vec_u8_to_s16_h(srcv);
+ weight_lv = vec_u8_to_s16_l(srcv);
+
+ weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), offsetv);
+ weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), offsetv);
+
+ srcv = _mm_packus_epi16(weight_hv, weight_lv);
+ VEC_ST(dst, srcv);
+ }
+ }
+}
+
+static void mc_weight_w20_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
+ const x264_weight_t *weight, int i_height) {
+ LOAD_ZERO;
+ vec_u8_t srcv, srcv2;
+ vec_s16_t weight_lv, weight_hv, weight_3v;
+ vec_s16_t scalev, offsetv, roundv;
+
+ int y, denom = weight->i_denom;
+ scalev = _mm_set1_epi16(weight->i_scale);
+ offsetv = _mm_set1_epi16(weight->i_offset);
+
+ if (denom >= 1) {
+ roundv = _mm_set1_epi16(1 << (denom - 1));
+
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD(src);
+ srcv2 = VEC_LD8(src + 16);
+
+ weight_hv = vec_u8_to_s16_h(srcv);
+ weight_lv = vec_u8_to_s16_l(srcv);
+ weight_3v = vec_u8_to_s16_h(srcv2);
+
+ weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), roundv);
+ weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), roundv);
+ weight_3v = _mm_add_epi16(_mm_mullo_epi16(weight_3v, scalev), roundv);
+
+ weight_hv = _mm_srai_epi16(weight_hv, denom);
+ weight_lv = _mm_srai_epi16(weight_lv, denom);
+ weight_3v = _mm_srai_epi16(weight_3v, denom);
+
+ weight_hv = _mm_add_epi16(weight_hv, offsetv);
+ weight_lv = _mm_add_epi16(weight_lv, offsetv);
+ weight_3v = _mm_add_epi16(weight_3v, offsetv);
+
+ srcv = _mm_packus_epi16(weight_hv, weight_lv);
+ srcv2 = _mm_packus_epi16(weight_3v, weight_3v);
+ VEC_ST(dst, srcv);
+ *(uint32_t*)(dst + 16) = _mm_extract_epi32(srcv2, 0);
+ }
+ } else {
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) {
+ srcv = VEC_LD(src);
+ srcv2 = VEC_LD8(src + 16);
+
+ weight_hv = vec_u8_to_s16_h(srcv);
+ weight_lv = vec_u8_to_s16_l(srcv);
+ weight_3v = vec_u8_to_s16_h(srcv2);
+
+ weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), offsetv);
+ weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), offsetv);
+ weight_3v = _mm_add_epi16(_mm_mullo_epi16(weight_3v, scalev), offsetv);
+
+ srcv = _mm_packus_epi16(weight_hv, weight_lv);
+ srcv2 = _mm_packus_epi16(weight_3v, weight_3v);
+ VEC_ST(dst, srcv);
+ *(uint32_t*)(dst + 16) = _mm_extract_epi32(srcv2, 0);
+ }
+ }
+}
+
+static weight_fn_t mc_weight_wtab_e2k[6] = {
+ mc_weight_w2_e2k,
+ mc_weight_w4_e2k,
+ mc_weight_w8_e2k,
+ mc_weight_w16_e2k,
+ mc_weight_w16_e2k,
+ mc_weight_w20_e2k,
+};
+
+PLANE_COPY_SWAP(16, e2k)
+PLANE_INTERLEAVE(e2k)
+#endif // !HIGH_BIT_DEPTH
+
+#if HIGH_BIT_DEPTH
+
+#define STORE_8(a, b, shl0, shl1) { \
+ v3 = _mm_shuffle_epi8(v##a, idx##a); \
+ v5 = _mm_shuffle_epi8(v##b, idx##b); \
+ v4 = _mm_unpacklo_epi64(v3, v5); \
+ v5 = _mm_unpackhi_epi64(v3, v5); \
+ v4 = _mm_mullo_epi16(v4, shl0); \
+ v5 = _mm_mullo_epi16(v5, shl1); \
+ v4 = _mm_srli_epi16(v4, 6); \
+ v5 = _mm_srli_epi16(v5, 6); \
+ VEC_ST(dstc + offset, v4); \
+ VEC_ST(dsty + offset, v5); \
+ offset += 8; \
+}
+
+// v210 input is only compatible with a bit depth of 10 bits
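+// Each 32-bit word packs three 10-bit components at bits 0-9, 10-19 and 20-29.
+// STORE_8 gathers the two bytes covering each component, multiplies by 4, 16
+// or 64 to shift the field up, then shifts right by 6 to align it to bits 0-9.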
+static void plane_copy_deinterleave_v210_e2k(uint16_t *dsty, intptr_t i_dsty,
+ uint16_t *dstc, intptr_t i_dstc,
+ uint32_t *src, intptr_t i_src, int w, int h) {
+ __m128i idx0 = _mm_setr_epi8(
+ 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0A);
+ __m128i idx1 = _mm_setr_epi8(
+ 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D);
+ __m128i idx2 = _mm_setr_epi8(
+ 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0D, 0x0E, 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F);
+
+ __m128i shl0 = _mm_setr_epi16(64, 4, 16, 64, 4, 16, 64, 4),
+ shl1 = _mm_setr_epi16(16, 64, 4, 16, 64, 4, 16, 64),
+ shl2 = _mm_setr_epi16(4, 16, 64, 4, 16, 64, 4, 16);
+
+ __m128i v0, v1, v2, v3, v4, v5;
+ int i, j;
+
+ for (i = 0; i < h; i++, dsty += i_dsty, dstc += i_dstc, src += i_src) {
+ int offset = 0, s = 0;
+
+ PRAGMA_E2K("ivdep")
+ for (j = 0; j < w; j += 24, s += 16) {
+ v0 = VEC_LD(src + s);
+ v2 = VEC_LD(src + s + 4);
+ v1 = _mm_alignr_epi8(v2, v0, 8);
+ STORE_8(0, 1, shl0, shl1);
+
+ v0 = VEC_LD(src + s + 8);
+ STORE_8(2, 0, shl1, shl2);
+
+ v2 = VEC_LD(src + s + 12);
+ v1 = _mm_alignr_epi8(v2, v0, 8);
+ STORE_8(1, 2, shl2, shl0);
+ }
+ }
+}
+#endif // HIGH_BIT_DEPTH
+
+void x264_mc_init_e2k(x264_mc_functions_t *pf) {
+#if HIGH_BIT_DEPTH
+ pf->plane_copy_deinterleave_v210 = plane_copy_deinterleave_v210_e2k;
+#else // !HIGH_BIT_DEPTH
+ pf->mc_luma = mc_luma_e2k;
+ pf->get_ref = get_ref_e2k;
+ pf->mc_chroma = mc_chroma_e2k;
+
+ pf->avg[PIXEL_16x16] = pixel_avg_16x16_e2k;
+ pf->avg[PIXEL_16x8] = pixel_avg_16x8_e2k;
+ pf->avg[PIXEL_8x16] = pixel_avg_8x16_e2k;
+ pf->avg[PIXEL_8x8] = pixel_avg_8x8_e2k;
+ pf->avg[PIXEL_8x4] = pixel_avg_8x4_e2k;
+ pf->avg[PIXEL_4x16] = pixel_avg_4x16_e2k;
+ pf->avg[PIXEL_4x8] = pixel_avg_4x8_e2k;
+ pf->avg[PIXEL_4x4] = pixel_avg_4x4_e2k;
+ pf->avg[PIXEL_4x2] = pixel_avg_4x2_e2k;
+ pf->avg[PIXEL_2x8] = pixel_avg_2x8_e2k;
+ pf->avg[PIXEL_2x4] = pixel_avg_2x4_e2k;
+ pf->avg[PIXEL_2x2] = pixel_avg_2x2_e2k;
+
+ pf->copy_16x16_unaligned = mc_copy_w16_e2k;
+ pf->copy[PIXEL_16x16] = mc_copy_w16_aligned_e2k;
+
+ pf->hpel_filter = hpel_filter_e2k;
+ pf->frame_init_lowres_core = frame_init_lowres_core_e2k;
+
+ pf->weight = mc_weight_wtab_e2k;
+
+ pf->plane_copy_swap = plane_copy_swap_e2k;
+ pf->plane_copy_interleave = plane_copy_interleave_e2k;
+ pf->store_interleave_chroma = store_interleave_chroma_e2k;
+ pf->plane_copy_deinterleave = plane_copy_deinterleave_e2k;
+ pf->plane_copy_deinterleave_rgb = plane_copy_deinterleave_rgb_e2k;
+
+ pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc_e2k;
+ pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec_e2k;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/e2k/mc.h b/common/e2k/mc.h
new file mode 100644
index 0000000..3e0c8e1
--- /dev/null
+++ b/common/e2k/mc.h
@@ -0,0 +1,30 @@
+/*****************************************************************************
+ * mc.h: e2k motion compensation
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_E2K_MC_H
+#define X264_E2K_MC_H
+
+#define x264_mc_init_e2k x264_template(mc_init_e2k)
+void x264_mc_init_e2k( x264_mc_functions_t *pf );
+
+#endif
diff --git a/common/e2k/pixel.c b/common/e2k/pixel.c
new file mode 100644
index 0000000..e68fbda
--- /dev/null
+++ b/common/e2k/pixel.c
@@ -0,0 +1,1169 @@
+/*****************************************************************************
+ * pixel.c: e2k pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ * Copyright (C) 2003-2017 x264 project
+ *
+ * Authors: Eric Petit <eric.petit@lapsus.org>
+ * Guillaume Poirier <gpoirier@mplayerhq.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "e2kcommon.h"
+#include "pixel.h"
+
+#if !HIGH_BIT_DEPTH
+/***********************************************************************
+ * SAD routines
+ **********************************************************************/
+
+#define SAD_LD8(pix) *(__m64*)pix
+#define SAD_LD16(pix) VEC_LD(pix)
+#define SAD_NEXT8(sumv, fencv, pixv) sumv = _mm_add_pi32(sumv, _mm_sad_pu8(fencv, pixv))
+#define SAD_NEXT16(sumv, fencv, pixv) sumv = _mm_add_epi32(sumv, _mm_sad_epu8(fencv, pixv))
+#define SAD_SUM8(sumv) _mm_extract_pi32(sumv, 0)
+#define SAD_SUM16(sumv) (_mm_extract_epi32(sumv, 0) + _mm_extract_epi32(sumv, 2))
+
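+/* PIXEL_SAD_E2K instantiates pixel_sad, pixel_sad_x3 and pixel_sad_x4 for one
+ * block size; the NEED_ALIGN8 variant adds a fast path using the ALIGN8A
+ * helpers when all strides are multiples of 8. */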
+#ifdef NEED_ALIGN8
+#define PIXEL_SAD_E2K(lx, ly, vec, zerov) \
+static int pixel_sad_##lx##x##ly##_e2k(uint8_t *pix1, intptr_t i_pix1, \
+ uint8_t *pix2, intptr_t i_pix2) { \
+ vec v0, v1, sumv = zerov; \
+ int y; \
+ for (y = 0; y < ly; y++) { \
+ v0 = SAD_LD##lx(pix1); pix1 += i_pix1; \
+ v1 = SAD_LD##lx(pix2); pix2 += i_pix2; \
+ SAD_NEXT##lx(sumv, v0, v1); \
+ } \
+ return SAD_SUM##lx(sumv); \
+} \
+static void pixel_sad_x4_##lx##x##ly##_e2k(uint8_t *fenc, \
+ uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, \
+ intptr_t stride, int scores[4]) { \
+ vec fencv, pix0v, pix1v, pix2v, pix3v; \
+ vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v, sum3v = sum0v; \
+ int y; \
+ if (!(stride & 7) && !(FENC_STRIDE & 7)) { \
+ ALIGN8A_COMMON \
+ ALIGN8A_VARS(fenc) \
+ ALIGN8A_VARS(pix0) \
+ ALIGN8A_VARS(pix1) \
+ ALIGN8A_VARS(pix2) \
+ ALIGN8A_VARS(pix3) \
+ stride >>= 3; \
+ ALIGN8A_START(fenc, fenc) \
+ ALIGN8A_START(pix0, pix0) \
+ ALIGN8A_START(pix1, pix1) \
+ ALIGN8A_START(pix2, pix2) \
+ ALIGN8A_START(pix3, pix3) \
+ for (y = 0; y < ly; y++) { \
+ ALIGN8A_READ##lx(pix0v, pix0); pix0_ptr += stride; \
+ ALIGN8A_READ##lx(pix1v, pix1); pix1_ptr += stride; \
+ ALIGN8A_READ##lx(fencv, fenc); fenc_ptr += FENC_STRIDE / 8; \
+ ALIGN8A_READ##lx(pix2v, pix2); pix2_ptr += stride; \
+ ALIGN8A_READ##lx(pix3v, pix3); pix3_ptr += stride; \
+ SAD_NEXT##lx(sum0v, fencv, pix0v); \
+ SAD_NEXT##lx(sum1v, fencv, pix1v); \
+ SAD_NEXT##lx(sum2v, fencv, pix2v); \
+ SAD_NEXT##lx(sum3v, fencv, pix3v); \
+ } \
+ } else \
+ for (y = 0; y < ly; y++) { \
+ pix0v = SAD_LD##lx(pix0); pix0 += stride; \
+ pix1v = SAD_LD##lx(pix1); pix1 += stride; \
+ fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \
+ pix2v = SAD_LD##lx(pix2); pix2 += stride; \
+ pix3v = SAD_LD##lx(pix3); pix3 += stride; \
+ SAD_NEXT##lx(sum0v, fencv, pix0v); \
+ SAD_NEXT##lx(sum1v, fencv, pix1v); \
+ SAD_NEXT##lx(sum2v, fencv, pix2v); \
+ SAD_NEXT##lx(sum3v, fencv, pix3v); \
+ } \
+ scores[0] = SAD_SUM##lx(sum0v); \
+ scores[1] = SAD_SUM##lx(sum1v); \
+ scores[2] = SAD_SUM##lx(sum2v); \
+ scores[3] = SAD_SUM##lx(sum3v); \
+} \
+static void pixel_sad_x3_##lx##x##ly##_e2k(uint8_t *fenc, \
+ uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \
+ intptr_t stride, int scores[4]) { \
+ vec fencv, pix0v, pix1v, pix2v; \
+ vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v; \
+ int y; \
+ if (!(stride & 7) && !(FENC_STRIDE & 7)) { \
+ ALIGN8A_COMMON \
+ ALIGN8A_VARS(fenc) \
+ ALIGN8A_VARS(pix0) \
+ ALIGN8A_VARS(pix1) \
+ ALIGN8A_VARS(pix2) \
+ stride >>= 3; \
+ ALIGN8A_START(fenc, fenc) \
+ ALIGN8A_START(pix0, pix0) \
+ ALIGN8A_START(pix1, pix1) \
+ ALIGN8A_START(pix2, pix2) \
+ for (y = 0; y < ly; y++) { \
+ ALIGN8A_READ##lx(pix0v, pix0); pix0_ptr += stride; \
+ ALIGN8A_READ##lx(pix1v, pix1); pix1_ptr += stride; \
+ ALIGN8A_READ##lx(fencv, fenc); fenc_ptr += FENC_STRIDE / 8; \
+ ALIGN8A_READ##lx(pix2v, pix2); pix2_ptr += stride; \
+ SAD_NEXT##lx(sum0v, fencv, pix0v); \
+ SAD_NEXT##lx(sum1v, fencv, pix1v); \
+ SAD_NEXT##lx(sum2v, fencv, pix2v); \
+ } \
+ } else \
+ for (y = 0; y < ly; y++) { \
+ pix0v = SAD_LD##lx(pix0); pix0 += stride; \
+ pix1v = SAD_LD##lx(pix1); pix1 += stride; \
+ fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \
+ pix2v = SAD_LD##lx(pix2); pix2 += stride; \
+ SAD_NEXT##lx(sum0v, fencv, pix0v); \
+ SAD_NEXT##lx(sum1v, fencv, pix1v); \
+ SAD_NEXT##lx(sum2v, fencv, pix2v); \
+ } \
+ scores[0] = SAD_SUM##lx(sum0v); \
+ scores[1] = SAD_SUM##lx(sum1v); \
+ scores[2] = SAD_SUM##lx(sum2v); \
+}
+
+#else
+
+#define PIXEL_SAD_E2K(lx, ly, vec, zerov) \
+static int pixel_sad_##lx##x##ly##_e2k(uint8_t *pix1, intptr_t i_pix1, \
+ uint8_t *pix2, intptr_t i_pix2) { \
+ vec v0, v1, sumv = zerov; \
+ int y; \
+ for (y = 0; y < ly; y++) { \
+ v0 = SAD_LD##lx(pix1); pix1 += i_pix1; \
+ v1 = SAD_LD##lx(pix2); pix2 += i_pix2; \
+ SAD_NEXT##lx(sumv, v0, v1); \
+ } \
+ return SAD_SUM##lx(sumv); \
+} \
+static void pixel_sad_x4_##lx##x##ly##_e2k(uint8_t *fenc, \
+ uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, \
+ intptr_t stride, int scores[4]) { \
+ vec fencv, pix0v, pix1v, pix2v, pix3v; \
+ vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v, sum3v = sum0v; \
+ int y; \
+ for (y = 0; y < ly; y++) { \
+ pix0v = SAD_LD##lx(pix0); pix0 += stride; \
+ pix1v = SAD_LD##lx(pix1); pix1 += stride; \
+ fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \
+ pix2v = SAD_LD##lx(pix2); pix2 += stride; \
+ pix3v = SAD_LD##lx(pix3); pix3 += stride; \
+ SAD_NEXT##lx(sum0v, fencv, pix0v); \
+ SAD_NEXT##lx(sum1v, fencv, pix1v); \
+ SAD_NEXT##lx(sum2v, fencv, pix2v); \
+ SAD_NEXT##lx(sum3v, fencv, pix3v); \
+ } \
+ scores[0] = SAD_SUM##lx(sum0v); \
+ scores[1] = SAD_SUM##lx(sum1v); \
+ scores[2] = SAD_SUM##lx(sum2v); \
+ scores[3] = SAD_SUM##lx(sum3v); \
+} \
+static void pixel_sad_x3_##lx##x##ly##_e2k(uint8_t *fenc, \
+ uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \
+ intptr_t stride, int scores[4]) { \
+ vec fencv, pix0v, pix1v, pix2v; \
+ vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v; \
+ int y; \
+ for (y = 0; y < ly; y++) { \
+ pix0v = SAD_LD##lx(pix0); pix0 += stride; \
+ pix1v = SAD_LD##lx(pix1); pix1 += stride; \
+ fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \
+ pix2v = SAD_LD##lx(pix2); pix2 += stride; \
+ SAD_NEXT##lx(sum0v, fencv, pix0v); \
+ SAD_NEXT##lx(sum1v, fencv, pix1v); \
+ SAD_NEXT##lx(sum2v, fencv, pix2v); \
+ } \
+ scores[0] = SAD_SUM##lx(sum0v); \
+ scores[1] = SAD_SUM##lx(sum1v); \
+ scores[2] = SAD_SUM##lx(sum2v); \
+}
+
+#endif
+
+PIXEL_SAD_E2K(8, 8, __m64, _mm_setzero_si64())
+PIXEL_SAD_E2K(8, 16, __m64, _mm_setzero_si64())
+PIXEL_SAD_E2K(16, 8, __m128i, _mm_setzero_si128())
+PIXEL_SAD_E2K(16, 16, __m128i, _mm_setzero_si128())
+
+/***********************************************************************
+ * SATD routines
+ **********************************************************************/
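+/* Each routine below Hadamard-transforms the block of pixel differences and
+ * returns the sum of absolute transform coefficients, halved (>> 1) to match
+ * the scalar reference. */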
+
+/***********************************************************************
+ * VEC_HADAMAR
+ ***********************************************************************
+ * b[0] = a[0] + a[1] + a[2] + a[3]
+ * b[1] = a[0] + a[1] - a[2] - a[3]
+ * b[2] = a[0] - a[1] - a[2] + a[3]
+ * b[3] = a[0] - a[1] + a[2] - a[3]
+ **********************************************************************/
+#define VEC_HADAMAR(a0, a1, a2, a3, b0, b1, b2, b3) \
+ b2 = _mm_add_epi16(a0, a1); \
+ b3 = _mm_add_epi16(a2, a3); \
+ a0 = _mm_sub_epi16(a0, a1); \
+ a2 = _mm_sub_epi16(a2, a3); \
+ b0 = _mm_add_epi16(b2, b3); \
+ b1 = _mm_sub_epi16(b2, b3); \
+ b2 = _mm_sub_epi16(a0, a2); \
+ b3 = _mm_add_epi16(a0, a2)
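+
+/* Scalar equivalent per 16-bit lane (illustration only):
+ *   t0 = a0 + a1;  t1 = a2 + a3;
+ *   t2 = a0 - a1;  t3 = a2 - a3;
+ *   b0 = t0 + t1;  b1 = t0 - t1;
+ *   b2 = t2 - t3;  b3 = t2 + t3; */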
+
+/***********************************************************************
+ * VEC_ABSOLUTE
+ ***********************************************************************
+ * a: s16v
+ *
+ * a = abs(a)
+ *
+ * Maps directly to _mm_abs_epi16(); the AltiVec workaround of calling
+ * vec_sub()/vec_max() to reuse an existing zero vector is not needed here.
+ **********************************************************************/
+#define VEC_ABSOLUTE(a) _mm_abs_epi16(a)
+
+#define VEC_TRANSPOSE_8X4(a0, a1, a2, a3, b0, b1, b2, b3) \
+ b0 = _mm_unpacklo_epi16(a0, a2); \
+ b1 = _mm_unpackhi_epi16(a0, a2); \
+ b2 = _mm_unpacklo_epi16(a1, a3); \
+ b3 = _mm_unpackhi_epi16(a1, a3); \
+ a0 = _mm_unpacklo_epi16(b0, b2); \
+ a1 = _mm_unpackhi_epi16(b0, b2); \
+ a2 = _mm_unpacklo_epi16(b1, b3); \
+ a3 = _mm_unpackhi_epi16(b1, b3); \
+ b0 = _mm_unpacklo_epi64(a0, a2); \
+ b1 = _mm_unpackhi_epi64(a0, a2); \
+ b2 = _mm_unpacklo_epi64(a1, a3); \
+ b3 = _mm_unpackhi_epi64(a1, a3);
+
+#define PREP_DIFF \
+ LOAD_ZERO; \
+ vec_s16_t pix1v, pix2v;
+
+#define VEC_DIFF8X1_H(p1,i1,p2,i2,d) \
+ pix1v = VEC_LD8(p1); \
+ pix2v = VEC_LD8(p2); \
+ d = _mm_sub_epi16(vec_u8_to_s16(pix1v), vec_u8_to_s16(pix2v)); \
+ p1 += i1; p2 += i2
+
+#define VEC_DIFF4X1_H(p1,i1,p2,i2,d) \
+ pix1v = _mm_cvtsi32_si128(*(uint32_t*)p1); \
+ pix2v = _mm_cvtsi32_si128(*(uint32_t*)p2); \
+ d = _mm_sub_epi16(vec_u8_to_s16(pix1v), vec_u8_to_s16(pix2v)); \
+ p1 += i1; p2 += i2
+
+#define VEC_DIFF4X2_H(p1,i1,p2,i2,d) \
+ pix1v = _mm_setr_epi32(*(uint32_t*)p1, *(uint32_t*)(p1 + i1 * 4), 0, 0); \
+ pix2v = _mm_setr_epi32(*(uint32_t*)p2, *(uint32_t*)(p2 + i2 * 4), 0, 0); \
+ d = _mm_sub_epi16(vec_u8_to_s16(pix1v), vec_u8_to_s16(pix2v)); \
+ p1 += i1; p2 += i2
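+
+/* The VEC_DIFF* helpers above load a row of 8 pixels (or one/two rows of 4)
+ * from each source, widen them to signed 16-bit lanes, store the per-pixel
+ * difference in d and advance both pointers by their strides. */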
+
+/***********************************************************************
+ * SATD 4x4
+ **********************************************************************/
+static int pixel_satd_4x4_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ PREP_DIFF;
+ vec_s16_t diff0v, diff1v, diff2v, diff3v;
+ vec_s16_t temp0v, temp1v, temp2v, temp3v;
+ vec_u16_t satdv; __m64 halfv;
+
+ VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff0v);
+ VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff1v);
+ VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff2v);
+ VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff3v);
+
+    /* Hadamard H */
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+
+ VEC_TRANSPOSE_4(temp0v, temp1v, temp2v, temp3v,
+ diff0v, diff1v, diff2v, diff3v);
+    /* Hadamard V */
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ satdv = _mm_add_epi16(temp0v, temp2v);
+
+ halfv = _mm_movepi64_pi64(satdv);
+ halfv = _mm_hadd_pi16(halfv, halfv);
+ halfv = _mm_hadd_pi16(halfv, halfv);
+ return _mm_extract_pi16(halfv, 0) >> 1;
+}
+
+/***********************************************************************
+ * SATD 4x8
+ **********************************************************************/
+static int pixel_satd_4x8_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ PREP_DIFF;
+ vec_s16_t diff0v, diff1v, diff2v, diff3v;
+ vec_s16_t temp0v, temp1v, temp2v, temp3v;
+ vec_u16_t satdv; __m64 halfv;
+
+ VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff0v);
+ VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff1v);
+ VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff2v);
+ VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff3v);
+
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_TRANSPOSE_8X4(temp0v, temp1v, temp2v, temp3v,
+ diff0v, diff1v, diff2v, diff3v);
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ satdv = _mm_add_epi16(temp0v, temp2v);
+ satdv = _mm_hadd_epi16(satdv, satdv);
+
+ halfv = _mm_movepi64_pi64(satdv);
+ halfv = _mm_hadd_pi16(halfv, halfv);
+ return (_mm_extract_pi16(halfv, 0) + _mm_extract_pi16(halfv, 1)) >> 1;
+}
+
+/***********************************************************************
+ * SATD 8x4
+ **********************************************************************/
+static int pixel_satd_8x4_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ PREP_DIFF;
+ vec_s16_t diff0v, diff1v, diff2v, diff3v;
+ vec_s16_t temp0v, temp1v, temp2v, temp3v;
+ vec_u16_t satdv; __m64 halfv;
+
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v);
+
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_TRANSPOSE_8X4(temp0v, temp1v, temp2v, temp3v,
+ diff0v, diff1v, diff2v, diff3v);
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ satdv = _mm_add_epi16(temp0v, temp2v);
+ satdv = _mm_hadd_epi16(satdv, satdv);
+
+ halfv = _mm_movepi64_pi64(satdv);
+ halfv = _mm_hadd_pi16(halfv, halfv);
+ return (_mm_extract_pi16(halfv, 0) + _mm_extract_pi16(halfv, 1)) >> 1;
+}
+
+/***********************************************************************
+ * SATD 8x8
+ **********************************************************************/
+static int pixel_satd_8x8_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ PREP_DIFF;
+ vec_s16_t diff0v, diff1v, diff2v, diff3v,
+ diff4v, diff5v, diff6v, diff7v;
+ vec_s16_t temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v;
+ vec_u16_t satdv;
+
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff6v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v);
+
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diff0v, diff1v, diff2v, diff3v,
+ diff4v, diff5v, diff6v, diff7v);
+
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(temp0v, temp4v);
+ satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov));
+
+ satdv = _mm_hadd_epi32(satdv, satdv);
+ return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1;
+}
+
+/***********************************************************************
+ * SATD 8x16
+ **********************************************************************/
+static int pixel_satd_8x16_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ PREP_DIFF;
+ vec_s16_t diff0v, diff1v, diff2v, diff3v,
+ diff4v, diff5v, diff6v, diff7v;
+ vec_s16_t temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v;
+ vec_u16_t satdv;
+
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff6v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v);
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v,
+ temp4v, temp5v, temp6v, temp7v);
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diff0v, diff1v, diff2v, diff3v,
+ diff4v, diff5v, diff6v, diff7v);
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(temp0v, temp4v);
+
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff6v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v);
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v,
+ temp4v, temp5v, temp6v, temp7v);
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diff0v, diff1v, diff2v, diff3v,
+ diff4v, diff5v, diff6v, diff7v);
+ VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v));
+ satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov));
+
+ satdv = _mm_hadd_epi32(satdv, satdv);
+ return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1;
+}
+
+/***********************************************************************
+ * SATD 16x8
+ **********************************************************************/
+static int pixel_satd_16x8_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ LOAD_ZERO;
+ vec_u16_t satdv;
+ vec_s16_t pix1v, pix2v;
+ vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
+ diffh4v, diffh5v, diffh6v, diffh7v;
+ vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
+ diffl4v, diffl5v, diffl6v, diffl7v;
+ vec_s16_t temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v;
+
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v);
+
+ VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diffh0v, diffh1v, diffh2v, diffh3v,
+ diffh4v, diffh5v, diffh6v, diffh7v);
+
+ VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(temp0v, temp4v);
+
+ VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diffl0v, diffl1v, diffl2v, diffl3v,
+ diffl4v, diffl5v, diffl6v, diffl7v);
+
+ VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v));
+ satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov));
+
+ satdv = _mm_hadd_epi32(satdv, satdv);
+ return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1;
+}
+
+/***********************************************************************
+ * SATD 16x16
+ **********************************************************************/
+static int pixel_satd_16x16_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ LOAD_ZERO;
+ vec_u16_t satdv;
+ vec_s16_t pix1v, pix2v;
+ vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
+ diffh4v, diffh5v, diffh6v, diffh7v;
+ vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
+ diffl4v, diffl5v, diffl6v, diffl7v;
+ vec_s16_t temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v;
+
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v);
+ VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v,
+ temp4v, temp5v, temp6v, temp7v);
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diffh0v, diffh1v, diffh2v, diffh3v,
+ diffh4v, diffh5v, diffh6v, diffh7v);
+ VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(temp0v, temp4v);
+
+ VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v,
+ temp4v, temp5v, temp6v, temp7v);
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diffl0v, diffl1v, diffl2v, diffl3v,
+ diffl4v, diffl5v, diffl6v, diffl7v);
+ VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v));
+
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v);
+ VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v);
+ VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v,
+ temp4v, temp5v, temp6v, temp7v);
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diffh0v, diffh1v, diffh2v, diffh3v,
+ diffh4v, diffh5v, diffh6v, diffh7v);
+ VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v));
+
+ VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v,
+ temp4v, temp5v, temp6v, temp7v);
+ VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v,
+ temp4v, temp5v, temp6v, temp7v,
+ diffl0v, diffl1v, diffl2v, diffl3v,
+ diffl4v, diffl5v, diffl6v, diffl7v);
+ VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v,
+ temp0v, temp1v, temp2v, temp3v);
+ VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v,
+ temp4v, temp5v, temp6v, temp7v);
+
+ temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v));
+ temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v));
+ temp0v = _mm_add_epi16(temp0v, temp2v);
+ temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v));
+ temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v));
+ temp4v = _mm_add_epi16(temp4v, temp6v);
+ satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v));
+ satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov));
+
+ satdv = _mm_hadd_epi32(satdv, satdv);
+ return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1;
+}
+
+
+/***********************************************************************
+ * SSD routines
+ **********************************************************************/
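+/* Sum of squared differences: pixels are widened to 16-bit lanes and the
+ * squared differences are accumulated pairwise into 32-bit lanes with
+ * _mm_madd_epi16. */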
+
+static int pixel_ssd_16x16_e2k(uint8_t *pix1, intptr_t stride1,
+ uint8_t *pix2, intptr_t stride2) {
+ LOAD_ZERO;
+ __m128i v0, v1, v2, v3, sumv = zerov;
+ int y;
+
+ for (y = 0; y < 16; y++) {
+ v0 = VEC_LD(pix1); pix1 += stride1;
+ v1 = VEC_LD(pix2); pix2 += stride2;
+ v2 = _mm_sub_epi16(_mm_unpacklo_epi8(v0, zerov), _mm_unpacklo_epi8(v1, zerov));
+ v3 = _mm_sub_epi16(_mm_unpackhi_epi8(v0, zerov), _mm_unpackhi_epi8(v1, zerov));
+ sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v2, v2));
+ sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v3, v3));
+ }
+
+ sumv = _mm_hadd_epi32(sumv, sumv);
+ return _mm_extract_epi32(sumv, 0) + _mm_extract_epi32(sumv, 1);
+}
+
+static int pixel_ssd_8x8_e2k(uint8_t *pix1, intptr_t stride1,
+ uint8_t *pix2, intptr_t stride2) {
+ LOAD_ZERO;
+ __m128i v0, v1, v2, v3, sumv = zerov;
+ int y;
+
+ for (y = 0; y < 4; y++) {
+ v0 = VEC_LD8(pix1); pix1 += stride1;
+ v2 = VEC_LD8(pix2); pix2 += stride2;
+ v1 = VEC_LD8(pix1); pix1 += stride1;
+ v3 = VEC_LD8(pix2); pix2 += stride2;
+ v2 = _mm_sub_epi16(_mm_unpacklo_epi8(v0, zerov), _mm_unpacklo_epi8(v2, zerov));
+ v3 = _mm_sub_epi16(_mm_unpacklo_epi8(v1, zerov), _mm_unpacklo_epi8(v3, zerov));
+ sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v2, v2));
+ sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v3, v3));
+ }
+
+ sumv = _mm_hadd_epi32(sumv, sumv);
+ return _mm_extract_epi32(sumv, 0) + _mm_extract_epi32(sumv, 1);
+}
+
+/****************************************************************************
+ * variance
+ ****************************************************************************/
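+/* Both routines return the block's pixel sum in the low 32 bits and the sum
+ * of squared pixels in the high 32 bits of the result. */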
+static uint64_t pixel_var_16x16_e2k(uint8_t *pix, intptr_t stride) {
+ LOAD_ZERO;
+ __m128i v0, v1, sqrv = zerov, sumv = zerov;
+ int y, sum, sqr;
+
+ for (y = 0; y < 16; y++, pix += stride) {
+ v1 = VEC_LD(pix);
+ sumv = _mm_add_epi16(sumv, _mm_sad_epu8(v1, zerov));
+ v0 = _mm_unpacklo_epi8(v1, zerov);
+ v1 = _mm_unpackhi_epi8(v1, zerov);
+ sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v0, v0));
+ sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v1, v1));
+ }
+ sqrv = _mm_hadd_epi32(sqrv, sqrv);
+ sum = _mm_extract_epi16(sumv, 0) + _mm_extract_epi16(sumv, 4);
+ sqr = _mm_extract_epi32(sqrv, 0) + _mm_extract_epi32(sqrv, 1);
+ return sum | (uint64_t)sqr << 32;
+}
+
+static uint64_t pixel_var_8x8_e2k(uint8_t *pix, intptr_t stride) {
+ LOAD_ZERO;
+ __m128i v0, v1, sqrv = zerov, sumv = zerov;
+ int y, sum, sqr;
+
+ for (y = 0; y < 4; y++, pix += stride * 2) {
+ v0 = VEC_LD8(pix);
+ v1 = VEC_LD8(pix + stride);
+ sumv = _mm_add_epi16(sumv, _mm_sad_epu8(_mm_unpacklo_epi64(v0, v1), zerov));
+ v0 = _mm_unpacklo_epi8(v0, zerov);
+ v1 = _mm_unpacklo_epi8(v1, zerov);
+ sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v0, v0));
+ sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v1, v1));
+ }
+ sqrv = _mm_hadd_epi32(sqrv, sqrv);
+ sum = _mm_extract_epi16(sumv, 0) + _mm_extract_epi16(sumv, 4);
+ sqr = _mm_extract_epi32(sqrv, 0) + _mm_extract_epi32(sqrv, 1);
+ return sum | (uint64_t)sqr << 32;
+}
+
+/**********************************************************************
+ * SA8D routines: sum of 8x8 Hadamard transformed differences
+ **********************************************************************/
+/* SA8D_1D unrolled by 8 */
+#define SA8D_1D_E2K(sa8d0v, sa8d1v, sa8d2v, sa8d3v, \
+ sa8d4v, sa8d5v, sa8d6v, sa8d7v) { \
+ /* int a0 = SRC(0) + SRC(4) */ \
+ /* int a4 = SRC(0) - SRC(4) */ \
+ /* int a1 = SRC(1) + SRC(5) */ \
+ /* int a5 = SRC(1) - SRC(5) */ \
+ /* int a2 = SRC(2) + SRC(6) */ \
+ /* int a6 = SRC(2) - SRC(6) */ \
+ /* int a3 = SRC(3) + SRC(7) */ \
+ /* int a7 = SRC(3) - SRC(7) */ \
+ vec_s16_t a0v = _mm_add_epi16(sa8d0v, sa8d4v); \
+ vec_s16_t a4v = _mm_sub_epi16(sa8d0v, sa8d4v); \
+ vec_s16_t a1v = _mm_add_epi16(sa8d1v, sa8d5v); \
+ vec_s16_t a5v = _mm_sub_epi16(sa8d1v, sa8d5v); \
+ vec_s16_t a2v = _mm_add_epi16(sa8d2v, sa8d6v); \
+ vec_s16_t a6v = _mm_sub_epi16(sa8d2v, sa8d6v); \
+ vec_s16_t a3v = _mm_add_epi16(sa8d3v, sa8d7v); \
+ vec_s16_t a7v = _mm_sub_epi16(sa8d3v, sa8d7v); \
+ \
+ /* int b0 = a0 + a2; */ \
+ /* int b2 = a0 - a2; */ \
+ /* int b1 = a1 + a3; */ \
+ /* int b3 = a1 - a3; */ \
+ /* int b4 = a4 + a6; */ \
+ /* int b6 = a4 - a6; */ \
+ /* int b5 = a5 + a7; */ \
+ /* int b7 = a5 - a7; */ \
+ vec_s16_t b0v = _mm_add_epi16(a0v, a2v); \
+ vec_s16_t b2v = _mm_sub_epi16(a0v, a2v); \
+ vec_s16_t b1v = _mm_add_epi16(a1v, a3v); \
+ vec_s16_t b3v = _mm_sub_epi16(a1v, a3v); \
+ vec_s16_t b4v = _mm_add_epi16(a4v, a6v); \
+ vec_s16_t b6v = _mm_sub_epi16(a4v, a6v); \
+ vec_s16_t b5v = _mm_add_epi16(a5v, a7v); \
+ vec_s16_t b7v = _mm_sub_epi16(a5v, a7v); \
+ \
+ /* DST(0, b0 + b1) */ \
+ /* DST(1, b0 - b1) */ \
+ /* DST(2, b2 + b3) */ \
+ /* DST(3, b2 - b3) */ \
+ /* DST(4, b4 + b5) */ \
+ /* DST(5, b4 - b5) */ \
+ /* DST(6, b6 + b7) */ \
+ /* DST(7, b6 - b7) */ \
+ sa8d0v = _mm_add_epi16(b0v, b1v); \
+ sa8d1v = _mm_sub_epi16(b0v, b1v); \
+ sa8d2v = _mm_add_epi16(b2v, b3v); \
+ sa8d3v = _mm_sub_epi16(b2v, b3v); \
+ sa8d4v = _mm_add_epi16(b4v, b5v); \
+ sa8d5v = _mm_sub_epi16(b4v, b5v); \
+ sa8d6v = _mm_add_epi16(b6v, b7v); \
+ sa8d7v = _mm_sub_epi16(b6v, b7v); \
+}
+
+static int pixel_sa8d_8x8_core_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ PREP_DIFF;
+ vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
+
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v);
+
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff6v);
+ VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v);
+
+ vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
+
+ SA8D_1D_E2K(diff0v, diff1v, diff2v, diff3v,
+ diff4v, diff5v, diff6v, diff7v);
+
+ VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
+ diff4v, diff5v, diff6v, diff7v,
+ sa8d0v, sa8d1v, sa8d2v, sa8d3v,
+ sa8d4v, sa8d5v, sa8d6v, sa8d7v);
+
+ SA8D_1D_E2K(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
+ sa8d4v, sa8d5v, sa8d6v, sa8d7v);
+
+    /* accumulate the absolute values of all elements of the resulting block */
+ vec_s16_t sum0v = _mm_add_epi16(_mm_abs_epi16(sa8d0v), _mm_abs_epi16(sa8d1v));
+ vec_s16_t sum1v = _mm_add_epi16(_mm_abs_epi16(sa8d2v), _mm_abs_epi16(sa8d3v));
+ vec_s16_t sum2v = _mm_add_epi16(_mm_abs_epi16(sa8d4v), _mm_abs_epi16(sa8d5v));
+ vec_s16_t sum3v = _mm_add_epi16(_mm_abs_epi16(sa8d6v), _mm_abs_epi16(sa8d7v));
+ sum0v = _mm_add_epi16(sum0v, sum1v);
+ sum2v = _mm_add_epi16(sum2v, sum3v);
+ sum0v = _mm_add_epi32(_mm_unpacklo_epi16(sum0v, zerov), _mm_unpackhi_epi16(sum0v, zerov));
+ sum1v = _mm_add_epi32(_mm_unpacklo_epi16(sum2v, zerov), _mm_unpackhi_epi16(sum2v, zerov));
+ sum0v = _mm_add_epi32(sum0v, sum1v);
+
+ sum0v = _mm_hadd_epi32(sum0v, sum0v);
+ return _mm_extract_epi32(sum0v, 0) + _mm_extract_epi32(sum0v, 1);
+}
+
+static int pixel_sa8d_8x8_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ return (pixel_sa8d_8x8_core_e2k(pix1, i_pix1, pix2, i_pix2) + 2) >> 2;
+}
+
+static int pixel_sa8d_16x16_e2k(uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2) {
+ return (pixel_sa8d_8x8_core_e2k(&pix1[0], i_pix1, &pix2[0], i_pix2)
+ + pixel_sa8d_8x8_core_e2k(&pix1[8], i_pix1, &pix2[8], i_pix2)
+ + pixel_sa8d_8x8_core_e2k(&pix1[8 * i_pix1], i_pix1, &pix2[8 * i_pix2], i_pix2)
+ + pixel_sa8d_8x8_core_e2k(&pix1[8 * i_pix1 + 8], i_pix1, &pix2[8 * i_pix2 + 8], i_pix2) + 2) >> 2;
+}
+
+#define HADAMARD4_E2K(d0,d1,d2,d3,s0,s1,s2,s3) { \
+ vec_s16_t t0 = _mm_add_epi16(s0, s1); \
+ vec_s16_t t1 = _mm_sub_epi16(s0, s1); \
+ vec_s16_t t2 = _mm_add_epi16(s2, s3); \
+ vec_s16_t t3 = _mm_sub_epi16(s2, s3); \
+ d0 = _mm_add_epi16(t0, t2); \
+ d2 = _mm_sub_epi16(t0, t2); \
+ d1 = _mm_add_epi16(t1, t3); \
+ d3 = _mm_sub_epi16(t1, t3); \
+}
+
+#define VEC_LOAD_HIGH(p, num) \
+ vec_u8_t pix8_##num = VEC_LD8(p + stride * num); \
+ vec_s16_t pix16_s##num = vec_u8_to_s16(pix8_##num); \
+ vec_s16_t pix16_d##num;
+
+static uint64_t pixel_hadamard_ac_e2k(uint8_t *pix, intptr_t stride) {
+ LOAD_ZERO;
+
+ VEC_LOAD_HIGH(pix, 0);
+ VEC_LOAD_HIGH(pix, 1);
+ VEC_LOAD_HIGH(pix, 2);
+ VEC_LOAD_HIGH(pix, 3);
+ HADAMARD4_E2K(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
+ pix16_s0, pix16_s1, pix16_s2, pix16_s3);
+
+ VEC_LOAD_HIGH(pix, 4);
+ VEC_LOAD_HIGH(pix, 5);
+ VEC_LOAD_HIGH(pix, 6);
+ VEC_LOAD_HIGH(pix, 7);
+ HADAMARD4_E2K(pix16_d4, pix16_d5, pix16_d6, pix16_d7,
+ pix16_s4, pix16_s5, pix16_s6, pix16_s7);
+
+ VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
+ pix16_d4, pix16_d5, pix16_d6, pix16_d7,
+ pix16_s0, pix16_s1, pix16_s2, pix16_s3,
+ pix16_s4, pix16_s5, pix16_s6, pix16_s7);
+
+ HADAMARD4_E2K(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
+ pix16_s0, pix16_s1, pix16_s2, pix16_s3);
+
+ HADAMARD4_E2K(pix16_d4, pix16_d5, pix16_d6, pix16_d7,
+ pix16_s4, pix16_s5, pix16_s6, pix16_s7);
+
+ vec_u16_t addabs01 = _mm_add_epi16(_mm_abs_epi16(pix16_d0), _mm_abs_epi16(pix16_d1));
+ vec_u16_t addabs23 = _mm_add_epi16(_mm_abs_epi16(pix16_d2), _mm_abs_epi16(pix16_d3));
+ vec_u16_t addabs45 = _mm_add_epi16(_mm_abs_epi16(pix16_d4), _mm_abs_epi16(pix16_d5));
+ vec_u16_t addabs67 = _mm_add_epi16(_mm_abs_epi16(pix16_d6), _mm_abs_epi16(pix16_d7));
+
+ vec_u16_t sum4_v = _mm_add_epi16(_mm_add_epi16(addabs01, addabs23), _mm_add_epi16(addabs45, addabs67));
+ sum4_v = _mm_add_epi32(_mm_unpacklo_epi16(sum4_v, zerov), _mm_unpackhi_epi16(sum4_v, zerov));
+ sum4_v = _mm_hadd_epi32(sum4_v, sum4_v);
+ int sum4 = _mm_extract_epi32(sum4_v, 0) + _mm_extract_epi32(sum4_v, 1);
+
+ vec_s16_t tmpi0 = _mm_add_epi16(pix16_d0, pix16_d4);
+ vec_s16_t tmpi4 = _mm_sub_epi16(pix16_d0, pix16_d4);
+ vec_s16_t tmpi1 = _mm_add_epi16(pix16_d1, pix16_d5);
+ vec_s16_t tmpi5 = _mm_sub_epi16(pix16_d1, pix16_d5);
+ vec_s16_t tmpi2 = _mm_add_epi16(pix16_d2, pix16_d6);
+ vec_s16_t tmpi6 = _mm_sub_epi16(pix16_d2, pix16_d6);
+ vec_s16_t tmpi3 = _mm_add_epi16(pix16_d3, pix16_d7);
+ vec_s16_t tmpi7 = _mm_sub_epi16(pix16_d3, pix16_d7);
+
+ VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3,
+ tmpi4, tmpi5, tmpi6, tmpi7,
+ pix16_d0, pix16_d1, pix16_d2, pix16_d3,
+ pix16_d4, pix16_d5, pix16_d6, pix16_d7);
+
+ vec_u16_t addsum04 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d0, pix16_d4)),
+ _mm_abs_epi16(_mm_sub_epi16(pix16_d0, pix16_d4)));
+ vec_u16_t addsum15 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d1, pix16_d5)),
+ _mm_abs_epi16(_mm_sub_epi16(pix16_d1, pix16_d5)));
+ vec_u16_t addsum26 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d2, pix16_d6)),
+ _mm_abs_epi16(_mm_sub_epi16(pix16_d2, pix16_d6)));
+ vec_u16_t addsum37 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d3, pix16_d7)),
+ _mm_abs_epi16(_mm_sub_epi16(pix16_d3, pix16_d7)));
+
+ vec_u16_t sum8_v = _mm_add_epi16(_mm_add_epi16(addsum04, addsum15), _mm_add_epi16(addsum26, addsum37));
+ sum8_v = _mm_add_epi32(_mm_unpacklo_epi16(sum8_v, zerov), _mm_unpackhi_epi16(sum8_v, zerov));
+ sum8_v = _mm_hadd_epi32(sum8_v, sum8_v);
+ int sum8 = _mm_extract_epi32(sum8_v, 0) + _mm_extract_epi32(sum8_v, 1);
+
+ int tmp0 = _mm_extract_epi16(_mm_add_epi16(pix16_d0, pix16_d4), 0);
+ return (uint64_t)(sum8 - tmp0) << 32 | (sum4 - tmp0);
+}
+
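+/* pixel_hadamard_ac_e2k() handles one 8x8 tile and returns the 8x8 Hadamard
+ * sum in the high 32 bits and the 4x4 Hadamard sum in the low 32 bits, both
+ * with the DC coefficient subtracted; the wrappers below accumulate the
+ * tiles and apply the final normalization (>> 2 and >> 1 respectively). */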
+static uint64_t pixel_hadamard_ac_16x16_e2k(uint8_t *pix, intptr_t stride) {
+ uint64_t sum = pixel_hadamard_ac_e2k(pix, stride);
+ sum += pixel_hadamard_ac_e2k(pix + 8, stride);
+ sum += pixel_hadamard_ac_e2k(pix + 8 * stride, stride);
+ sum += pixel_hadamard_ac_e2k(pix + 8 * stride + 8, stride);
+ return (sum >> 34) << 32 | (uint32_t)sum >> 1;
+}
+
+static uint64_t pixel_hadamard_ac_16x8_e2k(uint8_t *pix, intptr_t stride) {
+ uint64_t sum = pixel_hadamard_ac_e2k(pix, stride);
+ sum += pixel_hadamard_ac_e2k(pix + 8, stride);
+ return (sum >> 34) << 32 | (uint32_t)sum >> 1;
+}
+
+static uint64_t pixel_hadamard_ac_8x16_e2k(uint8_t *pix, intptr_t stride) {
+ uint64_t sum = pixel_hadamard_ac_e2k(pix, stride);
+ sum += pixel_hadamard_ac_e2k(pix + 8 * stride, stride);
+ return (sum >> 34) << 32 | (uint32_t)sum >> 1;
+}
+
+static uint64_t pixel_hadamard_ac_8x8_e2k(uint8_t *pix, intptr_t stride) {
+ uint64_t sum = pixel_hadamard_ac_e2k(pix, stride);
+ return (sum >> 34) << 32 | (uint32_t)sum >> 1;
+}
+
+
+/****************************************************************************
+ * structural similarity metric
+ ****************************************************************************/
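+/* For two horizontally adjacent 4x4 blocks, computes the sums of pix1, pix2,
+ * pix1^2 + pix2^2 and pix1*pix2 needed by the SSIM formula and stores them
+ * as sums[block][0..3]. */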
+static void ssim_4x4x2_core_e2k(const uint8_t *pix1, intptr_t stride1,
+ const uint8_t *pix2, intptr_t stride2,
+ int sums[2][4]) {
+ LOAD_ZERO;
+ __m128i v0, v1, s1v, s2v, ssv, s12v;
+ int y;
+ s1v = s2v = ssv = s12v = zerov;
+
+ for (y = 0; y < 4; y++) {
+ v0 = VEC_LD8(pix1); pix1 += stride1;
+ v1 = VEC_LD8(pix2); pix2 += stride2;
+ v0 = _mm_unpacklo_epi8(v0, zerov);
+ v1 = _mm_unpacklo_epi8(v1, zerov);
+
+ s1v = _mm_add_epi16(s1v, _mm_sad_epu8(v0, zerov));
+ s2v = _mm_add_epi16(s2v, _mm_sad_epu8(v1, zerov));
+ ssv = _mm_add_epi32(ssv, _mm_madd_epi16(v0, v0));
+ ssv = _mm_add_epi32(ssv, _mm_madd_epi16(v1, v1));
+ s12v = _mm_add_epi32(s12v, _mm_madd_epi16(v0, v1));
+ }
+ ssv = _mm_hadd_epi32(ssv, ssv);
+ s12v = _mm_hadd_epi32(s12v, s12v);
+
+ sums[0][0] = _mm_extract_epi16(s1v, 0);
+ sums[1][0] = _mm_extract_epi16(s1v, 4);
+ sums[0][1] = _mm_extract_epi16(s2v, 0);
+ sums[1][1] = _mm_extract_epi16(s2v, 4);
+ sums[0][2] = _mm_extract_epi32(ssv, 0);
+ sums[1][2] = _mm_extract_epi32(ssv, 1);
+ sums[0][3] = _mm_extract_epi32(s12v, 0);
+ sums[1][3] = _mm_extract_epi32(s12v, 1);
+}
+
+#define SATD_X(size) \
+static void pixel_satd_x3_##size##_e2k(uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \
+ intptr_t stride, int scores[3]) { \
+ scores[0] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix0, stride); \
+ scores[1] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix1, stride); \
+ scores[2] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix2, stride); \
+} \
+static void pixel_satd_x4_##size##_e2k(uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \
+ uint8_t *pix3, intptr_t stride, int scores[4]) { \
+ scores[0] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix0, stride); \
+ scores[1] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix1, stride); \
+ scores[2] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix2, stride); \
+ scores[3] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix3, stride); \
+}
+
+SATD_X(16x16)
+SATD_X(16x8)
+SATD_X(8x16)
+SATD_X(8x8)
+SATD_X(8x4)
+SATD_X(4x8)
+SATD_X(4x4)
+
+#define INTRA_MBCMP_8x8(mbcmp) \
+void intra_##mbcmp##_x3_8x8_e2k(uint8_t *fenc, uint8_t edge[36], int res[3]) { \
+ uint8_t __attribute__((aligned(8))) pix[8*FDEC_STRIDE]; \
+ x264_predict_8x8_v_c(pix, edge); \
+ res[0] = pixel_##mbcmp##_8x8_e2k(pix, FDEC_STRIDE, fenc, FENC_STRIDE); \
+ x264_predict_8x8_h_c(pix, edge); \
+ res[1] = pixel_##mbcmp##_8x8_e2k(pix, FDEC_STRIDE, fenc, FENC_STRIDE); \
+ x264_predict_8x8_dc_c(pix, edge); \
+ res[2] = pixel_##mbcmp##_8x8_e2k(pix, FDEC_STRIDE, fenc, FENC_STRIDE); \
+}
+
+INTRA_MBCMP_8x8(sad)
+INTRA_MBCMP_8x8(sa8d)
+
+#define INTRA_MBCMP(mbcmp, size, pred1, pred2, pred3, chroma) \
+void intra_##mbcmp##_x3_##size##x##size##chroma##_e2k(uint8_t *fenc, uint8_t *fdec, int res[3]) { \
+ x264_predict_##size##x##size##chroma##_##pred1##_c(fdec); \
+ res[0] = pixel_##mbcmp##_##size##x##size##_e2k(fdec, FDEC_STRIDE, fenc, FENC_STRIDE); \
+ x264_predict_##size##x##size##chroma##_##pred2##_c(fdec); \
+ res[1] = pixel_##mbcmp##_##size##x##size##_e2k(fdec, FDEC_STRIDE, fenc, FENC_STRIDE); \
+ x264_predict_##size##x##size##chroma##_##pred3##_c(fdec); \
+ res[2] = pixel_##mbcmp##_##size##x##size##_e2k(fdec, FDEC_STRIDE, fenc, FENC_STRIDE); \
+}
+
+INTRA_MBCMP(satd, 4, v, h, dc, )
+INTRA_MBCMP(sad, 8, dc, h, v, c)
+INTRA_MBCMP(satd, 8, dc, h, v, c)
+INTRA_MBCMP(sad, 16, v, h, dc, )
+INTRA_MBCMP(satd, 16, v, h, dc, )
+#endif // !HIGH_BIT_DEPTH
+
+/****************************************************************************
+ * x264_pixel_init:
+ ****************************************************************************/
+void x264_pixel_init_e2k(x264_pixel_function_t *pixf) {
+#if !HIGH_BIT_DEPTH
+ pixf->sad[PIXEL_16x16] = pixel_sad_16x16_e2k;
+ pixf->sad[PIXEL_8x16] = pixel_sad_8x16_e2k;
+ pixf->sad[PIXEL_16x8] = pixel_sad_16x8_e2k;
+ pixf->sad[PIXEL_8x8] = pixel_sad_8x8_e2k;
+
+ pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_e2k;
+ pixf->sad_x3[PIXEL_8x16] = pixel_sad_x3_8x16_e2k;
+ pixf->sad_x3[PIXEL_16x8] = pixel_sad_x3_16x8_e2k;
+ pixf->sad_x3[PIXEL_8x8] = pixel_sad_x3_8x8_e2k;
+
+ pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_e2k;
+ pixf->sad_x4[PIXEL_8x16] = pixel_sad_x4_8x16_e2k;
+ pixf->sad_x4[PIXEL_16x8] = pixel_sad_x4_16x8_e2k;
+ pixf->sad_x4[PIXEL_8x8] = pixel_sad_x4_8x8_e2k;
+
+ pixf->satd[PIXEL_16x16] = pixel_satd_16x16_e2k;
+ pixf->satd[PIXEL_8x16] = pixel_satd_8x16_e2k;
+ pixf->satd[PIXEL_16x8] = pixel_satd_16x8_e2k;
+ pixf->satd[PIXEL_8x8] = pixel_satd_8x8_e2k;
+ pixf->satd[PIXEL_8x4] = pixel_satd_8x4_e2k;
+ pixf->satd[PIXEL_4x8] = pixel_satd_4x8_e2k;
+ pixf->satd[PIXEL_4x4] = pixel_satd_4x4_e2k;
+
+ pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_e2k;
+ pixf->satd_x3[PIXEL_8x16] = pixel_satd_x3_8x16_e2k;
+ pixf->satd_x3[PIXEL_16x8] = pixel_satd_x3_16x8_e2k;
+ pixf->satd_x3[PIXEL_8x8] = pixel_satd_x3_8x8_e2k;
+ pixf->satd_x3[PIXEL_8x4] = pixel_satd_x3_8x4_e2k;
+ pixf->satd_x3[PIXEL_4x8] = pixel_satd_x3_4x8_e2k;
+ pixf->satd_x3[PIXEL_4x4] = pixel_satd_x3_4x4_e2k;
+
+ pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_e2k;
+ pixf->satd_x4[PIXEL_8x16] = pixel_satd_x4_8x16_e2k;
+ pixf->satd_x4[PIXEL_16x8] = pixel_satd_x4_16x8_e2k;
+ pixf->satd_x4[PIXEL_8x8] = pixel_satd_x4_8x8_e2k;
+ pixf->satd_x4[PIXEL_8x4] = pixel_satd_x4_8x4_e2k;
+ pixf->satd_x4[PIXEL_4x8] = pixel_satd_x4_4x8_e2k;
+ pixf->satd_x4[PIXEL_4x4] = pixel_satd_x4_4x4_e2k;
+
+ pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_e2k;
+ pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_e2k;
+ pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_e2k;
+
+ pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_e2k;
+ pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_e2k;
+ pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_e2k;
+
+ pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_e2k;
+ pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8_e2k;
+
+ pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_e2k;
+ pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8_e2k;
+
+ pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_e2k;
+
+ pixf->var[PIXEL_16x16] = pixel_var_16x16_e2k;
+ pixf->var[PIXEL_8x8] = pixel_var_8x8_e2k;
+
+ pixf->hadamard_ac[PIXEL_16x16] = pixel_hadamard_ac_16x16_e2k;
+ pixf->hadamard_ac[PIXEL_16x8] = pixel_hadamard_ac_16x8_e2k;
+ pixf->hadamard_ac[PIXEL_8x16] = pixel_hadamard_ac_8x16_e2k;
+ pixf->hadamard_ac[PIXEL_8x8] = pixel_hadamard_ac_8x8_e2k;
+
+ pixf->ssim_4x4x2_core = ssim_4x4x2_core_e2k;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/e2k/pixel.h b/common/e2k/pixel.h
new file mode 100644
index 0000000..d7c8a97
--- /dev/null
+++ b/common/e2k/pixel.h
@@ -0,0 +1,30 @@
+/*****************************************************************************
+ * pixel.h: e2k pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_E2K_PIXEL_H
+#define X264_E2K_PIXEL_H
+
+#define x264_pixel_init_e2k x264_template(pixel_init_e2k)
+void x264_pixel_init_e2k(x264_pixel_function_t *pixf);
+
+#endif
diff --git a/common/e2k/predict.c b/common/e2k/predict.c
new file mode 100644
index 0000000..0381e26
--- /dev/null
+++ b/common/e2k/predict.c
@@ -0,0 +1,187 @@
+/*****************************************************************************
+ * predict.c: e2k intra prediction
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ * Copyright (C) 2007-2017 x264 project
+ *
+ * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "e2kcommon.h"
+#include "predict.h"
+#include "pixel.h"
+
+#if !HIGH_BIT_DEPTH
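+/* Plane prediction: the scalar H/V gradients are computed as in the C
+ * reference, then each row of predicted samples i00 + x*b + y*c is shifted,
+ * saturated to 8-bit pixels and stored 8 (or 16 for luma) at a time. */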
+static void predict_8x8c_p_e2k(uint8_t *src) {
+ int i, H = 0, V = 0;
+
+ for (i = 0; i < 4; i++) {
+ H += (i + 1) * (src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE]);
+ V += (i + 1) * (src[(i + 4) * FDEC_STRIDE - 1] - src[(2 - i) * FDEC_STRIDE - 1]);
+ }
+
+ int a = 16 * (src[7 * FDEC_STRIDE - 1] + src[7 - FDEC_STRIDE]);
+ int b = (17 * H + 16) >> 5;
+ int c = (17 * V + 16) >> 5;
+ int i00 = a - b * 3 - c * 3 + 16;
+
+ vec_s16_t i00_v, b_v, c_v;
+ i00_v = _mm_set1_epi16(i00);
+ b_v = _mm_set1_epi16(b);
+ c_v = _mm_set1_epi16(c);
+
+ vec_s16_t induc_v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ vec_s16_t add0 = _mm_add_epi16(_mm_mullo_epi16(induc_v, b_v), i00_v);
+
+ for (i = 0; i < 8; ++i) {
+ vec_s16_t shiftv = _mm_srai_epi16(add0, 5);
+ VEC_STL(src, _mm_packus_epi16(shiftv, shiftv));
+ src += FDEC_STRIDE;
+ add0 = _mm_adds_epi16(add0, c_v);
+ }
+}
+
+
+/****************************************************************************
+ * 16x16 prediction for intra luma block
+ ****************************************************************************/
+
+static void predict_16x16_p_e2k(uint8_t *src) {
+ int i, H = 0, V = 0;
+
+ for (i = 1; i <= 8; i++) {
+ H += i * (src[7 + i - FDEC_STRIDE] - src[7 - i - FDEC_STRIDE]);
+ V += i * (src[(7 + i) * FDEC_STRIDE - 1] - src[(7 - i) * FDEC_STRIDE - 1]);
+ }
+
+ int a = 16 * (src[15 * FDEC_STRIDE - 1] + src[15 - FDEC_STRIDE]);
+ int b = (5 * H + 32) >> 6;
+ int c = (5 * V + 32) >> 6;
+ int i00 = a - b * 7 - c * 7 + 16;
+
+ vec_s16_t i00_v, b_v, c_v;
+ i00_v = _mm_set1_epi16(i00);
+ b_v = _mm_set1_epi16(b);
+ c_v = _mm_set1_epi16(c);
+
+ vec_s16_t induc_v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ vec_s16_t b8_v = _mm_slli_epi16(b_v, 3);
+ vec_s16_t add0 = _mm_add_epi16(_mm_mullo_epi16(induc_v, b_v), i00_v);
+ vec_s16_t add1 = _mm_adds_epi16(b8_v, add0);
+
+ for (i = 0; i < 16; i++) {
+ vec_u8_t outv = _mm_packus_epi16(_mm_srai_epi16(add0, 5),
+ _mm_srai_epi16(add1, 5));
+ VEC_ST(src, outv);
+ src += FDEC_STRIDE;
+ add0 = _mm_adds_epi16(add0, c_v);
+ add1 = _mm_adds_epi16(add1, c_v);
+ }
+}
+
+#define PREDICT_16x16_DC_e2k(v) \
+for (i = 0; i < 16; i += 2) { \
+ VEC_ST(src, v); \
+ VEC_ST(src + FDEC_STRIDE, v); \
+ src += FDEC_STRIDE*2; \
+}
+
+static void predict_16x16_dc_e2k(uint8_t *src) {
+ int i; uint32_t dc = 0;
+
+ for (i = 0; i < 16; i++) {
+ dc += src[i * FDEC_STRIDE - 1];
+ dc += src[i - FDEC_STRIDE];
+ }
+ vec_u8_t bc_v = _mm_set1_epi8((dc + 16) >> 5);
+ PREDICT_16x16_DC_e2k(bc_v);
+}
+
+static void predict_16x16_dc_left_e2k(uint8_t *src) {
+ int i; uint32_t dc = 0;
+
+ for (i = 0; i < 16; i++)
+ dc += src[i * FDEC_STRIDE - 1];
+
+ vec_u8_t bc_v = _mm_set1_epi8((dc + 8) >> 4);
+ PREDICT_16x16_DC_e2k(bc_v);
+}
+
+static void predict_16x16_dc_top_e2k(uint8_t *src) {
+ int i; uint32_t dc = 0;
+
+ for (i = 0; i < 16; i++)
+ dc += src[i - FDEC_STRIDE];
+
+ vec_u8_t bc_v = _mm_set1_epi8((dc + 8) >> 4);
+ PREDICT_16x16_DC_e2k(bc_v);
+}
+
+static void predict_16x16_dc_128_e2k(uint8_t *src) {
+ int i;
+ vec_u8_t bc_v = _mm_set1_epi8(128);
+ PREDICT_16x16_DC_e2k(bc_v);
+}
+
+#define PREDICT_16_H_LD(i) \
+ vec_u8_t v##i = _mm_set1_epi8(src[FDEC_STRIDE * i - 1]);
+#define PREDICT_16_H_ST(i) VEC_ST(src + FDEC_STRIDE * i, v##i);
+
+#define PREDICT_REP16(M) M(0) M(1) M(2) M(3) M(4) M(5) M(6) M(7) \
+ M(8) M(9) M(10) M(11) M(12) M(13) M(14) M(15)
+
+static void predict_16x16_h_e2k(uint8_t *src) {
+ PREDICT_REP16(PREDICT_16_H_LD)
+ PREDICT_REP16(PREDICT_16_H_ST)
+}
+
+static void predict_16x16_v_e2k(uint8_t *src) {
+ int i;
+ vec_u32_t v = VEC_LD(src - FDEC_STRIDE);
+
+ for (i = 0; i < 16; i++) {
+ VEC_ST(src, v);
+ src += FDEC_STRIDE;
+ }
+}
+#endif // !HIGH_BIT_DEPTH
+
+
+/****************************************************************************
+ * Exported functions:
+ ****************************************************************************/
+void x264_predict_16x16_init_e2k(x264_predict_t pf[7]) {
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_16x16_V ] = predict_16x16_v_e2k;
+ pf[I_PRED_16x16_H ] = predict_16x16_h_e2k;
+ pf[I_PRED_16x16_DC] = predict_16x16_dc_e2k;
+ pf[I_PRED_16x16_P ] = predict_16x16_p_e2k;
+ pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_e2k;
+ pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_e2k;
+ pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_e2k;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8c_init_e2k(x264_predict_t pf[7]) {
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_CHROMA_P] = predict_8x8c_p_e2k;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/e2k/predict.h b/common/e2k/predict.h
new file mode 100644
index 0000000..7b7f627
--- /dev/null
+++ b/common/e2k/predict.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+ * predict.h: e2k intra prediction
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_E2K_PREDICT_H
+#define X264_E2K_PREDICT_H
+
+#define x264_predict_16x16_init_e2k x264_template(predict_16x16_init_e2k)
+void x264_predict_16x16_init_e2k(x264_predict_t pf[7]);
+#define x264_predict_8x8c_init_e2k x264_template(predict_8x8c_init_e2k)
+void x264_predict_8x8c_init_e2k(x264_predict_t pf[7]);
+
+#endif
diff --git a/common/e2k/quant.c b/common/e2k/quant.c
new file mode 100644
index 0000000..76a778d
--- /dev/null
+++ b/common/e2k/quant.c
@@ -0,0 +1,436 @@
+/*****************************************************************************
+ * quant.c: e2k quantization
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ * Copyright (C) 2007-2017 x264 project
+ *
+ * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "e2kcommon.h"
+#include "quant.h"
+
+#if !HIGH_BIT_DEPTH
+// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
+
+#define LOAD_MF_BIAS(i) \
+ mfvA = VEC_LD(mf + i); \
+ mfvB = VEC_LD(mf + i + 8); \
+ biasvA = VEC_LD(bias + i); \
+ biasvB = VEC_LD(bias + i + 8); \
+
+#define QUANT_16_U(dct, b0, b1, mf0, mf1) { \
+ temp1v = VEC_LD(dct); \
+ temp2v = VEC_LD(dct + 8); \
+ coefvA = _mm_abs_epi16(temp1v); \
+ coefvB = _mm_abs_epi16(temp2v); \
+ coefvA = _mm_add_epi16(coefvA, b0); \
+ coefvB = _mm_add_epi16(coefvB, b1); \
+ coefvA = _mm_mulhi_epu16(coefvA, mf0); \
+ coefvB = _mm_mulhi_epu16(coefvB, mf1); \
+ temp1v = _mm_sign_epi16(coefvA, temp1v); \
+ temp2v = _mm_sign_epi16(coefvB, temp2v); \
+ VEC_ST(dct, temp1v); \
+ VEC_ST(dct + 8, temp2v); \
+}
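+
+/* Quantization per coefficient: sign(dct) * ((abs(dct) + bias) * mf >> 16),
+ * implemented with an unsigned high multiply and _mm_sign_epi16 to restore
+ * the sign. */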
+
+int x264_quant_2x2_dc_e2k(int16_t dct[4], int mf, int bias) {
+ __m64 v0, v1, mfv = _mm_set1_pi16(mf), biasv = _mm_set1_pi16(bias);
+
+ v0 = *(__m64*)dct;
+ v1 = _mm_abs_pi16(v0);
+ v1 = _mm_add_pi16(v1, biasv);
+ v1 = _mm_mulhi_pu16(v1, mfv);
+ v1 = _mm_sign_pi16(v1, v0);
+ *(__m64*)dct = v1;
+ return _mm_cvtm64_si64(v1) != 0;
+}
+
+int x264_quant_4x4_e2k(int16_t dct[16], uint16_t mf[16], uint16_t bias[16]) {
+ vec_u16_t coefvA, mfvA, biasvA, coefvB, mfvB, biasvB;
+ vec_s16_t nz, temp1v, temp2v;
+
+ LOAD_MF_BIAS(0);
+ QUANT_16_U(dct, biasvA, biasvB, mfvA, mfvB);
+ nz = _mm_or_si128(temp1v, temp2v);
+ return _mm_cvtsi128_si64(_mm_packs_epi16(nz, nz)) != 0;
+}
+
+int x264_quant_4x4_dc_e2k(int16_t dct[16], int mf, int bias) {
+ vec_u16_t coefvA, coefvB;
+ vec_s16_t nz, temp1v, temp2v;
+ vec_u16_t mfv, biasv;
+
+ mfv = _mm_set1_epi16(mf);
+ biasv = _mm_set1_epi16(bias);
+
+ QUANT_16_U(dct, biasv, biasv, mfv, mfv);
+ nz = _mm_or_si128(temp1v, temp2v);
+ return _mm_cvtsi128_si64(_mm_packs_epi16(nz, nz)) != 0;
+}
+
+int x264_quant_4x4x4_e2k(int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16]) {
+ vec_u16_t coefvA, mfvA, biasvA, coefvB, mfvB, biasvB;
+ vec_s16_t nz0, nz1, nz2, nz3, temp1v, temp2v;
+
+ LOAD_MF_BIAS(0);
+ QUANT_16_U(dct[0], biasvA, biasvB, mfvA, mfvB);
+ nz0 = _mm_or_si128(temp1v, temp2v);
+ QUANT_16_U(dct[1], biasvA, biasvB, mfvA, mfvB);
+ nz1 = _mm_or_si128(temp1v, temp2v);
+ QUANT_16_U(dct[2], biasvA, biasvB, mfvA, mfvB);
+ nz2 = _mm_or_si128(temp1v, temp2v);
+ QUANT_16_U(dct[3], biasvA, biasvB, mfvA, mfvB);
+ nz3 = _mm_or_si128(temp1v, temp2v);
+
+ return !!_mm_cvtsi128_si64(_mm_packs_epi16(nz0, nz0)) * 1 |
+ !!_mm_cvtsi128_si64(_mm_packs_epi16(nz1, nz1)) * 2 |
+ !!_mm_cvtsi128_si64(_mm_packs_epi16(nz2, nz2)) * 4 |
+ !!_mm_cvtsi128_si64(_mm_packs_epi16(nz3, nz3)) * 8;
+}
+
+int x264_quant_8x8_e2k(int16_t dct[64], uint16_t mf[64], uint16_t bias[64]) {
+ LOAD_ZERO;
+ vec_u16_t coefvA, mfvA, biasvA, coefvB, mfvB, biasvB;
+ vec_s16_t nz = zerov, temp1v, temp2v;
+ int i;
+
+ PRAGMA_E2K("ivdep")
+ for (i = 0; i < 64; i += 16) {
+ LOAD_MF_BIAS(i);
+ QUANT_16_U(dct + i, biasvA, biasvB, mfvA, mfvB);
+ nz = _mm_or_si128(nz, _mm_or_si128(temp1v, temp2v));
+ }
+ return _mm_cvtsi128_si64(_mm_packs_epi16(nz, nz)) != 0;
+}
+
+#define DEQUANT_SHL() { \
+ dctv = VEC_LD(dct + y); \
+ mf1v = VEC_LD(dequant_mf[i_mf] + y); \
+ mf2v = VEC_LD(dequant_mf[i_mf] + y + 4); \
+ mfv = _mm_packs_epi32(mf1v, mf2v); \
+ dctv = _mm_mullo_epi16(dctv, mfv); \
+ dctv = _mm_slli_epi16(dctv, i_qbits); \
+ VEC_ST(dct + y, dctv); \
+}
+
+#define DEQUANT_SHR() { \
+ dctv = VEC_LD(dct + y); \
+ mf1v = VEC_LD(dequant_mf[i_mf] + y); \
+ mf2v = VEC_LD(dequant_mf[i_mf] + y + 4); \
+ mfv = _mm_packs_epi32(mf1v, mf2v); \
+ mult1v = _mm_mulhi_epi16(dctv, mfv); \
+ mult0v = _mm_mullo_epi16(dctv, mfv); \
+ temp1v = _mm_unpacklo_epi16(mult0v, mult1v); \
+ temp2v = _mm_unpackhi_epi16(mult0v, mult1v); \
+ temp1v = _mm_add_epi32(temp1v, fv); \
+ temp2v = _mm_add_epi32(temp2v, fv); \
+ temp1v = _mm_srai_epi32(temp1v, -i_qbits); \
+ temp2v = _mm_srai_epi32(temp2v, -i_qbits); \
+ dctv = _mm_packs_epi32(temp1v, temp2v); \
+ VEC_ST(dct + y, dctv); \
+}
+
+void x264_dequant_4x4_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp) {
+ int y, i_mf = i_qp % 6, i_qbits = i_qp / 6 - 4;
+ vec_s16_t dctv, mfv, mult0v, mult1v;
+ vec_s32_t mf1v, mf2v, temp1v, temp2v;
+
+ if (i_qbits >= 0) {
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < 16; y += 8) DEQUANT_SHL();
+ } else {
+ vec_s32_t fv = _mm_set1_epi32(1 << (-i_qbits-1));
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < 16; y += 8) DEQUANT_SHR();
+ }
+}
+
+void x264_dequant_4x4_dc_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp) {
+ int y, i_mf = i_qp % 6, i_qbits = i_qp / 6 - 6;
+ vec_s16_t dctv, mfv, mult0v, mult1v;
+ vec_s32_t temp1v, temp2v;
+
+ if (i_qbits >= 0) {
+ mfv = _mm_set1_epi16(dequant_mf[i_mf][0] << i_qbits);
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < 16; y += 8) {
+ dctv = VEC_LD(dct + y);
+ dctv = _mm_mullo_epi16(dctv, mfv);
+ VEC_ST(dct + y, dctv);
+ }
+ } else {
+ vec_s32_t fv = _mm_set1_epi32(1 << (-i_qbits-1));
+ mfv = _mm_set1_epi32(dequant_mf[i_mf][0]);
+ mfv = _mm_packs_epi32(mfv, mfv);
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < 16; y += 8) {
+ dctv = VEC_LD(dct + y);
+ mult1v = _mm_mulhi_epi16(dctv, mfv);
+ mult0v = _mm_mullo_epi16(dctv, mfv);
+ temp1v = _mm_unpacklo_epi16(mult0v, mult1v);
+ temp2v = _mm_unpackhi_epi16(mult0v, mult1v);
+ temp1v = _mm_add_epi32(temp1v, fv);
+ temp2v = _mm_add_epi32(temp2v, fv);
+ temp1v = _mm_srai_epi32(temp1v, -i_qbits);
+ temp2v = _mm_srai_epi32(temp2v, -i_qbits);
+ dctv = _mm_packs_epi32(temp1v, temp2v);
+ VEC_ST(dct + y, dctv);
+ }
+ }
+}
+
+void x264_dequant_8x8_e2k(int16_t dct[64], int dequant_mf[6][64], int i_qp) {
+ int y, i_mf = i_qp % 6, i_qbits = i_qp / 6 - 6;
+ vec_s16_t dctv, mfv, mult0v, mult1v;
+ vec_s32_t mf1v, mf2v, temp1v, temp2v;
+
+ if (i_qbits >= 0) {
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < 64; y += 8) DEQUANT_SHL();
+ } else {
+ vec_s32_t fv = _mm_set1_epi32(1 << (-i_qbits-1));
+ PRAGMA_E2K("ivdep")
+ for (y = 0; y < 64; y += 8) DEQUANT_SHR();
+ }
+}
+
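+/* Accumulate |coef| into sum[], subtract offset[] with unsigned saturation
+ * (clamping at zero) and restore the original sign. */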
+void x264_denoise_dct_e2k(int16_t *dct, uint32_t *sum, uint16_t *offset, int size) {
+ /* size = 16, 64 */
+ int i;
+ LOAD_ZERO;
+
+ PRAGMA_E2K("ivdep")
+ for (i = 0; i < size; i += 8) {
+ __m128i v0, v1, v2, v3, v4;
+ v0 = VEC_LD(dct + i);
+ v2 = VEC_LD(sum + i);
+ v3 = VEC_LD(sum + i + 4);
+ v4 = VEC_LD(offset + i);
+
+ v1 = _mm_abs_epi16(v0);
+ v2 = _mm_add_epi32(v2, _mm_unpacklo_epi16(v1, zerov));
+ v3 = _mm_add_epi32(v3, _mm_unpackhi_epi16(v1, zerov));
+ v1 = _mm_sign_epi16(_mm_subs_epu16(v1, v4), v0);
+
+ VEC_ST(sum + i, v2);
+ VEC_ST(sum + i + 4, v3);
+ VEC_ST(dct + i, v1);
+ }
+}
+
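+/* decimate_mask_table4[mask] is the decimate score of an 8-bit mask of
+ * nonzero coefficient positions: the sum of x264_decimate_table4[run]
+ * over the runs of zeros below each nonzero coefficient. */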
+static const uint8_t decimate_mask_table4[256] = {
+ 0, 3, 2, 6, 2, 5, 5, 9, 1, 5, 4, 8, 5, 8, 8, 12,
+ 1, 4, 4, 8, 4, 7, 7, 11, 4, 8, 7, 11, 8, 11, 11, 15,
+ 1, 4, 3, 7, 4, 7, 7, 11, 3, 7, 6, 10, 7, 10, 10, 14,
+ 4, 7, 7, 11, 7, 10, 10, 14, 7, 11, 10, 14, 11, 14, 14, 18,
+ 0, 4, 3, 7, 3, 6, 6, 10, 3, 7, 6, 10, 7, 10, 10, 14,
+ 3, 6, 6, 10, 6, 9, 9, 13, 6, 10, 9, 13, 10, 13, 13, 17,
+ 4, 7, 6, 10, 7, 10, 10, 14, 6, 10, 9, 13, 10, 13, 13, 17,
+ 7, 10, 10, 14, 10, 13, 13, 17, 10, 14, 13, 17, 14, 17, 17, 21,
+ 0, 3, 3, 7, 3, 6, 6, 10, 2, 6, 5, 9, 6, 9, 9, 13,
+ 3, 6, 6, 10, 6, 9, 9, 13, 6, 10, 9, 13, 10, 13, 13, 17,
+ 3, 6, 5, 9, 6, 9, 9, 13, 5, 9, 8, 12, 9, 12, 12, 16,
+ 6, 9, 9, 13, 9, 12, 12, 16, 9, 13, 12, 16, 13, 16, 16, 20,
+ 3, 7, 6, 10, 6, 9, 9, 13, 6, 10, 9, 13, 10, 13, 13, 17,
+ 6, 9, 9, 13, 9, 12, 12, 16, 9, 13, 12, 16, 13, 16, 16, 20,
+ 7, 10, 9, 13, 10, 13, 13, 17, 9, 13, 12, 16, 13, 16, 16, 20,
+ 10, 13, 13, 17, 13, 16, 16, 20, 13, 17, 16, 20, 17, 20, 20, 24
+};
+
+#define DECIMATE_MASK_E2K(i, nz, a) \
+ v0 = VEC_LD(dct + i); \
+ v1 = VEC_LD(dct + i + 8); \
+ v0 = _mm_abs_epi16(v0); \
+ v1 = _mm_abs_epi16(v1); \
+ v0 = _mm_packs_epi16(v0, v1); \
+ v2 = _mm_cmpeq_epi8(c0, v0); \
+ v0 = _mm_cmpgt_epi8(v0, c1); \
+ nz = _mm_movemask_epi8(v2); \
+ a = _mm_movemask_epi8(v0);
+
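+/* nz is a bitmask of nonzero coefficients and 'a' is nonzero if any
+ * |coef| > 1, which forces the maximum score of 9. The low byte is scored
+ * with the lookup table, the remaining bits with one clz/ctz step plus a
+ * second table lookup. */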
+static int ALWAYS_INLINE decimate_score_internal(int16_t *dct, int i_max) {
+ __m128i v0, v1, v2, c0 = _mm_setzero_si128(), c1 = _mm_set1_epi8(1);
+ int c, a, nz;
+
+ DECIMATE_MASK_E2K(0, nz, a)
+ nz ^= 0xffff;
+ if (!nz) return 0;
+ if (a) return 9;
+ if (i_max == 15) nz >>= 1;
+ c = nz & 255;
+ a = decimate_mask_table4[c];
+ if (nz < 256) return a;
+ nz >>= 32 - __builtin_clz(c | 1);
+ c = __builtin_ctz(nz);
+ nz >>= c + 1;
+ a += x264_decimate_table4[c];
+ a += decimate_mask_table4[nz];
+ return a;
+}
+
+int x264_decimate_score15_e2k(dctcoef *dct) {
+ return decimate_score_internal(dct, 15);
+}
+
+int x264_decimate_score16_e2k(dctcoef *dct) {
+ return decimate_score_internal(dct, 16);
+}
+
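+/* Build a 64-bit nonzero mask from four 16-coefficient groups, then walk
+ * it with ctz, adding x264_decimate_table8[run] per coefficient and
+ * returning 9 once the score exceeds 5. The shift is split in two to
+ * avoid an undefined 64-bit shift when the last coefficient is at
+ * index 63. */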
+int x264_decimate_score64_e2k(dctcoef *dct) {
+ __m128i v0, v1, v2, c0 = _mm_setzero_si128(), c1 = _mm_set1_epi8(1);
+ int c, a, a1;
+ uint64_t nz, nz1;
+
+ DECIMATE_MASK_E2K(0, nz, a)
+ if (a) return 9;
+ DECIMATE_MASK_E2K(16, nz1, a) nz |= nz1 << 16;
+ DECIMATE_MASK_E2K(32, nz1, a1) nz |= nz1 << 32; a |= a1;
+ DECIMATE_MASK_E2K(48, nz1, a1) nz |= nz1 << 48; a |= a1;
+ if (a) return 9;
+ nz = ~nz;
+ for (a = 0; nz; nz >>= c, nz >>= 1) {
+ c = __builtin_ctzll(nz);
+ a += x264_decimate_table8[c];
+ if (a > 5) return 9;
+ }
+ return a;
+}
+
+int x264_coeff_last4_e2k(dctcoef *dct) {
+ uint64_t a = *(uint64_t*)dct;
+ return a ? 3 - (__builtin_clzll(a) >> 4) : -1;
+}
+
+int x264_coeff_last8_e2k(dctcoef *dct) {
+ __m64 v0 = *(__m64*)dct, v1 = *(__m64*)(dct + 4);
+ uint64_t a = _mm_cvtm64_si64(_mm_packs_pi16(v0, v1));
+ return a ? 7 - (__builtin_clzll(a) >> 3) : -1;
+}
+
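+/* coeff_last: pack 16 coefficients to bytes, compare with zero and take
+ * the movemask; the highest set bit of the inverted mask is the index of
+ * the last nonzero coefficient. The 15-coefficient (AC) variants load from
+ * dct - 1 so the access covers the aligned 16-coefficient block that the
+ * caller's dct + 1 pointer lies inside. */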
+#define COEFF_LAST_E2K(i, nz) \
+ v0 = VEC_LD(dct + i); \
+ v1 = VEC_LD(dct + i + 8); \
+ v0 = _mm_packs_epi16(v0, v1); \
+ v0 = _mm_cmpeq_epi8(v0, c0); \
+ nz = _mm_movemask_epi8(v0);
+
+int x264_coeff_last15_e2k(dctcoef *dct) {
+ __m128i v0, v1, c0 = _mm_setzero_si128();
+ uint32_t nz;
+ COEFF_LAST_E2K(-1, nz)
+ nz ^= 0xffff;
+ return 30 - __builtin_clz(nz | 1);
+}
+
+int x264_coeff_last16_e2k(dctcoef *dct) {
+ __m128i v0, v1, c0 = _mm_setzero_si128();
+ uint32_t nz;
+ COEFF_LAST_E2K(0, nz)
+ nz ^= 0xffff;
+ return nz ? 31 - __builtin_clz(nz) : -1;
+}
+
+int x264_coeff_last64_e2k(dctcoef *dct) {
+ __m128i v0, v1, c0 = _mm_setzero_si128();
+ uint32_t nz;
+ COEFF_LAST_E2K(48, nz) nz ^= 0xffff;
+ if (nz) return 79 - __builtin_clz(nz);
+ COEFF_LAST_E2K(32, nz) nz ^= 0xffff;
+ if (nz) return 63 - __builtin_clz(nz);
+ COEFF_LAST_E2K(16, nz) nz ^= 0xffff;
+ if (nz) return 47 - __builtin_clz(nz);
+ COEFF_LAST_E2K(0, nz) nz ^= 0xffff;
+ return nz ? 31 - __builtin_clz(nz) : -1;
+}
+
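+/* coeff_level_run: the nonzero mask is shifted up with a sentinel bit
+ * planted below it (nz << 2 | 2, or | 4 in the AC case so the DC bit
+ * merges with the sentinel); the clz loop then emits levels from the last
+ * coefficient downwards and terminates by driving 'last' negative. */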
+int x264_coeff_level_run4_e2k(dctcoef *dct, x264_run_level_t *runlevel) {
+ __m64 v0 = *(__m64*)dct;
+ uint32_t nz; int total = 0, n, last;
+ v0 = _mm_packs_pi16(v0, v0);
+ v0 = _mm_cmpeq_pi8(v0, _mm_setzero_si64());
+ nz = ~_mm_movemask_pi8(v0) & 0xf;
+ runlevel->mask = nz;
+ nz = nz << 2 | 2;
+ n = __builtin_clz(nz);
+ nz <<= 1 + n;
+ runlevel->last = last = 29 - n;
+ do {
+ nz <<= n = __builtin_clz(nz) + 1;
+ runlevel->level[total++] = dct[last];
+ } while ((last -= n) >= 0);
+ return total;
+}
+
+int x264_coeff_level_run8_e2k(dctcoef *dct, x264_run_level_t *runlevel) {
+ __m64 v0 = *(__m64*)dct, v1 = *(__m64*)(dct + 4);
+ uint32_t nz; int total = 0, n, last;
+ v0 = _mm_packs_pi16(v0, v1);
+ v0 = _mm_cmpeq_pi8(v0, _mm_setzero_si64());
+ nz = _mm_movemask_pi8(v0) ^ 0xff;
+ runlevel->mask = nz;
+ nz = nz << 2 | 2;
+ n = __builtin_clz(nz);
+ nz <<= 1 + n;
+ runlevel->last = last = 29 - n;
+ do {
+ nz <<= n = __builtin_clz(nz) + 1;
+ runlevel->level[total++] = dct[last];
+ } while ((last -= n) >= 0);
+ return total;
+}
+
+int x264_coeff_level_run15_e2k(dctcoef *dct, x264_run_level_t *runlevel) {
+ __m128i v0, v1, c0 = _mm_setzero_si128();
+ uint32_t nz; int total = 0, n, last;
+ COEFF_LAST_E2K(-1, nz)
+ nz ^= 0xffff;
+ runlevel->mask = nz >> 1;
+ nz = nz << 2 | 4;
+ n = __builtin_clz(nz);
+ nz <<= 1 + n;
+ runlevel->last = last = 28 - n;
+ do {
+ nz <<= n = __builtin_clz(nz) + 1;
+ runlevel->level[total++] = dct[last];
+ } while ((last -= n) >= 0);
+ return total;
+}
+
+int x264_coeff_level_run16_e2k(dctcoef *dct, x264_run_level_t *runlevel) {
+ __m128i v0, v1, c0 = _mm_setzero_si128();
+ uint32_t nz; int total = 0, n, last;
+ COEFF_LAST_E2K(0, nz)
+ nz ^= 0xffff;
+ runlevel->mask = nz;
+ nz = nz << 2 | 2;
+ n = __builtin_clz(nz);
+ nz <<= 1 + n;
+ runlevel->last = last = 29 - n;
+ do {
+ nz <<= n = __builtin_clz(nz) + 1;
+ runlevel->level[total++] = dct[last];
+ } while ((last -= n) >= 0);
+ return total;
+}
+
+#endif // !HIGH_BIT_DEPTH
+
diff --git a/common/e2k/quant.h b/common/e2k/quant.h
new file mode 100644
index 0000000..1050f9d
--- /dev/null
+++ b/common/e2k/quant.h
@@ -0,0 +1,73 @@
+/*****************************************************************************
+ * quant.h: e2k quantization
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov <jpegqs@gmail.com> for BaseALT, Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_E2K_QUANT_H
+#define X264_E2K_QUANT_H
+
+#define x264_quant_2x2_dc_e2k x264_template(quant_2x2_dc_e2k)
+int x264_quant_2x2_dc_e2k(int16_t dct[4], int mf, int bias);
+#define x264_quant_4x4_e2k x264_template(quant_4x4_e2k)
+int x264_quant_4x4_e2k(int16_t dct[16], uint16_t mf[16], uint16_t bias[16]);
+#define x264_quant_4x4_dc_e2k x264_template(quant_4x4_dc_e2k)
+int x264_quant_4x4_dc_e2k(int16_t dct[16], int mf, int bias);
+#define x264_quant_4x4x4_e2k x264_template(quant_4x4x4_e2k)
+int x264_quant_4x4x4_e2k(int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16]);
+#define x264_quant_8x8_e2k x264_template(quant_8x8_e2k)
+int x264_quant_8x8_e2k(int16_t dct[64], uint16_t mf[64], uint16_t bias[64]);
+
+#define x264_dequant_4x4_e2k x264_template(dequant_4x4_e2k)
+void x264_dequant_4x4_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp);
+#define x264_dequant_4x4_dc_e2k x264_template(dequant_4x4_dc_e2k)
+void x264_dequant_4x4_dc_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp);
+#define x264_dequant_8x8_e2k x264_template(dequant_8x8_e2k)
+void x264_dequant_8x8_e2k(int16_t dct[64], int dequant_mf[6][64], int i_qp);
+
+#define x264_denoise_dct_e2k x264_template(denoise_dct_e2k)
+void x264_denoise_dct_e2k(int16_t *dct, uint32_t *sum, uint16_t *offset, int size);
+#define x264_decimate_score15_e2k x264_template(decimate_score15_e2k)
+int x264_decimate_score15_e2k(dctcoef *dct);
+#define x264_decimate_score16_e2k x264_template(decimate_score16_e2k)
+int x264_decimate_score16_e2k(dctcoef *dct);
+#define x264_decimate_score64_e2k x264_template(decimate_score64_e2k)
+int x264_decimate_score64_e2k(dctcoef *dct);
+
+#define x264_coeff_last4_e2k x264_template(coeff_last4_e2k)
+int x264_coeff_last4_e2k(dctcoef *dct);
+#define x264_coeff_last8_e2k x264_template(coeff_last8_e2k)
+int x264_coeff_last8_e2k(dctcoef *dct);
+#define x264_coeff_last15_e2k x264_template(coeff_last15_e2k)
+int x264_coeff_last15_e2k(dctcoef *dct);
+#define x264_coeff_last16_e2k x264_template(coeff_last16_e2k)
+int x264_coeff_last16_e2k(dctcoef *dct);
+#define x264_coeff_last64_e2k x264_template(coeff_last64_e2k)
+int x264_coeff_last64_e2k(dctcoef *dct);
+#define x264_coeff_level_run4_e2k x264_template(coeff_level_run4_e2k)
+int x264_coeff_level_run4_e2k(dctcoef *dct, x264_run_level_t *runlevel);
+#define x264_coeff_level_run8_e2k x264_template(coeff_level_run8_e2k)
+int x264_coeff_level_run8_e2k(dctcoef *dct, x264_run_level_t *runlevel);
+#define x264_coeff_level_run15_e2k x264_template(coeff_level_run15_e2k)
+int x264_coeff_level_run15_e2k(dctcoef *dct, x264_run_level_t *runlevel);
+#define x264_coeff_level_run16_e2k x264_template(coeff_level_run16_e2k)
+int x264_coeff_level_run16_e2k(dctcoef *dct, x264_run_level_t *runlevel);
+
+#endif
diff --git a/common/mc.c b/common/mc.c
index d5e6d55..eb6b7be 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -32,6 +32,9 @@
#if HAVE_ALTIVEC
#include "ppc/mc.h"
#endif
+#if ARCH_E2K
+#include "e2k/mc.h"
+#endif
#if HAVE_ARMV6
#include "arm/mc.h"
#endif
@@ -677,6 +680,10 @@ void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent )
if( cpu&X264_CPU_ALTIVEC )
x264_mc_init_altivec( pf );
#endif
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ x264_mc_init_e2k( pf );
+#endif
#if HAVE_ARMV6
x264_mc_init_arm( cpu, pf );
#endif
diff --git a/common/pixel.c b/common/pixel.c
index 113df30..d4757e9 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -34,6 +34,9 @@
#if HAVE_ALTIVEC
# include "ppc/pixel.h"
#endif
+#if ARCH_E2K
+# include "e2k/pixel.h"
+#endif
#if HAVE_ARMV6
# include "arm/pixel.h"
# include "arm/predict.h"
@@ -1506,6 +1509,12 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
x264_pixel_init_altivec( pixf );
}
#endif
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ {
+ x264_pixel_init_e2k( pixf );
+ }
+#endif
pixf->ads[PIXEL_8x16] =
pixf->ads[PIXEL_8x4] =
diff --git a/common/predict.c b/common/predict.c
index 5a48198..f7e1f2f 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -37,6 +37,9 @@
#if HAVE_ALTIVEC
# include "ppc/predict.h"
#endif
+#if ARCH_E2K
+# include "e2k/predict.h"
+#endif
#if HAVE_ARMV6
# include "arm/predict.h"
#endif
@@ -902,6 +905,11 @@ void x264_predict_16x16_init( uint32_t cpu, x264_predict_t pf[7] )
x264_predict_16x16_init_altivec( pf );
#endif
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ x264_predict_16x16_init_e2k( pf );
+#endif
+
#if HAVE_ARMV6
x264_predict_16x16_init_arm( cpu, pf );
#endif
diff --git a/common/quant.c b/common/quant.c
index 6b71225..b5cefa0 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -34,6 +34,9 @@
#if HAVE_ALTIVEC
# include "ppc/quant.h"
#endif
+#if ARCH_E2K
+# include "e2k/quant.h"
+#endif
#if HAVE_ARMV6
# include "arm/quant.h"
#endif
@@ -745,6 +748,36 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
}
#endif
+#if HAVE_E2K
+ if( cpu&X264_CPU_E2K )
+ {
+ pf->quant_2x2_dc = x264_quant_2x2_dc_e2k;
+ pf->quant_4x4 = x264_quant_4x4_e2k;
+ pf->quant_4x4_dc = x264_quant_4x4_dc_e2k;
+ pf->quant_4x4x4 = x264_quant_4x4x4_e2k;
+ pf->quant_8x8 = x264_quant_8x8_e2k;
+
+ pf->dequant_4x4 = x264_dequant_4x4_e2k;
+ pf->dequant_4x4_dc = x264_dequant_4x4_dc_e2k;
+ pf->dequant_8x8 = x264_dequant_8x8_e2k;
+
+ pf->denoise_dct = x264_denoise_dct_e2k;
+ pf->decimate_score15 = x264_decimate_score15_e2k;
+ pf->decimate_score16 = x264_decimate_score16_e2k;
+ pf->decimate_score64 = x264_decimate_score64_e2k;
+
+ pf->coeff_last4 = x264_coeff_last4_e2k;
+ pf->coeff_last8 = x264_coeff_last8_e2k;
+ pf->coeff_last[DCT_LUMA_AC ] = x264_coeff_last15_e2k;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_e2k;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_e2k;
+ pf->coeff_level_run4 = x264_coeff_level_run4_e2k;
+ pf->coeff_level_run8 = x264_coeff_level_run8_e2k;
+ pf->coeff_level_run[DCT_LUMA_AC ] = x264_coeff_level_run15_e2k;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_e2k;
+ }
+#endif
+
#if HAVE_ARMV6
if( cpu&X264_CPU_ARMV6 )
{
diff --git a/configure b/configure
index e242e73..22e7583 100755
--- a/configure
+++ b/configure
@@ -409,7 +409,7 @@ NL="
"
# list of all preprocessor HAVE values we can define
-CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON AARCH64 BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \
+CONFIG_HAVE="MALLOC_H E2K ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON AARCH64 BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \
LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \
MSA MMAP WINRT VSX ARM_INLINE_ASM STRTOK_R CLOCK_GETTIME BITDEPTH8 BITDEPTH10"
@@ -814,6 +814,15 @@ case $host_cpu in
fi
fi
;;
+ e2k)
+ ARCH="E2K"
+ if [ $asm = auto ] ; then
+ define HAVE_E2K
+ CFLAGS="$CFLAGS -msse4.1"
+ AS="${AS-${CC}}"
+ AS_EXT=".c"
+ fi
+ ;;
sparc)
ARCH="SPARC"
;;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 829f3e2..f857e43 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -100,6 +100,10 @@ static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;
#define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )
+#if ARCH_E2K
+#include <x86intrin.h>
+#endif
+
static inline uint32_t read_time(void)
{
uint32_t a = 0;
@@ -109,6 +113,9 @@ static inline uint32_t read_time(void)
: "=a"(a) :: "edx", "memory" );
#elif ARCH_PPC
asm volatile( "mftb %0" : "=r"(a) :: "memory" );
+#elif ARCH_E2K
+ unsigned aux;
+ a = __rdtscp(&aux);
#elif HAVE_ARM_INLINE_ASM // ARMv7 only
asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
#elif ARCH_AARCH64
@@ -207,6 +214,8 @@ static void print_bench(void)
b->cpu&X264_CPU_MMX ? "mmx" :
#elif ARCH_PPC
b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+#elif ARCH_E2K
+ b->cpu&X264_CPU_E2K ? "e2k" :
#elif ARCH_ARM
b->cpu&X264_CPU_NEON ? "neon" :
b->cpu&X264_CPU_ARMV6 ? "armv6" :
@@ -2959,6 +2968,12 @@ static int check_all_flags( void )
fprintf( stderr, "x264: ALTIVEC against C\n" );
ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
}
+#elif ARCH_E2K
+ if( cpu_detect & X264_CPU_E2K )
+ {
+ fprintf( stderr, "x264: E2K against C\n" );
+ ret = check_all_funcs( 0, X264_CPU_E2K );
+ }
#elif ARCH_ARM
if( cpu_detect & X264_CPU_NEON )
x264_checkasm_call = x264_checkasm_call_neon;
@@ -2989,7 +3004,7 @@ REALIGN_STACK int main( int argc, char **argv )
if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
{
-#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS
+#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_E2K && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS
fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
return 1;
#endif
diff --git a/x264.h b/x264.h
index 5aa2a8d..ce501ef 100644
--- a/x264.h
+++ b/x264.h
@@ -172,6 +172,9 @@ typedef struct x264_nal_t
/* PowerPC */
#define X264_CPU_ALTIVEC 0x0000001U
+/* Elbrus */
+#define X264_CPU_E2K 0x0000001U
+
/* ARM and AArch64 */
#define X264_CPU_ARMV6 0x0000001U
#define X264_CPU_NEON 0x0000002U /* ARM NEON */
--
2.34.1