From bcac03930d8f5193665086201493e6096ec0cf93 Mon Sep 17 00:00:00 2001 From: Ilya Kurdyukov Date: Tue, 30 May 2023 23:24:57 +0700 Subject: [PATCH] x264-164 e2k support --- Makefile | 10 + common/common.h | 12 + common/cpu.c | 9 + common/dct.c | 43 ++ common/deblock.c | 11 + common/e2k/dct.c | 829 ++++++++++++++++++++++++ common/e2k/dct.h | 69 ++ common/e2k/deblock.c | 236 +++++++ common/e2k/deblock.h | 32 + common/e2k/e2kcommon.h | 253 ++++++++ common/e2k/mc.c | 1401 ++++++++++++++++++++++++++++++++++++++++ common/e2k/mc.h | 30 + common/e2k/pixel.c | 1169 +++++++++++++++++++++++++++++++++ common/e2k/pixel.h | 30 + common/e2k/predict.c | 187 ++++++ common/e2k/predict.h | 32 + common/e2k/quant.c | 436 +++++++++++++ common/e2k/quant.h | 73 +++ common/mc.c | 7 + common/pixel.c | 9 + common/predict.c | 8 + common/quant.c | 33 + configure | 11 +- tools/checkasm.c | 17 +- x264.h | 3 + 27 files changed, 4952 insertions(+), 3 deletions(-) create mode 100644 common/e2k/dct.c create mode 100644 common/e2k/dct.h create mode 100644 common/e2k/deblock.c create mode 100644 common/e2k/deblock.h create mode 100644 common/e2k/e2kcommon.h create mode 100644 common/e2k/mc.c create mode 100644 common/e2k/mc.h create mode 100644 common/e2k/pixel.c create mode 100644 common/e2k/pixel.h create mode 100644 common/e2k/predict.c create mode 100644 common/e2k/predict.h create mode 100644 common/e2k/quant.c create mode 100644 common/e2k/quant.h diff --git a/Makefile b/Makefile index 0ba46aa..f95021f 100644 --- a/Makefile +++ b/Makefile @@ -137,6 +137,16 @@ SRCS_X += common/ppc/dct.c \ common/ppc/quant.c endif +# Elbrus optims +ifeq ($(SYS_ARCH),E2K) +SRCS_X += common/e2k/dct.c \ + common/e2k/deblock.c \ + common/e2k/mc.c \ + common/e2k/pixel.c \ + common/e2k/predict.c \ + common/e2k/quant.c +endif + # NEON optims ifeq ($(SYS_ARCH),ARM) SRCASM_X = common/arm/bitstream-a.S \ diff --git a/common/common.h b/common/common.h index ebc87ce..fe22ac7 100644 --- a/common/common.h +++ b/common/common.h @@ -808,6 +808,18 @@ static ALWAYS_INLINE int x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[ #include "x86/util.h" #endif +#if ARCH_E2K +#include + +#undef M128_ZERO +#define M128_ZERO ((__m128){0,0,0,0}) +#define x264_union128_t x264_union128_sse_t +typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t; +#if HAVE_VECTOREXT +typedef uint32_t v4si __attribute__((vector_size (16))); +#endif +#endif + #include "rectangle.h" #endif diff --git a/common/cpu.c b/common/cpu.c index 41a4c1d..dcd8731 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -89,6 +89,8 @@ const x264_cpu_name_t x264_cpu_names[] = {"UnalignedStack", X264_CPU_STACK_MOD4}, #elif ARCH_PPC {"Altivec", X264_CPU_ALTIVEC}, +#elif ARCH_E2K + {"Elbrus", X264_CPU_E2K}, #elif ARCH_ARM {"ARMv6", X264_CPU_ARMV6}, {"NEON", X264_CPU_NEON}, @@ -360,6 +362,13 @@ uint32_t x264_cpu_detect( void ) } #endif +#elif ARCH_E2K + +uint32_t x264_cpu_detect( void ) +{ + return X264_CPU_E2K; +} + #elif HAVE_ARMV6 void x264_cpu_neon_test( void ); diff --git a/common/dct.c b/common/dct.c index 90997ca..6a3a8d4 100644 --- a/common/dct.c +++ b/common/dct.c @@ -32,6 +32,9 @@ #if HAVE_ALTIVEC # include "ppc/dct.h" #endif +#if ARCH_E2K +# include "e2k/dct.h" +#endif #if HAVE_ARMV6 # include "arm/dct.h" #endif @@ -682,6 +685,30 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf ) } #endif +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_e2k; + dctf->sub8x8_dct = x264_sub8x8_dct_e2k; + 
dctf->sub16x16_dct = x264_sub16x16_dct_e2k; + + dctf->add8x8_idct_dc = x264_add8x8_idct_dc_e2k; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_e2k; + + dctf->add4x4_idct = x264_add4x4_idct_e2k; + dctf->add8x8_idct = x264_add8x8_idct_e2k; + dctf->add16x16_idct = x264_add16x16_idct_e2k; + + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_e2k; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_e2k; + dctf->sub8x8_dct8 = x264_sub8x8_dct8_e2k; + dctf->sub16x16_dct8 = x264_sub16x16_dct8_e2k; + + dctf->add8x8_idct8 = x264_add8x8_idct8_e2k; + dctf->add16x16_idct8= x264_add16x16_idct8_e2k; + } +#endif + #if HAVE_ARMV6 || HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { @@ -996,6 +1023,14 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec; } #endif +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + { + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_e2k; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_e2k; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_e2k; + } +#endif #if HAVE_ARMV6 || HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { @@ -1081,6 +1116,14 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26 } #endif // HAVE_ALTIVEC +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_e2k; + } +#endif // HAVE_ALTIVEC + #if HAVE_MSA if( cpu&X264_CPU_MSA ) { diff --git a/common/deblock.c b/common/deblock.c index 2607b98..f8649ab 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -671,6 +671,9 @@ void x264_macroblock_deblock( x264_t *h ) #if HAVE_ALTIVEC #include "ppc/deblock.h" #endif +#if ARCH_E2K +#include "e2k/deblock.h" +#endif #if HAVE_ARMV6 #include "arm/deblock.h" #endif @@ -783,6 +786,14 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff ) } #endif // HAVE_ALTIVEC +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + { + pf->deblock_luma[1] = x264_deblock_v_luma_e2k; + pf->deblock_luma[0] = x264_deblock_h_luma_e2k; + } +#endif + #if HAVE_ARMV6 || HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { diff --git a/common/e2k/dct.c b/common/e2k/dct.c new file mode 100644 index 0000000..e9709aa --- /dev/null +++ b/common/e2k/dct.c @@ -0,0 +1,829 @@ +/***************************************************************************** + * dct.c: e2k transform and zigzag + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * Copyright (C) 2003-2017 x264 project + * + * Authors: Guillaume Poirier + * Eric Petit + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "e2kcommon.h" +#include "dct.h" + +#if !HIGH_BIT_DEPTH +#define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \ + b1 = _mm_add_epi16(a0, a3); \ + b3 = _mm_add_epi16(a1, a2); \ + b0 = _mm_add_epi16(b1, b3); \ + b2 = _mm_sub_epi16(b1, b3); \ + a0 = _mm_sub_epi16(a0, a3); \ + a1 = _mm_sub_epi16(a1, a2); \ + b1 = _mm_add_epi16(a0, a0); \ + b1 = _mm_add_epi16(b1, a1); \ + b3 = _mm_sub_epi16(a0, a1); \ + b3 = _mm_sub_epi16(b3, a1) + +void x264_sub4x4_dct_e2k(int16_t dct[16], uint8_t *pix1, uint8_t *pix2) { + PREP_DIFF_8BYTEALIGNED; + vec_s16_t dct0v, dct1v, dct2v, dct3v; + vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v; + + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v); + VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v); + VEC_TRANSPOSE_4(tmp0v, tmp1v, tmp2v, tmp3v, + dct0v, dct1v, dct2v, dct3v); + VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v); + + VEC_ST(dct, _mm_unpacklo_epi64(tmp0v, tmp1v)); + VEC_ST(dct + 8, _mm_unpacklo_epi64(tmp2v, tmp3v)); +} + +void x264_sub8x8_dct_e2k(int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2) { + PREP_DIFF_8BYTEALIGNED; + vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v; + vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v; + + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v); + VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v); + VEC_DCT(dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v); + VEC_TRANSPOSE_8(tmp0v, tmp1v, tmp2v, tmp3v, + tmp4v, tmp5v, tmp6v, tmp7v, + dct0v, dct1v, dct2v, dct3v, + dct4v, dct5v, dct6v, dct7v); + + VEC_DCT(dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v); + VEC_DCT(dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v); + + VEC_ST(&dct[0][0], _mm_unpacklo_epi64(tmp0v, tmp1v)); + VEC_ST(&dct[0][8], _mm_unpacklo_epi64(tmp2v, tmp3v)); + VEC_ST(&dct[1][0], _mm_unpacklo_epi64(tmp4v, tmp5v)); + VEC_ST(&dct[1][8], _mm_unpacklo_epi64(tmp6v, tmp7v)); + VEC_ST(&dct[2][0], _mm_unpackhi_epi64(tmp0v, tmp1v)); + VEC_ST(&dct[2][8], _mm_unpackhi_epi64(tmp2v, tmp3v)); + VEC_ST(&dct[3][0], _mm_unpackhi_epi64(tmp4v, tmp5v)); + VEC_ST(&dct[3][8], _mm_unpackhi_epi64(tmp6v, tmp7v)); +} + +void x264_sub16x16_dct_e2k(int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2) { + x264_sub8x8_dct_e2k(&dct[0], &pix1[0], &pix2[0]); + x264_sub8x8_dct_e2k(&dct[4], &pix1[8], &pix2[8]); + x264_sub8x8_dct_e2k(&dct[8], &pix1[8 * FENC_STRIDE], &pix2[8 * FDEC_STRIDE]); + x264_sub8x8_dct_e2k(&dct[12], &pix1[8 * FENC_STRIDE + 8], &pix2[8 * FDEC_STRIDE + 8]); +} + +/*************************************************************************** + * 8x8 transform: + 
***************************************************************************/ + +#define PIX_DIFF_ITER(j) \ + pix1v = vec_u8_to_s16(VEC_LD8(p1 + j * FENC_STRIDE)); \ + pix2v = vec_u8_to_s16(VEC_LD8(p2 + j * FDEC_STRIDE)); \ + tmp##j = _mm_sub_epi16(pix1v, pix2v) + +static ALWAYS_INLINE vec_s16_t pix_diff(uint8_t *p1, uint8_t *p2) { + vec_s16_t pix1v, pix2v, tmp0, tmp1, tmp2, tmp3; + LOAD_ZERO; + + PIX_DIFF_ITER(0); + PIX_DIFF_ITER(1); + PIX_DIFF_ITER(2); + PIX_DIFF_ITER(3); + + pix1v = _mm_add_epi16(tmp0, tmp1); + pix2v = _mm_add_epi16(tmp2, tmp3); + return _mm_add_epi16(pix1v, pix2v); +} + +void x264_sub8x8_dct_dc_e2k(int16_t dct[4], uint8_t *pix1, uint8_t *pix2) { + __m128i v0, v1; __m64 h0, h1; + + v0 = pix_diff(pix1, pix2); + v1 = pix_diff(pix1 + 4 * FENC_STRIDE, pix2 + 4 * FDEC_STRIDE); + + v0 = _mm_hadds_epi16(v0, v1); + v0 = _mm_hadds_epi16(v0, v0); + + /* 2x2 DC transform */ + h1 = _mm_setr_pi16(1, -1, 1, -1); + h0 = _mm_movepi64_pi64(v0); + h0 = _mm_hadds_pi16(h0, _mm_sign_pi16(h0, h1)); + h0 = _mm_hadds_pi16(h0, _mm_sign_pi16(h0, h1)); + h0 = _mm_shuffle_pi16(h0, 0xd8); + *(__m64*)dct = h0; +} + +void x264_sub8x16_dct_dc_e2k(int16_t dct[8], uint8_t *pix1, uint8_t *pix2) { + __m128i v0, v1, v2, v3; + + v0 = pix_diff(pix1, pix2); + v1 = pix_diff(pix1 + 4 * FENC_STRIDE, pix2 + 4 * FDEC_STRIDE); + v2 = pix_diff(pix1 + 8 * FENC_STRIDE, pix2 + 8 * FDEC_STRIDE); + v3 = pix_diff(pix1 + 12 * FENC_STRIDE, pix2 + 12 * FDEC_STRIDE); + + v0 = _mm_hadds_epi16(v0, v1); + v2 = _mm_hadds_epi16(v2, v3); + v0 = _mm_hadds_epi16(v0, v2); + + /* 2x4 DC transform */ + v1 = _mm_set1_epi32(-1 << 16 | 1); + v0 = _mm_hadd_epi16(v0, _mm_sign_epi16(v0, v1)); + v0 = _mm_hadd_epi16(v0, _mm_sign_epi16(v0, v1)); + v0 = _mm_hadd_epi16(v0, _mm_sign_epi16(v0, v1)); + VEC_ST(dct, _mm_shuffle_epi32(v0, 0x78)); +} + +/* DCT8_1D unrolled by 8 */ +#define DCT8_1D_E2K(dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v) { \ + /* int s07 = SRC(0) + SRC(7); */ \ + /* int s16 = SRC(1) + SRC(6); */ \ + /* int s25 = SRC(2) + SRC(5); */ \ + /* int s34 = SRC(3) + SRC(4); */ \ + vec_s16_t s07v = _mm_add_epi16(dct0v, dct7v); \ + vec_s16_t s16v = _mm_add_epi16(dct1v, dct6v); \ + vec_s16_t s25v = _mm_add_epi16(dct2v, dct5v); \ + vec_s16_t s34v = _mm_add_epi16(dct3v, dct4v); \ +\ + /* int a0 = s07 + s34; */ \ + /* int a1 = s16 + s25; */ \ + /* int a2 = s07 - s34; */ \ + /* int a3 = s16 - s25; */ \ + vec_s16_t a0v = _mm_add_epi16(s07v, s34v); \ + vec_s16_t a1v = _mm_add_epi16(s16v, s25v); \ + vec_s16_t a2v = _mm_sub_epi16(s07v, s34v); \ + vec_s16_t a3v = _mm_sub_epi16(s16v, s25v); \ +\ + /* int d07 = SRC(0) - SRC(7); */ \ + /* int d16 = SRC(1) - SRC(6); */ \ + /* int d25 = SRC(2) - SRC(5); */ \ + /* int d34 = SRC(3) - SRC(4); */ \ + vec_s16_t d07v = _mm_sub_epi16(dct0v, dct7v); \ + vec_s16_t d16v = _mm_sub_epi16(dct1v, dct6v); \ + vec_s16_t d25v = _mm_sub_epi16(dct2v, dct5v); \ + vec_s16_t d34v = _mm_sub_epi16(dct3v, dct4v); \ +\ + /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \ + /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \ + /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \ + /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \ + vec_s16_t a4v = _mm_add_epi16(_mm_add_epi16(d16v, d25v), _mm_add_epi16(d07v, _mm_srai_epi16(d07v, 1))); \ + vec_s16_t a5v = _mm_sub_epi16(_mm_sub_epi16(d07v, d34v), _mm_add_epi16(d25v, _mm_srai_epi16(d25v, 1))); \ + vec_s16_t a6v = _mm_sub_epi16(_mm_add_epi16(d07v, d34v), _mm_add_epi16(d16v, _mm_srai_epi16(d16v, 1))); \ + vec_s16_t a7v = _mm_add_epi16(_mm_sub_epi16(d16v, d25v), _mm_add_epi16(d34v, 
_mm_srai_epi16(d34v, 1))); \ +\ + /* DST(0) = a0 + a1; */ \ + /* DST(1) = a4 + (a7>>2); */ \ + /* DST(2) = a2 + (a3>>1); */ \ + /* DST(3) = a5 + (a6>>2); */ \ + dct0v = _mm_add_epi16(a0v, a1v); \ + dct1v = _mm_add_epi16(a4v, _mm_srai_epi16(a7v, 2)); \ + dct2v = _mm_add_epi16(a2v, _mm_srai_epi16(a3v, 1)); \ + dct3v = _mm_add_epi16(a5v, _mm_srai_epi16(a6v, 2)); \ + /* DST(4) = a0 - a1; */ \ + /* DST(5) = a6 - (a5>>2); */ \ + /* DST(6) = (a2>>1) - a3; */ \ + /* DST(7) = (a4>>2) - a7; */ \ + dct4v = _mm_sub_epi16(a0v, a1v); \ + dct5v = _mm_sub_epi16(a6v, _mm_srai_epi16(a5v, 2)); \ + dct6v = _mm_sub_epi16(_mm_srai_epi16(a2v, 1), a3v); \ + dct7v = _mm_sub_epi16(_mm_srai_epi16(a4v, 2), a7v); \ +} + + +void x264_sub8x8_dct8_e2k(int16_t dct[64], uint8_t *pix1, uint8_t *pix2) { + PREP_DIFF_8BYTEALIGNED; + + vec_s16_t dct0v, dct1v, dct2v, dct3v, + dct4v, dct5v, dct6v, dct7v; + + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v); + + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v); + VEC_DIFF_H_8BYTE_ALIGNED(pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v); + + DCT8_1D_E2K(dct0v, dct1v, dct2v, dct3v, + dct4v, dct5v, dct6v, dct7v); + + vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v, + dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v; + + VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v, + dct4v, dct5v, dct6v, dct7v, + dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v, + dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v); + + DCT8_1D_E2K(dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v, + dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v); + + VEC_ST(dct, dct_tr0v); + VEC_ST(dct + 8, dct_tr1v); + VEC_ST(dct + 16, dct_tr2v); + VEC_ST(dct + 24, dct_tr3v); + VEC_ST(dct + 32, dct_tr4v); + VEC_ST(dct + 40, dct_tr5v); + VEC_ST(dct + 48, dct_tr6v); + VEC_ST(dct + 56, dct_tr7v); +} + +void x264_sub16x16_dct8_e2k(int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2) { + x264_sub8x8_dct8_e2k(dct[0], &pix1[0], &pix2[0]); + x264_sub8x8_dct8_e2k(dct[1], &pix1[8], &pix2[8]); + x264_sub8x8_dct8_e2k(dct[2], &pix1[8 * FENC_STRIDE], &pix2[8 * FDEC_STRIDE]); + x264_sub8x8_dct8_e2k(dct[3], &pix1[8 * FENC_STRIDE + 8], &pix2[8 * FDEC_STRIDE + 8]); +} + + +/**************************************************************************** + * IDCT transform: + ****************************************************************************/ + +#define E2K_DC_SUM8_CLIP(i, c0) { \ + /* unaligned load */ \ + t0 = VEC_LD8(&p_dst[i * FDEC_STRIDE]); \ + t1 = VEC_LD8(&p_dst[(i + 1) * FDEC_STRIDE]); \ + t0 = _mm_adds_epi16(c0, _mm_unpacklo_epi8(t0, zerov)); \ + t1 = _mm_adds_epi16(c0, _mm_unpacklo_epi8(t1, zerov)); \ + v##i = _mm_packus_epi16(t0, t1); \ +} + +void x264_add8x8_idct_dc_e2k(uint8_t *p_dst, int16_t dct[4]) { + LOAD_ZERO; + __m128i c0, c1, v0, v2, v4, v6, t0, t1; + __m64 h0 = *(__m64*)dct; + h0 = _mm_srai_pi16(_mm_add_pi16(h0, _mm_set1_pi16(32)), 6); + t0 = _mm_movpi64_epi64(h0); + t0 = _mm_unpacklo_epi16(t0, t0); + c0 = _mm_unpacklo_epi16(t0, t0); + c1 = _mm_unpackhi_epi16(t0, t0); + + E2K_DC_SUM8_CLIP(0, c0); + E2K_DC_SUM8_CLIP(2, c0); + E2K_DC_SUM8_CLIP(4, c1); + E2K_DC_SUM8_CLIP(6, c1); + + VEC_STL(&p_dst[0 * FDEC_STRIDE], v0); + VEC_STH(&p_dst[1 * FDEC_STRIDE], v0); + 
VEC_STL(&p_dst[2 * FDEC_STRIDE], v2); + VEC_STH(&p_dst[3 * FDEC_STRIDE], v2); + VEC_STL(&p_dst[4 * FDEC_STRIDE], v4); + VEC_STH(&p_dst[5 * FDEC_STRIDE], v4); + VEC_STL(&p_dst[6 * FDEC_STRIDE], v6); + VEC_STH(&p_dst[7 * FDEC_STRIDE], v6); +} + +#define E2K_DC_SUM16_CLIP(i) { \ + /* unaligned load */ \ + t1 = VEC_LD(&p_dst[i * FDEC_STRIDE]); \ + t0 = _mm_adds_epi16(c0, _mm_unpacklo_epi8(t1, zerov)); \ + t1 = _mm_adds_epi16(c1, _mm_unpackhi_epi8(t1, zerov)); \ + v##i = _mm_packus_epi16(t0, t1); \ +} + +void x264_add16x16_idct_dc_e2k(uint8_t *p_dst, int16_t dct[16]) { + int i; + LOAD_ZERO; + + for (i = 0; i < 4; i++, dct += 4, p_dst += 4 * FDEC_STRIDE) { + __m128i c0, c1, v0, v1, v2, v3, t0, t1; + __m64 h0 = *(__m64*)dct; + h0 = _mm_srai_pi16(_mm_add_pi16(h0, _mm_set1_pi16(32)), 6); + t0 = _mm_movpi64_epi64(h0); + t0 = _mm_unpacklo_epi16(t0, t0); + c0 = _mm_unpacklo_epi16(t0, t0); + c1 = _mm_unpackhi_epi16(t0, t0); + E2K_DC_SUM16_CLIP(0); + E2K_DC_SUM16_CLIP(1); + E2K_DC_SUM16_CLIP(2); + E2K_DC_SUM16_CLIP(3); + VEC_ST(&p_dst[0 * FDEC_STRIDE], v0); + VEC_ST(&p_dst[1 * FDEC_STRIDE], v1); + VEC_ST(&p_dst[2 * FDEC_STRIDE], v2); + VEC_ST(&p_dst[3 * FDEC_STRIDE], v3); + } +} + +#define IDCT_1D_E2K(s0, s1, s2, s3, d0, d1, d2, d3) { \ + /* a0 = SRC(0) + SRC(2); */ \ + /* a1 = SRC(0) - SRC(2); */ \ + /* a2 = (SRC(1)>>1) - SRC(3); */ \ + /* a3 = (SRC(3)>>1) + SRC(1); */ \ + vec_s16_t a0v = _mm_add_epi16(s0, s2); \ + vec_s16_t a1v = _mm_sub_epi16(s0, s2); \ + vec_s16_t a2v = _mm_sub_epi16(_mm_srai_epi16(s1, 1), s3); \ + vec_s16_t a3v = _mm_add_epi16(_mm_srai_epi16(s3, 1), s1); \ + /* DST(0, a0 + a3); */ \ + /* DST(1, a1 + a2); */ \ + /* DST(2, a1 - a2); */ \ + /* DST(3, a0 - a3); */ \ + d0 = _mm_add_epi16(a0v, a3v); \ + d1 = _mm_add_epi16(a1v, a2v); \ + d2 = _mm_sub_epi16(a1v, a2v); \ + d3 = _mm_sub_epi16(a0v, a3v); \ +} + +void x264_add4x4_idct_e2k(uint8_t *dst, int16_t dct[16]) { + dct[0] += 32; // rounding for the >>6 at the end + + vec_s16_t s0, s1, s2, s3; + s0 = VEC_LD(dct); + s1 = _mm_unpackhi_epi64(s0, s0); + s2 = VEC_LD(dct + 8); + s3 = _mm_unpackhi_epi64(s2, s2); + + vec_s16_t d0, d1, d2, d3, t0, t1, t2, t3; + IDCT_1D_E2K(s0, s1, s2, s3, d0, d1, d2, d3); + VEC_TRANSPOSE_4(d0, d1, d2, d3, t0, t1, t2, t3); + IDCT_1D_E2K(t0, t1, t2, t3, d0, d1, d2, d3); + + LOAD_ZERO; + t2 = _mm_setr_epi32(*(uint32_t*)(dst + 0 * FDEC_STRIDE), + *(uint32_t*)(dst + 1 * FDEC_STRIDE), + *(uint32_t*)(dst + 2 * FDEC_STRIDE), + *(uint32_t*)(dst + 3 * FDEC_STRIDE)); + + d0 = _mm_srai_epi16(_mm_unpacklo_epi64(d0, d1), 6); + d2 = _mm_srai_epi16(_mm_unpacklo_epi64(d2, d3), 6); + t0 = _mm_adds_epi16(d0, _mm_unpacklo_epi8(t2, zerov)); + t1 = _mm_adds_epi16(d2, _mm_unpackhi_epi8(t2, zerov)); + t0 = _mm_packus_epi16(t0, t1); + *(uint32_t*)(dst + 0 * FDEC_STRIDE) = _mm_extract_epi32(t0, 0); + *(uint32_t*)(dst + 1 * FDEC_STRIDE) = _mm_extract_epi32(t0, 1); + *(uint32_t*)(dst + 2 * FDEC_STRIDE) = _mm_extract_epi32(t0, 2); + *(uint32_t*)(dst + 3 * FDEC_STRIDE) = _mm_extract_epi32(t0, 3); +} + +#define VEC_TRANSPOSE_8X4(a0, a1, a2, a3, b0, b1, b2, b3) \ + b0 = _mm_unpacklo_epi16(a0, a2); \ + b1 = _mm_unpacklo_epi16(a1, a3); \ + b2 = _mm_unpackhi_epi16(a0, a2); \ + b3 = _mm_unpackhi_epi16(a1, a3); \ + a0 = _mm_unpacklo_epi16(b0, b1); \ + a1 = _mm_unpackhi_epi16(b0, b1); \ + a2 = _mm_unpacklo_epi16(b2, b3); \ + a3 = _mm_unpackhi_epi16(b2, b3); \ + b0 = _mm_unpacklo_epi64(a0, a2); \ + b1 = _mm_unpackhi_epi64(a0, a2); \ + b2 = _mm_unpacklo_epi64(a1, a3); \ + b3 = _mm_unpackhi_epi64(a1, a3) + +static ALWAYS_INLINE void 
x264_add8x4_idct_e2k(uint8_t *dst, int16_t *dct) { + vec_s16_t d0, d1, d2, d3, t0, t1, t2, t3; + d0 = VEC_LD(dct); + d1 = VEC_LD(dct + 16); + d2 = VEC_LD(dct + 8); + d3 = VEC_LD(dct + 24); + t0 = _mm_unpacklo_epi64(d0, d1); + // rounding for the >>6 at the end + t0 = _mm_add_epi16(t0, _mm_setr_epi32(32, 0, 32, 0)); + t1 = _mm_unpackhi_epi64(d0, d1); + t2 = _mm_unpacklo_epi64(d2, d3); + t3 = _mm_unpackhi_epi64(d2, d3); + + IDCT_1D_E2K(t0, t1, t2, t3, d0, d1, d2, d3); + VEC_TRANSPOSE_8X4(d0, d1, d2, d3, t0, t1, t2, t3); + IDCT_1D_E2K(t0, t1, t2, t3, d0, d1, d2, d3); + + LOAD_ZERO; + t0 = _mm_unpacklo_epi8(VEC_LD8(dst + 0 * FDEC_STRIDE), zerov); + t1 = _mm_unpacklo_epi8(VEC_LD8(dst + 1 * FDEC_STRIDE), zerov); + t2 = _mm_unpacklo_epi8(VEC_LD8(dst + 2 * FDEC_STRIDE), zerov); + t3 = _mm_unpacklo_epi8(VEC_LD8(dst + 3 * FDEC_STRIDE), zerov); + d0 = _mm_adds_epi16(_mm_srai_epi16(d0, 6), t0); + d1 = _mm_adds_epi16(_mm_srai_epi16(d1, 6), t1); + d2 = _mm_adds_epi16(_mm_srai_epi16(d2, 6), t2); + d3 = _mm_adds_epi16(_mm_srai_epi16(d3, 6), t3); + t0 = _mm_packus_epi16(d0, d1); + t1 = _mm_packus_epi16(d2, d3); + VEC_STL(dst + 0 * FDEC_STRIDE, t0); + VEC_STH(dst + 1 * FDEC_STRIDE, t0); + VEC_STL(dst + 2 * FDEC_STRIDE, t1); + VEC_STH(dst + 3 * FDEC_STRIDE, t1); +} + +void x264_add8x8_idct_e2k(uint8_t *p_dst, int16_t dct[4][16]) { + x264_add8x4_idct_e2k(p_dst, dct[0]); + x264_add8x4_idct_e2k(p_dst + 4 * FDEC_STRIDE, dct[2]); +} + +void x264_add16x16_idct_e2k(uint8_t *p_dst, int16_t dct[16][16]) { + x264_add8x8_idct_e2k(&p_dst[0], &dct[0]); + x264_add8x8_idct_e2k(&p_dst[8], &dct[4]); + x264_add8x8_idct_e2k(&p_dst[8 * FDEC_STRIDE], &dct[8]); + x264_add8x8_idct_e2k(&p_dst[8 * FDEC_STRIDE + 8], &dct[12]); +} + +#define IDCT8_1D_E2K(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) { \ + /* a0 = SRC(0) + SRC(4); */ \ + /* a2 = SRC(0) - SRC(4); */ \ + /* a4 = (SRC(2)>>1) - SRC(6); */ \ + /* a6 = (SRC(6)>>1) + SRC(2); */ \ + vec_s16_t a0v = _mm_add_epi16(s0, s4); \ + vec_s16_t a2v = _mm_sub_epi16(s0, s4); \ + vec_s16_t a4v = _mm_sub_epi16(_mm_srai_epi16(s2, 1), s6); \ + vec_s16_t a6v = _mm_add_epi16(_mm_srai_epi16(s6, 1), s2); \ + /* b0 = a0 + a6; */ \ + /* b2 = a2 + a4; */ \ + /* b4 = a2 - a4; */ \ + /* b6 = a0 - a6; */ \ + vec_s16_t b0v = _mm_add_epi16(a0v, a6v); \ + vec_s16_t b2v = _mm_add_epi16(a2v, a4v); \ + vec_s16_t b4v = _mm_sub_epi16(a2v, a4v); \ + vec_s16_t b6v = _mm_sub_epi16(a0v, a6v); \ + /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ + /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ + /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ + /* a7 = SRC(5) + SRC(3) + SRC(1) + (SRC(1)>>1); */ \ + vec_s16_t a1v = _mm_sub_epi16(_mm_sub_epi16(s5, s3), _mm_add_epi16(s7, _mm_srai_epi16(s7, 1))); \ + vec_s16_t a3v = _mm_sub_epi16(_mm_add_epi16(s7, s1), _mm_add_epi16(s3, _mm_srai_epi16(s3, 1))); \ + vec_s16_t a5v = _mm_add_epi16(_mm_sub_epi16(s7, s1), _mm_add_epi16(s5, _mm_srai_epi16(s5, 1))); \ + vec_s16_t a7v = _mm_add_epi16(_mm_add_epi16(s5, s3), _mm_add_epi16(s1, _mm_srai_epi16(s1, 1))); \ + /* b1 = (a7>>2) + a1; */ \ + /* b3 = a3 + (a5>>2); */ \ + /* b5 = (a3>>2) - a5; */ \ + /* b7 = a7 - (a1>>2); */ \ + vec_s16_t b1v = _mm_add_epi16(_mm_srai_epi16(a7v, 2), a1v); \ + vec_s16_t b3v = _mm_add_epi16(a3v, _mm_srai_epi16(a5v, 2)); \ + vec_s16_t b5v = _mm_sub_epi16(_mm_srai_epi16(a3v, 2), a5v); \ + vec_s16_t b7v = _mm_sub_epi16(a7v, _mm_srai_epi16(a1v, 2)); \ + /* DST(0, b0 + b7); */ \ + /* DST(1, b2 + b5); */ \ + /* DST(2, b4 + b3); */ \ + /* DST(3, b6 + b1); */ \ + d0 = 
_mm_add_epi16(b0v, b7v); \ + d1 = _mm_add_epi16(b2v, b5v); \ + d2 = _mm_add_epi16(b4v, b3v); \ + d3 = _mm_add_epi16(b6v, b1v); \ + /* DST(4, b6 - b1); */ \ + /* DST(5, b4 - b3); */ \ + /* DST(6, b2 - b5); */ \ + /* DST(7, b0 - b7); */ \ + d4 = _mm_sub_epi16(b6v, b1v); \ + d5 = _mm_sub_epi16(b4v, b3v); \ + d6 = _mm_sub_epi16(b2v, b5v); \ + d7 = _mm_sub_epi16(b0v, b7v); \ +} + +#define E2K_STORE_SUM_CLIP(i, d0, d1) { \ + uint8_t *ptr = dst + i * FDEC_STRIDE; \ + /* unaligned load */ \ + t0 = _mm_unpacklo_epi8(VEC_LD8(ptr), zerov); \ + t1 = _mm_unpacklo_epi8(VEC_LD8(ptr + FDEC_STRIDE), zerov); \ + d0 = _mm_adds_epi16(_mm_srai_epi16(d0, 6), t0); \ + d1 = _mm_adds_epi16(_mm_srai_epi16(d1, 6), t1); \ + d0 = _mm_packus_epi16(d0, d1); \ + /* unaligned store */ \ + VEC_STL(ptr, d0); \ + VEC_STH(ptr + FDEC_STRIDE, d0); \ +} + +void x264_add8x8_idct8_e2k(uint8_t *dst, int16_t dct[64]) { + dct[0] += 32; // rounding for the >>6 at the end + + vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; + vec_s16_t t0, t1, t2, t3, t4, t5, t6, t7; + t0 = VEC_LD(dct + 8 * 0); + t1 = VEC_LD(dct + 8 * 1); + t2 = VEC_LD(dct + 8 * 2); + t3 = VEC_LD(dct + 8 * 3); + t4 = VEC_LD(dct + 8 * 4); + t5 = VEC_LD(dct + 8 * 5); + t6 = VEC_LD(dct + 8 * 6); + t7 = VEC_LD(dct + 8 * 7); + + IDCT8_1D_E2K(t0, t1, t2, t3, t4, t5, t6, t7, + d0, d1, d2, d3, d4, d5, d6, d7); + VEC_TRANSPOSE_8(d0, d1, d2, d3, d4, d5, d6, d7, + t0, t1, t2, t3, t4, t5, t6, t7); + IDCT8_1D_E2K(t0, t1, t2, t3, t4, t5, t6, t7, + d0, d1, d2, d3, d4, d5, d6, d7); + LOAD_ZERO; + E2K_STORE_SUM_CLIP(0, d0, d1); + E2K_STORE_SUM_CLIP(2, d2, d3); + E2K_STORE_SUM_CLIP(4, d4, d5); + E2K_STORE_SUM_CLIP(6, d6, d7); +} + +void x264_add16x16_idct8_e2k(uint8_t *dst, int16_t dct[4][64]) { + x264_add8x8_idct8_e2k(&dst[0], dct[0]); + x264_add8x8_idct8_e2k(&dst[8], dct[1]); + x264_add8x8_idct8_e2k(&dst[8 * FDEC_STRIDE], dct[2]); + x264_add8x8_idct8_e2k(&dst[8 * FDEC_STRIDE + 8], dct[3]); +} + +#define SHUF16X4(a, b, c, d) _mm_setr_pi8( \ + a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1) + +#define SHUF16X8(a, b, c, d, e, f, g, h) _mm_setr_epi8( \ + a * 2, a * 2 + 1, b * 2, b * 2 + 1, c * 2, c * 2 + 1, d * 2, d * 2 + 1, \ + e * 2, e * 2 + 1, f * 2, f * 2 + 1, g * 2, g * 2 + 1, h * 2, h * 2 + 1) + +void x264_zigzag_scan_4x4_frame_e2k(int16_t level[16], int16_t dct[16]) { +#if defined(__iset__) && __iset__ >= 5 + __m128i v0, v1, v2, v3; + + v0 = VEC_LD(dct); + v1 = VEC_LD(dct + 8); + v2 = _mm_shuffle2_epi8(v0, v1, SHUF16X8(0, 4, 1, 2, 5, 8, 12, 9)); + v3 = _mm_shuffle2_epi8(v0, v1, SHUF16X8(6, 3, 7, 10, 13, 14, 11, 15)); + VEC_ST(level, v2); + VEC_ST(level + 8, v3); +#else + __m64 v0, v1, v2, v3, v4, v5, v6, v7; + + v0 = *(__m64*)dct; + v1 = *(__m64*)(dct + 4); + v2 = *(__m64*)(dct + 8); + v3 = *(__m64*)(dct + 12); + + __m64 mask = _mm_setr_pi16(-1, 0, 0, 0); + __m64 i5637 = SHUF16X4(5, 6, 3, 7); + + v4 = _mm_shuffle2_pi8(v0, v1, SHUF16X4(0, 4, 1, 2)); + v5 = _mm_shuffle2_pi8(v0, v1, i5637); + v6 = _mm_shuffle2_pi8(v2, v3, SHUF16X4(2, 0, 4, 1)); + v7 = _mm_shuffle2_pi8(v2, v3, i5637); + + v1 = _mm_blendv_pi8(v6, v5, mask); + v2 = _mm_alignr_pi8(v6, v5, 2); + + *(__m64*)level = v4; + *(__m64*)(level + 4) = v1; + *(__m64*)(level + 8) = v2; + *(__m64*)(level + 12) = v7; +#endif +} + +void x264_zigzag_scan_4x4_field_e2k(int16_t level[16], int16_t dct[16]) { + vec_s16_t dct0v, dct1v, tmp0v, tmp1v; + + dct0v = VEC_LD(dct); + dct1v = VEC_LD(dct + 8); + + vec_u8_t sel0 = SHUF16X8(0, 1, 4, 2, 3, 5, 6, 7); + + tmp0v = _mm_shuffle_epi8(dct0v, sel0); + tmp1v = dct1v; + + 
VEC_ST(level, tmp0v); + VEC_ST(level + 8, tmp1v); +} + +#define INSFI_M64(a, b, c, d) _mm_cvtsi64_m64(__builtin_e2k_insfd( \ + _mm_cvtm64_si64(a), (b & 63) | (d & 63) << 6, _mm_cvtm64_si64(c))) + +void x264_zigzag_scan_8x8_frame_e2k(int16_t level[64], int16_t dct[64]) { +#if defined(__iset__) && __iset__ >= 5 + vec_s16_t tmpv0, tmpv1, tmpv2, tmpv3, tmpv4, tmpv5; + vec_s16_t dct0v = VEC_LD(dct + 8 * 0); + vec_s16_t dct1v = VEC_LD(dct + 8 * 1); + vec_s16_t dct2v = VEC_LD(dct + 8 * 2); + vec_s16_t dct3v = VEC_LD(dct + 8 * 3); + vec_s16_t dct4v = VEC_LD(dct + 8 * 4); + vec_s16_t dct5v = VEC_LD(dct + 8 * 5); + vec_s16_t dct6v = VEC_LD(dct + 8 * 6); + vec_s16_t dct7v = VEC_LD(dct + 8 * 7); + + tmpv0 = _mm_unpacklo_epi16(dct0v, dct1v); + tmpv1 = _mm_unpacklo_epi16(dct2v, dct3v); + tmpv2 = _mm_unpacklo_epi32(tmpv0, tmpv1); + tmpv3 = _mm_shuffle2_epi8(tmpv2, dct0v, SHUF16X8( 0, 1, 9, 10, 5, 2, 3, 6)); + VEC_ST(level + 8 * 0, tmpv3); + + tmpv4 = _mm_unpacklo_epi16(dct4v, dct5v); + tmpv3 = _mm_shuffle2_epi8(tmpv0, tmpv4, SHUF16X8( 5, 6, 0, 7, 0, 0, 8, 9)); + tmpv3 = _mm_shuffle2_epi8(tmpv3, dct0v, SHUF16X8( 0, 1, 12, 3, 4, 5, 6, 7)); + tmpv3 = _mm_shuffle2_epi8(tmpv3, tmpv1, SHUF16X8( 0, 1, 2, 3, 12, 11, 6, 7)); + VEC_ST(level + 8 * 1, tmpv3); + + tmpv3 = _mm_unpackhi_epi16(dct0v, dct1v); + tmpv1 = _mm_unpackhi_epi16(tmpv1, dct2v); + tmpv5 = _mm_shuffle2_epi8(tmpv3, tmpv1, SHUF16X8( 0, 10, 12, 1, 2, 4, 3, 9)); + tmpv5 = _mm_shuffle2_epi8(tmpv5, dct4v, SHUF16X8( 9, 1, 2, 3, 4, 5, 6, 7)); + VEC_ST(level + 8 * 2, tmpv5); + + tmpv2 = _mm_unpacklo_epi16(dct5v, dct6v); + tmpv5 = _mm_unpacklo_epi16(tmpv2, dct7v); + tmpv4 = _mm_unpackhi_epi16(tmpv4, tmpv1); + tmpv0 = _mm_shuffle2_epi8(tmpv5, tmpv4, SHUF16X8(13, 8, 4, 2, 1, 6, 10, 12)); + VEC_ST(level + 8 * 3, tmpv0); + + tmpv1 = _mm_unpackhi_epi16(dct2v, dct3v); + tmpv0 = _mm_unpackhi_epi16(dct4v, dct5v); + tmpv4 = _mm_shuffle2_epi8(tmpv1, tmpv0, SHUF16X8( 1, 2, 3, 4, 8, 9, 10, 5)); + tmpv3 = _mm_shuffle2_epi8(tmpv4, tmpv3, SHUF16X8( 0, 1, 13, 14, 15, 3, 2, 4)); + VEC_ST(level + 8 * 4, tmpv3); + + tmpv3 = _mm_unpacklo_epi16(dct6v, dct7v); + tmpv2 = _mm_unpackhi_epi16(dct3v, dct4v); + tmpv2 = _mm_shuffle2_epi8(tmpv2, dct5v, SHUF16X8( 0, 11, 12, 3, 4, 5, 6, 13)); + tmpv3 = _mm_shuffle2_epi8(tmpv2, tmpv3, SHUF16X8( 1, 12, 11, 13, 14, 2, 3, 4)); + VEC_ST(level + 8 * 5, tmpv3); + + tmpv1 = _mm_unpackhi_epi16(tmpv1, tmpv2); + tmpv2 = _mm_unpackhi_epi16(dct6v, dct7v); + tmpv1 = _mm_shuffle2_epi8(tmpv1, tmpv2, SHUF16X8( 4, 5, 3, 7, 8, 0, 9, 10)); + tmpv1 = _mm_shuffle2_epi8(tmpv1, dct7v, SHUF16X8( 0, 1, 2, 3, 4, 11, 6, 7)); + VEC_ST(level + 8 * 6, tmpv1); + + tmpv2 = _mm_shuffle2_epi8(tmpv2, tmpv0, SHUF16X8(13, 14, 15, 4, 3, 5, 6, 7)); + VEC_ST(level + 8 * 7, tmpv2); +#else + __m64 d0l, d0h, d1l, d1h, d2l, d2h, d3l, d3h; + __m64 d4l, d4h, d5l, d5h, d6l, d6h, d7l, d7h; + __m64 h0, h1, h2, h3, r0, r1, c1256 = SHUF16X4(1, 2, 5, 6); + + d0l = *(__m64*)(dct + 8 * 0); d0h = *(__m64*)(dct + 8 * 0 + 4); + d1l = *(__m64*)(dct + 8 * 1); d1h = *(__m64*)(dct + 8 * 1 + 4); + d2l = *(__m64*)(dct + 8 * 2); d2h = *(__m64*)(dct + 8 * 2 + 4); + d3l = *(__m64*)(dct + 8 * 3); d3h = *(__m64*)(dct + 8 * 3 + 4); + d4l = *(__m64*)(dct + 8 * 4); d4h = *(__m64*)(dct + 8 * 4 + 4); + d5l = *(__m64*)(dct + 8 * 5); d5h = *(__m64*)(dct + 8 * 5 + 4); + d6l = *(__m64*)(dct + 8 * 6); d6h = *(__m64*)(dct + 8 * 6 + 4); + d7l = *(__m64*)(dct + 8 * 7); d7h = *(__m64*)(dct + 8 * 7 + 4); + + // d0l[0] d1l[0] d0l[1] d0l[2] + // d1l[1] d2l[0] d3l[0] d2l[1] + h0 = _mm_unpacklo_pi16(d0l, d1l); + h2 = 
_mm_unpacklo_pi32(d1l, d2l); + h3 = _mm_unpacklo_pi16(d2l, d3l); + r0 = INSFI_M64(d0l, -16, h0, 48); + r1 = _mm_shuffle2_pi8(h2, h3, c1256); + *(__m64*)(level + 8 * 0) = r0; + *(__m64*)(level + 8 * 0 + 4) = r1; + + // d1l[2] d0l[3] d0h[0] d1l[3] + // d2l[2] d3l[1] d4l[0] d5l[0] + h0 = INSFI_M64(d0l, 0, d1l, 48); + h1 = INSFI_M64(d1l, 32, d0h, 16); + h2 = _mm_unpacklo_pi16(d4l, d5l); + h3 = INSFI_M64(d3l, -32, d2l, 48); + r0 = _mm_alignr_pi8(h1, h0, 4); + r1 = _mm_alignr_pi8(h2, h3, 4); + *(__m64*)(level + 8 * 1) = r0; + *(__m64*)(level + 8 * 1 + 4) = r1; + + // d4l[1] d3l[2] d2l[3] d1h[0] + // d0h[1] d0h[2] d1h[1] d2h[0] + h0 = INSFI_M64(d3l, 0, d4l, 32); + h1 = _mm_alignr_pi8(d1h, d2l, 4); + h2 = _mm_unpacklo_pi32(d1h, d2h); + r0 = _mm_shuffle2_pi8(h0, h1, c1256); + r1 = _mm_shuffle2_pi8(d0h, h2, c1256); + *(__m64*)(level + 8 * 2) = r0; + *(__m64*)(level + 8 * 2 + 4) = r1; + + // d3l[3] d4l[2] d5l[1] d6l[0] + // d7l[0] d6l[1] d5l[2] d4l[3] + h0 = _mm_unpacklo_pi32(d5l, d6l); + h1 = _mm_unpackhi_pi32(d3l, d4l); + h2 = _mm_unpackhi_pi16(d4l, d5l); + h3 = INSFI_M64(d6l, 0, d7l, 16); + r0 = _mm_shuffle2_pi8(h1, h0, c1256); + r1 = INSFI_M64(h2, -16, h3, 32); + *(__m64*)(level + 8 * 3) = r0; + *(__m64*)(level + 8 * 3 + 4) = r1; + + // d3h[0] d2h[1] d1h[2] d0h[3] + // d1h[3] d2h[2] d3h[1] d4h[0] + h0 = _mm_unpackhi_pi16(d0h, d1h); + h1 = INSFI_M64(d2h, 0, d3h, 16); + h2 = _mm_alignr_pi8(d3h, d1h, 4); + h3 = _mm_alignr_pi8(d4h, d2h, 4); +// h2 = _mm_unpackhi_pi32(d1h, d2h); +// h3 = _mm_unpacklo_pi32(d3h, d4h); + r0 = INSFI_M64(h0, -16, h1, 32); + r1 = _mm_shuffle2_pi8(h2, h3, SHUF16X4(1, 4, 3, 6)); +// r1 = _mm_shuffle2_pi8(h2, h3, c1256); + *(__m64*)(level + 8 * 4) = r0; + *(__m64*)(level + 8 * 4 + 4) = r1; + + // d5l[3] d6l[2] d7l[1] d7l[2] + // d6l[3] d5h[0] d4h[1] d3h[2] + h0 = _mm_unpackhi_pi32(d5l, d6l); + h1 = INSFI_M64(d3h, 0, d4h, 32); + h2 = _mm_alignr_pi8(d5h, d6l, 6); + r0 = _mm_shuffle2_pi8(h0, d7l, c1256); + r1 = INSFI_M64(h1, -16, h2, 32); + *(__m64*)(level + 8 * 5) = r0; + *(__m64*)(level + 8 * 5 + 4) = r1; + + // d2h[3] d3h[3] d4h[2] d5h[1] + // d6h[0] d7l[3] d7h[0] d6h[1] + h0 = _mm_unpackhi_pi16(d2h, d3h); + h1 = INSFI_M64(d5h, -32, d4h, 48); + h2 = _mm_unpacklo_pi16(d6h, d7h); + r0 = _mm_unpackhi_pi32(h0, h1); + r1 = _mm_shuffle2_pi8(h2, d7l, SHUF16X4(0, 7, 1, 2)); + *(__m64*)(level + 8 * 6) = r0; + *(__m64*)(level + 8 * 6 + 4) = r1; + + // d5h[2] d4h[3] d5h[3] d6h[2] + // d7h[1] d7h[2] d6h[3] d7h[3] + h0 = _mm_srli_si64(_mm_unpackhi_pi16(d4h, d5h), 16); + r0 = INSFI_M64(d6h, -16, h0, 48); + r1 = _mm_shuffle2_pi8(d7h, d6h, SHUF16X4(1, 2, 7, 3)); + *(__m64*)(level + 8 * 7) = r0; + *(__m64*)(level + 8 * 7 + 4) = r1; +#endif +} + +void x264_zigzag_interleave_8x8_cavlc_e2k(int16_t *dst, int16_t *src, uint8_t *nnz) { + vec_s16_t merge0, merge1; + vec_s16_t permv0, permv1, permv2; + vec_s16_t src0v = VEC_LD(src + 8 * 0); + vec_s16_t src1v = VEC_LD(src + 8 * 1); + vec_s16_t src2v = VEC_LD(src + 8 * 2); + vec_s16_t src3v = VEC_LD(src + 8 * 3); + vec_s16_t src4v = VEC_LD(src + 8 * 4); + vec_s16_t src5v = VEC_LD(src + 8 * 5); + vec_s16_t src6v = VEC_LD(src + 8 * 6); + vec_s16_t src7v = VEC_LD(src + 8 * 7); + + vec_s16_t tmpv0 = _mm_unpacklo_epi16(src0v, src1v); + vec_s16_t tmpv1 = _mm_unpackhi_epi16(src0v, src1v); + vec_s16_t tmpv2 = _mm_unpacklo_epi16(src2v, src3v); + vec_s16_t tmpv3 = _mm_unpackhi_epi16(src2v, src3v); + vec_s16_t tmpv4 = _mm_unpacklo_epi16(src4v, src5v); + vec_s16_t tmpv5 = _mm_unpackhi_epi16(src4v, src5v); + vec_s16_t tmpv6 = _mm_unpacklo_epi16(src6v, src7v); + 
vec_s16_t tmpv7 = _mm_unpackhi_epi16(src6v, src7v); + + merge0 = _mm_unpacklo_epi16(tmpv0, tmpv1); + merge1 = _mm_unpacklo_epi16(tmpv2, tmpv3); + permv0 = _mm_unpacklo_epi64(merge0, merge1); + permv1 = _mm_unpackhi_epi64(merge0, merge1); + VEC_ST(dst + 8 * 0, permv0); + + merge0 = _mm_unpacklo_epi16(tmpv4, tmpv5); + merge1 = _mm_unpacklo_epi16(tmpv6, tmpv7); + permv0 = _mm_unpacklo_epi64(merge0, merge1); + permv2 = _mm_unpackhi_epi64(merge0, merge1); + VEC_ST(dst + 8 * 1, permv0); + VEC_ST(dst + 8 * 2, permv1); + VEC_ST(dst + 8 * 3, permv2); + + merge0 = _mm_unpackhi_epi16(tmpv0, tmpv1); + merge1 = _mm_unpackhi_epi16(tmpv2, tmpv3); + permv0 = _mm_unpacklo_epi64(merge0, merge1); + permv1 = _mm_unpackhi_epi64(merge0, merge1); + VEC_ST(dst + 8 * 4, permv0); + + merge0 = _mm_unpackhi_epi16(tmpv4, tmpv5); + merge1 = _mm_unpackhi_epi16(tmpv6, tmpv7); + permv0 = _mm_unpacklo_epi64(merge0, merge1); + permv2 = _mm_unpackhi_epi64(merge0, merge1); + VEC_ST(dst + 8 * 5, permv0); + VEC_ST(dst + 8 * 6, permv1); + VEC_ST(dst + 8 * 7, permv2); + + permv0 = _mm_or_si128(_mm_or_si128(src0v, src1v), _mm_or_si128(src2v, src3v)); + permv1 = _mm_or_si128(_mm_or_si128(src4v, src5v), _mm_or_si128(src6v, src7v)); + permv0 = _mm_or_si128(permv0, permv1); + + __m64 h0, h1; + h0 = _mm_movepi64_pi64(permv0); + h1 = _mm_movepi64_pi64(_mm_unpackhi_epi64(permv0, permv0)); + h0 = _mm_or_si64(h0, h1); + + h0 = _mm_packs_pi16(h0, h0); + h0 = _mm_cmpeq_pi8(h0, _mm_setzero_si64()); + h0 = _mm_sub_pi8(h0, _mm_set1_pi8(-1)); /* ff 00 -> 00 01 */ + *(uint16_t*)nnz = _mm_extract_pi16(h0, 0); + *(uint16_t*)(nnz + 8) = _mm_extract_pi16(h0, 1); +} + +#endif // !HIGH_BIT_DEPTH + diff --git a/common/e2k/dct.h b/common/e2k/dct.h new file mode 100644 index 0000000..bdf682a --- /dev/null +++ b/common/e2k/dct.h @@ -0,0 +1,69 @@ +/***************************************************************************** + * dct.h: e2k transform and zigzag + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/
+
+#ifndef X264_E2K_DCT_H
+#define X264_E2K_DCT_H
+
+#define x264_sub4x4_dct_e2k x264_template(sub4x4_dct_e2k)
+void x264_sub4x4_dct_e2k(int16_t dct[16], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub8x8_dct_e2k x264_template(sub8x8_dct_e2k)
+void x264_sub8x8_dct_e2k(int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub16x16_dct_e2k x264_template(sub16x16_dct_e2k)
+void x264_sub16x16_dct_e2k(int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2);
+
+#define x264_add8x8_idct_dc_e2k x264_template(add8x8_idct_dc_e2k)
+void x264_add8x8_idct_dc_e2k(uint8_t *p_dst, int16_t dct[4]);
+#define x264_add16x16_idct_dc_e2k x264_template(add16x16_idct_dc_e2k)
+void x264_add16x16_idct_dc_e2k(uint8_t *p_dst, int16_t dct[16]);
+
+#define x264_add4x4_idct_e2k x264_template(add4x4_idct_e2k)
+void x264_add4x4_idct_e2k(uint8_t *p_dst, int16_t dct[16]);
+#define x264_add8x8_idct_e2k x264_template(add8x8_idct_e2k)
+void x264_add8x8_idct_e2k(uint8_t *p_dst, int16_t dct[4][16]);
+#define x264_add16x16_idct_e2k x264_template(add16x16_idct_e2k)
+void x264_add16x16_idct_e2k(uint8_t *p_dst, int16_t dct[16][16]);
+
+#define x264_sub8x8_dct_dc_e2k x264_template(sub8x8_dct_dc_e2k)
+void x264_sub8x8_dct_dc_e2k(int16_t dct[4], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub8x16_dct_dc_e2k x264_template(sub8x16_dct_dc_e2k)
+void x264_sub8x16_dct_dc_e2k(int16_t dct[8], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub8x8_dct8_e2k x264_template(sub8x8_dct8_e2k)
+void x264_sub8x8_dct8_e2k(int16_t dct[64], uint8_t *pix1, uint8_t *pix2);
+#define x264_sub16x16_dct8_e2k x264_template(sub16x16_dct8_e2k)
+void x264_sub16x16_dct8_e2k(int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2);
+
+#define x264_add8x8_idct8_e2k x264_template(add8x8_idct8_e2k)
+void x264_add8x8_idct8_e2k(uint8_t *dst, int16_t dct[64]);
+#define x264_add16x16_idct8_e2k x264_template(add16x16_idct8_e2k)
+void x264_add16x16_idct8_e2k(uint8_t *dst, int16_t dct[4][64]);
+
+#define x264_zigzag_scan_4x4_frame_e2k x264_template(zigzag_scan_4x4_frame_e2k)
+void x264_zigzag_scan_4x4_frame_e2k(int16_t level[16], int16_t dct[16]);
+#define x264_zigzag_scan_4x4_field_e2k x264_template(zigzag_scan_4x4_field_e2k)
+void x264_zigzag_scan_4x4_field_e2k(int16_t level[16], int16_t dct[16]);
+#define x264_zigzag_scan_8x8_frame_e2k x264_template(zigzag_scan_8x8_frame_e2k)
+void x264_zigzag_scan_8x8_frame_e2k(int16_t level[64], int16_t dct[64]);
+#define x264_zigzag_interleave_8x8_cavlc_e2k x264_template(zigzag_interleave_8x8_cavlc_e2k)
+void x264_zigzag_interleave_8x8_cavlc_e2k(int16_t *dst, int16_t *src, uint8_t *nnz);
+
+#endif
diff --git a/common/e2k/deblock.c b/common/e2k/deblock.c
new file mode 100644
index 0000000..349578f
--- /dev/null
+++ b/common/e2k/deblock.c
@@ -0,0 +1,236 @@
+/*****************************************************************************
+ * deblock.c: e2k deblocking
+ *****************************************************************************
+ * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd
+ * Copyright (C) 2007-2017 x264 project
+ *
+ * Authors: Guillaume Poirier
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "e2kcommon.h" +#include "deblock.h" + +#if !HIGH_BIT_DEPTH +#define transpose4x16(r0, r1, r2, r3) { \ + vec_u8_t r4, r5, r6, r7; \ + \ + r4 = _mm_unpacklo_epi8(r0, r2); /*0, 2 set 0*/ \ + r5 = _mm_unpackhi_epi8(r0, r2); /*0, 2 set 1*/ \ + r6 = _mm_unpacklo_epi8(r1, r3); /*1, 3 set 0*/ \ + r7 = _mm_unpackhi_epi8(r1, r3); /*1, 3 set 1*/ \ + \ + r0 = _mm_unpacklo_epi8(r4, r6); /*all set 0*/ \ + r1 = _mm_unpackhi_epi8(r4, r6); /*all set 1*/ \ + r2 = _mm_unpacklo_epi8(r5, r7); /*all set 2*/ \ + r3 = _mm_unpackhi_epi8(r5, r7); /*all set 3*/ \ +} + +#define WRITE4(i, j) ((uint32_t*)dst)[(i * 4 + j) * (dst_stride >> 2)] = _mm_extract_epi32(r##i, j); + +static ALWAYS_INLINE void write16x4(uint8_t *dst, int dst_stride, + vec_u8_t r0, vec_u8_t r1, + vec_u8_t r2, vec_u8_t r3) { + + WRITE4(0, 0) WRITE4(0, 1) WRITE4(0, 2) WRITE4(0, 3) + WRITE4(1, 0) WRITE4(1, 1) WRITE4(1, 2) WRITE4(1, 3) + WRITE4(2, 0) WRITE4(2, 1) WRITE4(2, 2) WRITE4(2, 3) + WRITE4(3, 0) WRITE4(3, 1) WRITE4(3, 2) WRITE4(3, 3) +} + +/** \brief performs a 6x16 transpose of data in src, and stores it to dst */ +#define read_and_transpose16x6(src, st, r8, r9, r10, r11, r12, r13) { \ + vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15; \ + r0 = VEC_LD8(src); \ + r1 = VEC_LD8(src + st); \ + r2 = VEC_LD8(src + st * 2); \ + r3 = VEC_LD8(src + st * 3); \ + r4 = VEC_LD8(src + st * 4); \ + r5 = VEC_LD8(src + st * 5); \ + r6 = VEC_LD8(src + st * 6); \ + r7 = VEC_LD8(src + st * 7); \ + r8 = VEC_LD8(src + st * 8); \ + r9 = VEC_LD8(src + st * 9); \ + r10 = VEC_LD8(src + st * 10); \ + r11 = VEC_LD8(src + st * 11); \ + r12 = VEC_LD8(src + st * 12); \ + r13 = VEC_LD8(src + st * 13); \ + r14 = VEC_LD8(src + st * 14); \ + r15 = VEC_LD8(src + st * 15); \ + \ + /*Merge first pairs*/ \ + r0 = _mm_unpacklo_epi8(r0, r4); /* 0, 4 */ \ + r1 = _mm_unpacklo_epi8(r1, r5); /* 1, 5 */ \ + r2 = _mm_unpacklo_epi8(r2, r6); /* 2, 6 */ \ + r3 = _mm_unpacklo_epi8(r3, r7); /* 3, 7 */ \ + r4 = _mm_unpacklo_epi8(r8, r12); /* 8,12 */ \ + r5 = _mm_unpacklo_epi8(r9, r13); /* 9,13 */ \ + r6 = _mm_unpacklo_epi8(r10, r14); /* 10,14 */ \ + r7 = _mm_unpacklo_epi8(r11, r15); /* 11,15 */ \ + \ + /*Merge second pairs*/ \ + r8 = _mm_unpacklo_epi8(r0, r2); /* 0, 2, 4, 6 set 0 */ \ + r9 = _mm_unpackhi_epi8(r0, r2); /* 0, 2, 4, 6 set 1 */ \ + r10 = _mm_unpacklo_epi8(r1, r3); /* 1, 3, 5, 7 set 0 */ \ + r11 = _mm_unpackhi_epi8(r1, r3); /* 1, 3, 5, 7 set 1 */ \ + r12 = _mm_unpacklo_epi8(r4, r6); /* 8,10,12,14 set 0 */ \ + r13 = _mm_unpackhi_epi8(r4, r6); /* 8,10,12,14 set 1 */ \ + r14 = _mm_unpacklo_epi8(r5, r7); /* 9,11,13,15 set 0 */ \ + r15 = _mm_unpackhi_epi8(r5, r7); /* 9,11,13,15 set 1 */ \ + \ + /*Third merge*/ \ + r0 = _mm_unpacklo_epi8(r8, r10); /* 0..7 set 0 */ \ + r1 = _mm_unpackhi_epi8(r8, r10); /* 0..7 set 1 */ \ + r2 = _mm_unpacklo_epi8(r9, r11); /* 0..7 set 2 */ \ 
+ r4 = _mm_unpacklo_epi8(r12, r14); /* 8..15 set 0 */ \ + r5 = _mm_unpackhi_epi8(r12, r14); /* 8..15 set 1 */ \ + r6 = _mm_unpacklo_epi8(r13, r15); /* 8..15 set 2 */ \ + /* Don't need to compute 3 and 7*/ \ + \ + /*Final merge*/ \ + r8 = _mm_unpacklo_epi64(r0, r4); /* all set 0 */ \ + r9 = _mm_unpackhi_epi64(r0, r4); /* all set 1 */ \ + r10 = _mm_unpacklo_epi64(r1, r5); /* all set 2 */ \ + r11 = _mm_unpackhi_epi64(r1, r5); /* all set 3 */ \ + r12 = _mm_unpacklo_epi64(r2, r6); /* all set 4 */ \ + r13 = _mm_unpackhi_epi64(r2, r6); /* all set 5 */ \ + /* Don't need to compute 14 and 15*/ \ +} + +// out: o = |x-y| < a +static ALWAYS_INLINE vec_u8_t diff_lt_e2k(vec_u8_t x, vec_u8_t y, vec_u8_t a) { + vec_u8_t o = _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x)); /* |x-y| */ + return _mm_cmpgt_epi8(a, _mm_xor_si128(o, _mm_set1_epi8(-128))); +} + +static ALWAYS_INLINE vec_u8_t h264_deblock_mask(vec_u8_t p0, vec_u8_t p1, vec_u8_t q0, + vec_u8_t q1, vec_u8_t alpha, vec_u8_t beta) { + vec_u8_t mask, tempmask; + + mask = diff_lt_e2k(p0, q0, alpha); + tempmask = diff_lt_e2k(p1, p0, beta); + mask = _mm_and_si128(mask, tempmask); + tempmask = diff_lt_e2k(q1, q0, beta); + mask = _mm_and_si128(mask, tempmask); + + return mask; +} + +// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) +static ALWAYS_INLINE vec_u8_t h264_deblock_q1(vec_u8_t p0, vec_u8_t p1, vec_u8_t p2, + vec_u8_t q0, vec_u8_t tc0) { + + vec_u8_t average = _mm_avg_epu8(p0, q0); + vec_u8_t temp, uncliped; + vec_u8_t ones = _mm_set1_epi8(1), maxv, minv; + + temp = _mm_xor_si128(average, p2); + average = _mm_avg_epu8(average, p2); /* avg(p2, avg(p0, q0)) */ + temp = _mm_and_si128(temp, ones); /* (p2^avg(p0, q0)) & 1 */ + uncliped = _mm_subs_epu8(average, temp); /* (p2+((p0+q0+1)>>1))>>1 */ + maxv = _mm_adds_epu8(p1, tc0); + minv = _mm_subs_epu8(p1, tc0); + return _mm_min_epu8(maxv, _mm_max_epu8(minv, uncliped)); +} + +#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ + vec_u8_t pq0bit = _mm_xor_si128(p0, q0); \ + vec_u8_t q1minus, p0minus, stage1, stage2, c127 = _mm_set1_epi8(127); \ + vec_u8_t vec160 = _mm_set1_epi8(160), delta, deltaneg, notv = _mm_set1_epi8(-1); \ + \ + q1minus = _mm_xor_si128(q1, notv); /* 255 - q1 */ \ + stage1 = _mm_avg_epu8(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ + stage2 = _mm_srli_epi16(stage1, 1); \ + stage2 = _mm_and_si128(stage2, c127); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ + p0minus = _mm_xor_si128(p0, notv); /* 255 - p0 */ \ + stage1 = _mm_avg_epu8(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ + pq0bit = _mm_and_si128(pq0bit, _mm_set1_epi8(1)); \ + stage2 = _mm_avg_epu8(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \ + stage2 = _mm_adds_epu8(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \ + deltaneg = _mm_subs_epu8(vec160, stage2); /* -d */ \ + delta = _mm_subs_epu8(stage2, vec160); /* d */ \ + deltaneg = _mm_min_epu8(tc0masked, deltaneg); \ + delta = _mm_min_epu8(tc0masked, delta); \ + p0 = _mm_subs_epu8(p0, deltaneg); \ + q0 = _mm_subs_epu8(q0, delta); \ + p0 = _mm_adds_epu8(p0, delta); \ + q0 = _mm_adds_epu8(q0, deltaneg); \ +} + +#define h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ + vec_u8_t alphavec, betavec, mask, p1mask, q1mask; \ + vec_s8_t tc0vec; \ + vec_u8_t finaltc0, tc0masked, newp1, newq1; \ + \ + betavec = _mm_set1_epi8(beta - 128); \ + alphavec = _mm_set1_epi8(alpha - 128); \ + mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /* if in block */ \ + \ + tc0vec = 
_mm_cvtsi32_si128(*(uint32_t*)tc0); \ + tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); \ + tc0vec = _mm_unpacklo_epi8(tc0vec, tc0vec); \ + mask = _mm_blendv_epi8(mask, _mm_setzero_si128(), tc0vec); \ + finaltc0 = _mm_and_si128(tc0vec, mask); /* if (tc0[i] >= 0) tc = tc0 */ \ + \ + p1mask = diff_lt_e2k(p2, p0, betavec); \ + p1mask = _mm_and_si128(p1mask, mask); /* if(|p2 - p0| < beta) */ \ + tc0masked = _mm_and_si128(p1mask, tc0vec); \ + finaltc0 = _mm_sub_epi8(finaltc0, p1mask); /* tc++ */ \ + newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ + /*end if*/ \ + \ + q1mask = diff_lt_e2k(q2, q0, betavec); \ + q1mask = _mm_and_si128(q1mask, mask); /* if(|q2 - q0| < beta) */ \ + tc0masked = _mm_and_si128(q1mask, tc0vec); \ + finaltc0 = _mm_sub_epi8(finaltc0, q1mask); /* tc++ */ \ + newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ + /*end if*/ \ + \ + h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ + p1 = newp1; \ + q1 = newq1; \ +} + +void x264_deblock_v_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0) { + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { + vec_u8_t p2, p1, p0, q0, q1, q2; + p2 = VEC_LD(pix - 3 * stride); + p1 = VEC_LD(pix - 2 * stride); + p0 = VEC_LD(pix - stride); + q0 = VEC_LD(pix); + q1 = VEC_LD(pix + stride); + q2 = VEC_LD(pix + 2 * stride); + h264_loop_filter_luma_e2k(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); + VEC_ST(pix - 2 * stride, p1); + VEC_ST(pix - 1 * stride, p0); + VEC_ST(pix, q0); + VEC_ST(pix + stride, q1); + } +} + +void x264_deblock_h_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0) { + vec_u8_t line0, line1, line2, line3, line4, line5; + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) return; + read_and_transpose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); + h264_loop_filter_luma_e2k(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); + transpose4x16(line1, line2, line3, line4); + write16x4(pix-2, stride, line1, line2, line3, line4); +} +#endif // !HIGH_BIT_DEPTH diff --git a/common/e2k/deblock.h b/common/e2k/deblock.h new file mode 100644 index 0000000..87302e2 --- /dev/null +++ b/common/e2k/deblock.h @@ -0,0 +1,32 @@ +/***************************************************************************** + * deblock.h: e2k deblocking + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_E2K_DEBLOCK_H +#define X264_E2K_DEBLOCK_H + +#define x264_deblock_v_luma_e2k x264_template(deblock_v_luma_e2k) +void x264_deblock_v_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0); +#define x264_deblock_h_luma_e2k x264_template(deblock_h_luma_e2k) +void x264_deblock_h_luma_e2k(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0); + +#endif diff --git a/common/e2k/e2kcommon.h b/common/e2k/e2kcommon.h new file mode 100644 index 0000000..83d0a6c --- /dev/null +++ b/common/e2k/e2kcommon.h @@ -0,0 +1,253 @@ +/***************************************************************************** + * e2kcommon.h: e2k utility macros + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * Copyright (C) 2003-2017 x264 project + * + * Authors: Eric Petit + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/
+
+#ifndef X264_E2K_E2KCOMMON_H
+#define X264_E2K_E2KCOMMON_H
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#ifdef ALWAYS_INLINE
+#undef ALWAYS_INLINE
+#endif
+#define ALWAYS_INLINE __attribute__((__always_inline__)) inline
+
+#ifdef __e2k__
+#define PRAGMA_E2K _Pragma
+#define _mm_shuffle2_pi8(a, b, c) \
+ ((__m64)__builtin_e2k_pshufb((uint64_t)(b), (uint64_t)(a), (uint64_t)(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+ ((__m128i)__builtin_e2k_qppermb((__v2di)(b), (__v2di)(a), (__v2di)(c)))
+#define _mm_blendv_pi8(a, b, c) \
+ ((__m64)__builtin_e2k_pmerge((uint64_t)(a), (uint64_t)(b), (uint64_t)(c)))
+#else
+#define PRAGMA_E2K(x)
+#define _mm_shuffle2_pi8(a, b, c) \
+ _mm_movepi64_pi64(_mm_shuffle_epi8(_mm_unpacklo_epi64( \
+ _mm_movpi64_epi64(a), _mm_movpi64_epi64(b)), _mm_movpi64_epi64(c)))
+#define _mm_shuffle2_epi8(a, b, c) \
+ _mm_blendv_epi8(_mm_shuffle_epi8(a, c), _mm_shuffle_epi8(b, c), \
+ _mm_slli_epi16(c, 3))
+#define _mm_blendv_pi8(a, b, c) \
+ _mm_movepi64_pi64(_mm_blendv_epi8(_mm_movpi64_epi64(a), \
+ _mm_movpi64_epi64(b), _mm_movpi64_epi64(c)))
+
+static ALWAYS_INLINE uint64_t __builtin_e2k_insfd(uint64_t a, uint64_t b, uint64_t c) {
+ int n = b & 63;
+ a = a >> n | a << (64 - n);
+ return c ^ ((a ^ c) & (~0ll << (b >> 6 & 63)));
+}
+#endif
+
+#define _mm_extract_pi32(a, b) _mm_extract_epi32(_mm_movpi64_epi64(a), b)
+#define VEC_ALIGNR8(a, b) _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), 1))
+
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8_t __m128i
+#define vec_s8_t __m128i
+#define vec_u16_t __m128i
+#define vec_s16_t __m128i
+#define vec_u32_t __m128i
+#define vec_s32_t __m128i
+#define vec_u64_t __m128i
+#define vec_s64_t __m128i
+
+/***********************************************************************
+ * Null vector
+ **********************************************************************/
+#define LOAD_ZERO const vec_u8_t zerov = _mm_setzero_si128()
+
+/***********************************************************************
+ * 8 <-> 16 bits conversions
+ **********************************************************************/
+#define vec_u8_to_u16_h(v) _mm_unpacklo_epi8(v, zerov)
+#define vec_u8_to_u16_l(v) _mm_unpackhi_epi8(v, zerov)
+#define vec_u8_to_s16_h(v) _mm_unpacklo_epi8(v, zerov)
+#define vec_u8_to_s16_l(v) _mm_unpackhi_epi8(v, zerov)
+
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
+
+/***********************************************************************
+ * 16 <-> 32 bits conversions
+ **********************************************************************/
+#define vec_u16_to_u32_h(v) _mm_unpacklo_epi16(v, zerov)
+#define vec_u16_to_u32_l(v) _mm_unpackhi_epi16(v, zerov)
+#define vec_u16_to_s32_h(v) _mm_unpacklo_epi16(v, zerov)
+#define vec_u16_to_s32_l(v) _mm_unpackhi_epi16(v, zerov)
+
+#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
+#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
+
+/***********************************************************************
+ * VEC_TRANSPOSE_8
+ ***********************************************************************
+ * Transposes a 8x8 matrix of s16 vectors
+ **********************************************************************/
+#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
+ b0 = _mm_unpacklo_epi16(a0, a2); \
+ b1 = 
_mm_unpackhi_epi16(a0, a2); \ + b2 = _mm_unpacklo_epi16(a1, a3); \ + b3 = _mm_unpackhi_epi16(a1, a3); \ + b4 = _mm_unpacklo_epi16(a4, a6); \ + b5 = _mm_unpackhi_epi16(a4, a6); \ + b6 = _mm_unpacklo_epi16(a5, a7); \ + b7 = _mm_unpackhi_epi16(a5, a7); \ + a0 = _mm_unpacklo_epi16(b0, b2); \ + a1 = _mm_unpackhi_epi16(b0, b2); \ + a2 = _mm_unpacklo_epi16(b1, b3); \ + a3 = _mm_unpackhi_epi16(b1, b3); \ + a4 = _mm_unpacklo_epi16(b4, b6); \ + a5 = _mm_unpackhi_epi16(b4, b6); \ + a6 = _mm_unpacklo_epi16(b5, b7); \ + a7 = _mm_unpackhi_epi16(b5, b7); \ + b0 = _mm_unpacklo_epi64(a0, a4); \ + b1 = _mm_unpackhi_epi64(a0, a4); \ + b2 = _mm_unpacklo_epi64(a1, a5); \ + b3 = _mm_unpackhi_epi64(a1, a5); \ + b4 = _mm_unpacklo_epi64(a2, a6); \ + b5 = _mm_unpackhi_epi64(a2, a6); \ + b6 = _mm_unpacklo_epi64(a3, a7); \ + b7 = _mm_unpackhi_epi64(a3, a7) + +/*********************************************************************** + * VEC_TRANSPOSE_4 + *********************************************************************** + * Transposes a 4x4 matrix of s16 vectors. + * Actually source and destination are 8x4. The low elements of the + * source are discarded and the low elements of the destination mustn't + * be used. + **********************************************************************/ +#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ + b0 = _mm_unpacklo_epi16(a0, a2); \ + b1 = _mm_unpacklo_epi16(a1, a3); \ + a0 = _mm_unpacklo_epi16(b0, b1); \ + a1 = _mm_unpackhi_epi16(b0, b1); \ + b0 = _mm_unpacklo_epi64(a0, a0); \ + b1 = _mm_unpackhi_epi64(a0, a0); \ + b2 = _mm_unpacklo_epi64(a1, a1); \ + b3 = _mm_unpackhi_epi64(a1, a1) + +/*********************************************************************** + * VEC_DIFF_HL + *********************************************************************** + * p1, p2: u8 * + * i1, i2: int + * dh, dl: s16v + * + * Loads 16 bytes from p1 and p2, do the diff of the high elements into + * dh, the diff of the low elements into dl, increments p1 and p2 by i1 + * and i2 + **********************************************************************/ +#define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl) \ + pix1v = VEC_LD(p1); \ + pix2v = VEC_LD(p2); \ + temp0v = vec_u8_to_s16_h(pix1v); \ + temp1v = vec_u8_to_s16_l(pix1v); \ + temp2v = vec_u8_to_s16_h(pix2v); \ + temp3v = vec_u8_to_s16_l(pix2v); \ + dh = _mm_sub_epi16(temp0v, temp2v); \ + dl = _mm_sub_epi16(temp1v, temp3v); \ + p1 += i1; \ + p2 += i2 + +/*********************************************************************** +* VEC_DIFF_H_8BYTE_ALIGNED +*********************************************************************** +* p1, p2: u8 * +* i1, i2, n: int +* d: s16v +* +* Loads n bytes from p1 and p2, do the diff of the high elements into +* d, increments p1 and p2 by i1 and i2 +* Slightly faster when we know we are loading/diffing 8bytes which +* are 8 byte aligned. 
Reduces need for two loads and two vec_lvsl()'s +**********************************************************************/ +#define PREP_DIFF_8BYTEALIGNED \ +LOAD_ZERO; \ +vec_s16_t pix1v, pix2v; \ +vec_u8_t pix1v8, pix2v8; \ + +#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d) \ +pix1v8 = VEC_LD8(p1); \ +pix2v8 = VEC_LD8(p2); \ +pix1v = vec_u8_to_s16(pix1v8); \ +pix2v = vec_u8_to_s16(pix2v8); \ +d = _mm_sub_epi16(pix1v, pix2v); \ +p1 += i1; \ +p2 += i2; + +#define VEC_LD(a) _mm_loadu_si128((const __m128i*)(a)) +#define VEC_ST(a, b) _mm_storeu_si128((__m128i*)(a), b) +#define VEC_LD8(a) _mm_loadl_epi64((const __m128i*)(a)) +#define VEC_STL(a, b) _mm_storel_epi64((__m128i*)(a), b) +#define VEC_STH(a, b) _mm_storeh_pd((double*)(a), _mm_castsi128_pd(b)); + +#define VEC_SPLAT16(v, i) _mm_shuffle_epi8(v, _mm_set1_epi16((i) * 2 | ((i) * 2 + 1) << 8)) + +#if !defined(__iset__) || __iset__ < 5 +#define NEED_ALIGN8 +#define ALIGN8_COMMON uint64_t src_shr; __m64 src_tmp0, __attribute__((unused)) src_tmp1; +#define ALIGN8_VARS(src) __m64 *src##_ptr, src##_next, src##_index; +#define ALIGN8_START(ptr, src) \ + src_shr = (intptr_t)(ptr - 1) & 7; \ + src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ + src##_next = src##_ptr[src_shr == 7]; \ + src##_index = _mm_add_pi8(_mm_set1_pi8(src_shr), \ + _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); +#define ALIGN8_READ8(v0, src, i) \ + src_tmp0 = src##_ptr[i + 1]; \ + v0 = _mm_shuffle2_pi8(src##_next, src_tmp0, src##_index); \ + src##_next = src_tmp0; +#define ALIGN8_READ16(v0, src, i) \ + src_tmp1 = src##_ptr[i * 2 + 1]; \ + src_tmp0 = _mm_shuffle2_pi8(src##_next, src_tmp1, src##_index); \ + src##_next = src##_ptr[i * 2 + 2]; \ + src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_next, src##_index); \ + v0 = _mm_setr_epi64(src_tmp0, src_tmp1); + +#define ALIGN8A_COMMON __m64 src_tmp0, src_tmp1; +#define ALIGN8A_VARS(src) uint64_t src##_shr; __m64 *src##_ptr, src##_index; +#define ALIGN8A_START(ptr, src) \ + src##_shr = (intptr_t)(ptr - 1) & 7; \ + src##_ptr = (__m64*)((intptr_t)(ptr - 1) & -8); \ + src##_index = _mm_add_pi8(_mm_set1_pi8(src##_shr), \ + _mm_setr_pi8(1, 2, 3, 4, 5, 6, 7, 8)); \ + src##_shr = src##_shr == 7; +#define ALIGN8A_READ8(v0, src) \ + src_tmp0 = src##_ptr[src##_shr]; \ + src_tmp1 = src##_ptr[1]; \ + v0 = _mm_shuffle2_pi8(src_tmp0, src_tmp1, src##_index); +#define ALIGN8A_READ16(v0, src) \ + src_tmp0 = src##_ptr[src##_shr]; \ + src_tmp1 = src##_ptr[1]; \ + src_tmp0 = _mm_shuffle2_pi8(src_tmp0, src_tmp1, src##_index); \ + src_tmp1 = _mm_shuffle2_pi8(src_tmp1, src##_ptr[2], src##_index); \ + v0 = _mm_setr_epi64(src_tmp0, src_tmp1); +#endif + +#endif // X264_E2K_E2KCOMMON_H diff --git a/common/e2k/mc.c b/common/e2k/mc.c new file mode 100644 index 0000000..430b705 --- /dev/null +++ b/common/e2k/mc.c @@ -0,0 +1,1401 @@ +/***************************************************************************** + * mc.c: e2k motion compensation + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * Copyright (C) 2003-2017 x264 project + * + * Authors: Eric Petit + * Guillaume Poirier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "e2kcommon.h" +#include "mc.h" + +#if !HIGH_BIT_DEPTH +static ALWAYS_INLINE void pixel_avg2_w4_e2k(uint8_t * restrict dst, intptr_t i_dst, + uint8_t * restrict src1, intptr_t i_src1, + uint8_t * restrict src2, intptr_t i_src2, int i_height) { + int y; + __m64 v0, v1; + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst) { + v0 = _mm_cvtsi32_si64(*(uint32_t*)src1); src1 += i_src1; + v1 = _mm_cvtsi32_si64(*(uint32_t*)src2); src2 += i_src2; + *(uint32_t*)dst = _mm_cvtsi64_si32(_mm_avg_pu8(v0, v1)); + } +} + +static ALWAYS_INLINE void pixel_avg2_w8_e2k(uint8_t * restrict dst, intptr_t i_dst, + uint8_t * restrict src1, intptr_t i_src1, + uint8_t * restrict src2, intptr_t i_src2, int i_height) { + int y; + __m64 v0, v1; + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst) { + v0 = *(__m64*)src1; src1 += i_src1; + v1 = *(__m64*)src2; src2 += i_src2; + *(__m64*)dst = _mm_avg_pu8(v0, v1); + } +} + +static ALWAYS_INLINE void pixel_avg2_w16_e2k(uint8_t * restrict dst, intptr_t i_dst, + uint8_t * restrict src1, intptr_t i_src1, + uint8_t * restrict src2, intptr_t i_src2, int i_height) { + int y; + __m128i v0, v1; + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst) { + v0 = VEC_LD(src1); src1 += i_src1; + v1 = VEC_LD(src2); src2 += i_src2; + VEC_ST(dst, _mm_avg_epu8(v0, v1)); + } +} + +static ALWAYS_INLINE void pixel_avg2_w20_e2k(uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, + uint8_t *src2, intptr_t i_src2, int i_height) { + int y; + __m128i v0, v1; __m64 h0, h1; + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst) { + v0 = VEC_LD(src1); h0 = _mm_cvtsi32_si64(*(uint32_t*)(src1 + 16)); + v1 = VEC_LD(src2); h1 = _mm_cvtsi32_si64(*(uint32_t*)(src2 + 16)); + src1 += i_src1; src2 += i_src2; + + VEC_ST(dst, _mm_avg_epu8(v0, v1)); + *(uint32_t*)(dst + 16) = _mm_cvtsi64_si32(_mm_avg_pu8(h0, h1)); + } +} + +#define PIXEL_AVG_FN(wxh) \ +static void pixel_avg_##wxh##_e2k(uint8_t * restrict dst, intptr_t i_dst, \ + uint8_t * restrict src1, intptr_t i_src1, \ + uint8_t * restrict src2, intptr_t i_src2, int weight) + +#define INIT_AVG_4X2_E2K \ + __m64 v0, t0, t1, t2, t3, c0, c1; \ + c0 = _mm_set1_pi16((weight & 255) | (64 - weight) << 8); \ + c1 = _mm_set1_pi16(32); + +#define PIXEL_AVG_2X2_E2K(i) \ + t0 = _mm_cvtsi32_si64(*(uint16_t*)src1); \ + t1 = _mm_cvtsi32_si64(*(uint16_t*)(src1 + i_src1)); \ + t2 = _mm_cvtsi32_si64(*(uint16_t*)src2); \ + t3 = _mm_cvtsi32_si64(*(uint16_t*)(src2 + i_src2)); \ + src1 += i_src1 * 2; src2 += i_src2 * 2; \ + t0 = _mm_unpacklo_pi32(_mm_unpacklo_pi8(t0, t2), _mm_unpacklo_pi8(t1, t3)); \ + v##i = _mm_srai_pi16(_mm_add_pi16(_mm_maddubs_pi16(t0, c0), c1), 6); \ + +#define STORE_2X4_E2K(v0, v1) \ + v0 = _mm_packs_pu16(v0, v1); \ + *(uint16_t*)dst = _mm_cvtsi64_si32(v0); \ + 
*(uint16_t*)(dst + i_dst) = _mm_extract_pi16(v0, 1); dst += i_dst * 2; \ + *(uint16_t*)dst = _mm_extract_pi16(v0, 2); \ + *(uint16_t*)(dst + i_dst) = _mm_extract_pi16(v0, 3); dst += i_dst * 2; + +PIXEL_AVG_FN(2x2) { + INIT_AVG_4X2_E2K + PIXEL_AVG_2X2_E2K(0) + v0 = _mm_packs_pu16(v0, v0); + *(uint16_t*)dst = _mm_cvtsi64_si32(v0); + *(uint16_t*)(dst + i_dst) = _mm_extract_pi16(v0, 1); +} + +PIXEL_AVG_FN(2x4) { + __m64 v1; + INIT_AVG_4X2_E2K + PIXEL_AVG_2X2_E2K(0) + PIXEL_AVG_2X2_E2K(1) + STORE_2X4_E2K(v0, v1) +} + +PIXEL_AVG_FN(2x8) { + __m64 v1, v2, v3; + INIT_AVG_4X2_E2K + PIXEL_AVG_2X2_E2K(0) + PIXEL_AVG_2X2_E2K(1) + PIXEL_AVG_2X2_E2K(2) + PIXEL_AVG_2X2_E2K(3) + STORE_2X4_E2K(v0, v1) + STORE_2X4_E2K(v2, v3) +} + +#define PIXEL_AVG_4X2_E2K \ + t0 = _mm_cvtsi32_si64(*(uint32_t*)src1); \ + t1 = _mm_cvtsi32_si64(*(uint32_t*)(src1 + i_src1)); \ + t2 = _mm_cvtsi32_si64(*(uint32_t*)src2); \ + t3 = _mm_cvtsi32_si64(*(uint32_t*)(src2 + i_src2)); \ + src1 += i_src1 * 2; src2 += i_src2 * 2; \ + t0 = _mm_maddubs_pi16(_mm_unpacklo_pi8(t0, t2), c0); \ + t1 = _mm_maddubs_pi16(_mm_unpacklo_pi8(t1, t3), c0); \ + t0 = _mm_srai_pi16(_mm_add_pi16(t0, c1), 6); \ + t1 = _mm_srai_pi16(_mm_add_pi16(t1, c1), 6); \ + v0 = _mm_packs_pu16(t0, t1); \ + *(uint32_t*)dst = _mm_cvtsi64_si32(v0); \ + *(uint32_t*)(dst + i_dst) = _mm_extract_pi32(v0, 1); \ + dst += i_dst * 2; + +PIXEL_AVG_FN(4x2) { + INIT_AVG_4X2_E2K + PIXEL_AVG_4X2_E2K +} + +PIXEL_AVG_FN(4x4) { + INIT_AVG_4X2_E2K + PIXEL_AVG_4X2_E2K + PIXEL_AVG_4X2_E2K +} + +PIXEL_AVG_FN(4x8) { + INIT_AVG_4X2_E2K + PIXEL_AVG_4X2_E2K + PIXEL_AVG_4X2_E2K + PIXEL_AVG_4X2_E2K + PIXEL_AVG_4X2_E2K +} + +PIXEL_AVG_FN(4x16) { + int y; + INIT_AVG_4X2_E2K + PRAGMA_E2K("unroll(8)") + PRAGMA_E2K("ivdep") + for (y = 0; y < 8; y++) { + PIXEL_AVG_4X2_E2K + } +} + +#define PIXEL_AVG_8X2_E2K(i) \ + t0 = VEC_LD8(src1); t1 = VEC_LD8(src1 + i_src1); src1 += i_src1 * 2; \ + t2 = VEC_LD8(src2); t3 = VEC_LD8(src2 + i_src2); src2 += i_src2 * 2; \ + t0 = _mm_maddubs_epi16(_mm_unpacklo_epi8(t0, t2), c0); \ + t1 = _mm_maddubs_epi16(_mm_unpacklo_epi8(t1, t3), c0); \ + t0 = _mm_srai_epi16(_mm_add_epi16(t0, c1), 6); \ + t1 = _mm_srai_epi16(_mm_add_epi16(t1, c1), 6); \ + v##i = _mm_packus_epi16(t0, t1); + +static ALWAYS_INLINE void pixel_avg_w8_e2k(uint8_t * restrict dst, intptr_t i_dst, + uint8_t * restrict src1, intptr_t i_src1, + uint8_t * restrict src2, intptr_t i_src2, int i_height, int weight) { + int y; + __m128i v0, v1, t0, t1, t2, t3, c0, c1; + c0 = _mm_set1_epi16((weight & 255) | (64 - weight) << 8); + c1 = _mm_set1_epi16(32); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y += 4) { + PIXEL_AVG_8X2_E2K(0) + PIXEL_AVG_8X2_E2K(1) + VEC_STL(dst, v0); VEC_STH(dst + i_dst, v0); dst += i_dst * 2; + VEC_STL(dst, v1); VEC_STH(dst + i_dst, v1); dst += i_dst * 2; + } +} + +static ALWAYS_INLINE void pixel_avg_w16_e2k(uint8_t * restrict dst, intptr_t i_dst, + uint8_t * restrict src1, intptr_t i_src1, + uint8_t * restrict src2, intptr_t i_src2, int i_height, int weight) { + int y; + __m128i v0, v1, v4, v5, c0, c1; + c0 = _mm_set1_epi16((weight & 255) | (64 - weight) << 8); + c1 = _mm_set1_epi16(32); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst) { + v0 = VEC_LD(src1); src1 += i_src1; + v1 = VEC_LD(src2); src2 += i_src2; + v4 = _mm_maddubs_epi16(_mm_unpacklo_epi8(v0, v1), c0); + v5 = _mm_maddubs_epi16(_mm_unpackhi_epi8(v0, v1), c0); + + v4 = _mm_srai_epi16(_mm_add_epi16(v4, c1), 6); + v5 = _mm_srai_epi16(_mm_add_epi16(v5, c1), 6); + VEC_ST(dst, _mm_packus_epi16(v4, v5)); + } +} 
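+
+/* Note on the weighted kernels above (illustrative summary of what the
+ * intrinsics compute): c0 packs the weight pair (w, 64-w) into adjacent
+ * bytes so that _mm_maddubs_epi16 on the byte-interleaved sources yields
+ * src1*w + src2*(64-w) per 16-bit lane in one step; c1 holds the rounding
+ * constant 32 and the final arithmetic shift is by 6.  Scalar equivalent,
+ * per pixel:
+ *
+ *   dst[x] = ( src1[x]*weight + src2[x]*(64 - weight) + 32 ) >> 6;
+ */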
+ +#define PIXEL_AVG_E2K(w, h) \ +PIXEL_AVG_FN(w##x##h) { \ + if (weight == 32) pixel_avg2_w##w##_e2k(dst, i_dst, src1, i_src1, src2, i_src2, h); \ + else pixel_avg_w##w##_e2k(dst, i_dst, src1, i_src1, src2, i_src2, h, weight); \ +} + +PIXEL_AVG_E2K(16, 16) +PIXEL_AVG_E2K(16, 8) +PIXEL_AVG_E2K(8, 16) +PIXEL_AVG_E2K(8, 8) +PIXEL_AVG_E2K(8, 4) + +/* mc_copy: plain c */ + +#define MC_COPY(name, a, T) \ +static void name(uint8_t *dst, intptr_t i_dst, \ + uint8_t *src, intptr_t i_src, int i_height) { \ + int y; \ + PRAGMA_E2K("ivdep") \ + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { \ + *(T*)dst = *(T*)src; \ + } \ +} +MC_COPY(mc_copy_w4_e2k, 4, uint32_t) +MC_COPY(mc_copy_w8_e2k, 8, uint64_t) + +static void mc_copy_w16_e2k(uint8_t *dst, intptr_t i_dst, + uint8_t *src, intptr_t i_src, int i_height) { + int y; + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + __m128i v0 = VEC_LD(src); + VEC_ST(dst, v0); + } +} + +static void mc_copy_w16_aligned_e2k(uint8_t *dst, intptr_t i_dst, + uint8_t *src, intptr_t i_src, int i_height) { + int y; + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + __m128i v0 = VEC_LD(src); + VEC_ST(dst, v0); + } +} + +static void x264_plane_copy_swap_core_e2k(uint8_t *dst, intptr_t i_dst, + uint8_t *src, intptr_t i_src, int w, int h) { + int x, y; + __m128i mask = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, + 9, 8, 11, 10, 13, 12, 15, 14); + + for (y = 0; y < h; y++, dst += i_dst, src += i_src) + PRAGMA_E2K("ivdep") + for (x = 0; x < 2 * w; x += 16) { + __m128i v0 = VEC_LD(src + x); + v0 = _mm_shuffle_epi8(v0, mask); + VEC_ST(dst + x, v0); + } +} + +static void x264_plane_copy_interleave_core_e2k(uint8_t *dst, intptr_t i_dst, + uint8_t *srcu, intptr_t i_srcu, + uint8_t *srcv, intptr_t i_srcv, int w, int h) { + int x, y; + __m128i v0, v1, v2, v3; + + if (w <= 0) return; + + for (y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv) + PRAGMA_E2K("ivdep") + for (x = 0; x < w; x += 16) { + v0 = VEC_LD(srcu + x); + v1 = VEC_LD(srcv + x); + v2 = _mm_unpacklo_epi8(v0, v1); + v3 = _mm_unpackhi_epi8(v0, v1); + VEC_ST(dst + 2 * x, v2); + VEC_ST(dst + 2 * x + 16, v3); + } +} + +static void store_interleave_chroma_e2k(uint8_t *dst, intptr_t i_dst, + uint8_t *srcu, uint8_t *srcv, int height) { + int y; + __m128i v0, v1; + + PRAGMA_E2K("ivdep") + for (y = 0; y < height; y++, dst += i_dst, srcu += FDEC_STRIDE, srcv += FDEC_STRIDE) { + v0 = VEC_LD8(srcu); + v1 = VEC_LD8(srcv); + v0 = _mm_unpacklo_epi8(v0, v1); + VEC_ST(dst, v0); + } +} + +static void load_deinterleave_chroma_fenc_e2k(uint8_t *dst, uint8_t *src, intptr_t i_src, int height) { + __m128i index = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m128i v0, v1; + + int y; + PRAGMA_E2K("ivdep") + for (y = 0; y < height; y += 2, dst += 2 * FENC_STRIDE, src += i_src * 2) { + v0 = VEC_LD(src); + v1 = VEC_LD(src + i_src); + v0 = _mm_shuffle_epi8(v0, index); + v1 = _mm_shuffle_epi8(v1, index); +#if FENC_STRIDE == 16 + VEC_ST(dst + 0 * FENC_STRIDE, v0); + VEC_ST(dst + 1 * FENC_STRIDE, v1); +#else + VEC_STL(dst + 0 * FENC_STRIDE, v0); + VEC_STH(dst + 0 * FENC_STRIDE + FENC_STRIDE / 2, v0); + VEC_STL(dst + 1 * FENC_STRIDE, v1); + VEC_STH(dst + 1 * FENC_STRIDE + FENC_STRIDE / 2, v1); +#endif + } +} + +static void load_deinterleave_chroma_fdec_e2k(uint8_t *dst, uint8_t *src, intptr_t i_src, int height) { + __m128i index = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + __m128i v0, v1; + + int y; + 
PRAGMA_E2K("ivdep") + for (y = 0; y < height; y += 2, dst += 2 * FDEC_STRIDE, src += i_src * 2) { + v0 = VEC_LD(src); + v1 = VEC_LD(src + i_src); + v0 = _mm_shuffle_epi8(v0, index); + v1 = _mm_shuffle_epi8(v1, index); + VEC_STL(dst + 0 * FDEC_STRIDE, v0); + VEC_STH(dst + 0 * FDEC_STRIDE + FDEC_STRIDE / 2, v0); + VEC_STL(dst + 1 * FDEC_STRIDE, v1); + VEC_STH(dst + 1 * FDEC_STRIDE + FDEC_STRIDE / 2, v1); + } +} + +#define DEINTERLEAVE_INDEX(x) 0 + x, 2 + x, 4 + x, 6 + x, 8 + x, 10 + x, 12 + x, 14 + x + +#if defined(__iset__) && __iset__ >= 5 + +static void plane_copy_deinterleave_e2k(uint8_t *dstu, intptr_t i_dstu, + uint8_t *dstv, intptr_t i_dstv, + uint8_t *src, intptr_t i_src, int w, int h) { + int x, y; + __m128i v0, v1, v2, v3; + __m128i index0 = _mm_setr_epi8(DEINTERLEAVE_INDEX(0), DEINTERLEAVE_INDEX(16)); + __m128i index1 = _mm_setr_epi8(DEINTERLEAVE_INDEX(1), DEINTERLEAVE_INDEX(17)); + + if (w <= 0) return; + PRAGMA_E2K("ivdep") + for (y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src) { + for (x = 0; x < w; x += 16) { + v0 = VEC_LD(src + 2 * x); + v1 = VEC_LD(src + 2 * x + 16); + v2 = _mm_shuffle2_epi8(v0, v1, index0); + v3 = _mm_shuffle2_epi8(v0, v1, index1); + VEC_ST(dstu + x, v2); + VEC_ST(dstv + x, v3); + } + } +} + +#define RGB_INDEX3(x) 0 + x, 3 + x, 6 + x, 9 + x, 12 + x, 15 + x, 18 + x, 21 + x +#define RGB_INDEX4(x) 0 + x, 4 + x, 8 + x, 12 + x, 16 + x, 20 + x, 24 + x, 28 + x + +static void plane_copy_deinterleave_rgb_e2k(uint8_t *dsta, intptr_t i_dsta, + uint8_t *dstb, intptr_t i_dstb, + uint8_t *dstc, intptr_t i_dstc, + uint8_t *src, intptr_t i_src, + int pw, int w, int h) { + int x, y; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + if (h <= 0 || w <= 0) return; + if (pw == 3) { + __m128i index0 = _mm_setr_epi8(RGB_INDEX3(0), RGB_INDEX3(1)); + __m128i index1 = _mm_setr_epi8(RGB_INDEX3(2), RGB_INDEX3(3)); + __m128i index2 = _mm_setr_epi8(RGB_INDEX3(8), RGB_INDEX3(9)); + __m128i index3 = _mm_setr_epi8(RGB_INDEX3(10), RGB_INDEX3(11)); + + for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) { + PRAGMA_E2K("ivdep") + for (x = 0; x < w; x += 16) { + v0 = VEC_LD(src + 3 * x); + v1 = VEC_LD(src + 3 * x + 16); + v2 = VEC_LD(src + 3 * x + 32); + v4 = _mm_shuffle2_epi8(v0, v1, index0); // rg0 + v5 = _mm_shuffle2_epi8(v0, v1, index1); // bx0 + v6 = _mm_shuffle2_epi8(v1, v2, index2); // rg1 + v7 = _mm_shuffle2_epi8(v1, v2, index3); // bx1 + + VEC_ST(dsta + x, _mm_unpacklo_epi64(v4, v6)); + VEC_ST(dstb + x, _mm_unpackhi_epi64(v4, v6)); + VEC_ST(dstc + x, _mm_unpacklo_epi64(v5, v7)); + } + } + } else { + __m128i index0 = _mm_setr_epi8(RGB_INDEX4(0), RGB_INDEX4(1)); + __m128i index1 = _mm_setr_epi8(RGB_INDEX4(2), RGB_INDEX4(3)); + + for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) { + PRAGMA_E2K("ivdep") + for (x = 0; x < w; x += 16) { + v0 = VEC_LD(src + 4 * x); + v1 = VEC_LD(src + 4 * x + 16); + v2 = VEC_LD(src + 4 * x + 32); + v3 = VEC_LD(src + 4 * x + 48); + v4 = _mm_shuffle2_epi8(v0, v1, index0); // rg0 + v5 = _mm_shuffle2_epi8(v0, v1, index1); // bx0 + v6 = _mm_shuffle2_epi8(v2, v3, index0); // rg1 + v7 = _mm_shuffle2_epi8(v2, v3, index1); // bx1 + + VEC_ST(dsta + x, _mm_unpacklo_epi64(v4, v6)); + VEC_ST(dstb + x, _mm_unpackhi_epi64(v4, v6)); + VEC_ST(dstc + x, _mm_unpacklo_epi64(v5, v7)); + } + } + } +} +#else +static void plane_copy_deinterleave_e2k(uint8_t *dstu, intptr_t i_dstu, + uint8_t *dstv, intptr_t i_dstv, + uint8_t *src, intptr_t i_src, int w, int h) { + int x, y; + __m64 v0, v1, v2, v3, v4, 
v5; + __m64 index0 = _mm_setr_pi8(DEINTERLEAVE_INDEX(0)); + __m64 index1 = _mm_setr_pi8(DEINTERLEAVE_INDEX(1)); + + if (w <= 0) return; + for (y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src) { + PRAGMA_E2K("ivdep") + for (x = 0; x < w; x += 16) { + v0 = *(__m64*)(src + 2 * x); + v1 = *(__m64*)(src + 2 * x + 8); + v2 = _mm_shuffle2_pi8(v0, v1, index0); + v3 = _mm_shuffle2_pi8(v0, v1, index1); + v0 = *(__m64*)(src + 2 * x + 16); + v1 = *(__m64*)(src + 2 * x + 24); + v4 = _mm_shuffle2_pi8(v0, v1, index0); + v5 = _mm_shuffle2_pi8(v0, v1, index1); + *(__m64*)(dstu + x) = v2; + *(__m64*)(dstv + x) = v3; + *(__m64*)(dstu + x + 8) = v4; + *(__m64*)(dstv + x + 8) = v5; + } + } +} + +#define RGB_INDEX3(x) 0 + x, 3 + x, 6 + x, 9 + x +#define RGB_INDEX4(x) 0 + x, 4 + x, 8 + x, 12 + x + +static void plane_copy_deinterleave_rgb_e2k(uint8_t *dsta, intptr_t i_dsta, + uint8_t *dstb, intptr_t i_dstb, + uint8_t *dstc, intptr_t i_dstc, + uint8_t *src, intptr_t i_src, + int pw, int w, int h) { + int x, y; + __m64 v0, v1, v2, v3, v4, v5, v6, v7; + ALIGN8_COMMON + ALIGN8_VARS(src) + + if (h <= 0 || w <= 0) return; + if (pw == 3) { + __m64 index0 = _mm_setr_pi8(RGB_INDEX3(0), RGB_INDEX3(1)); + __m64 index1 = _mm_setr_pi8(RGB_INDEX3(2), RGB_INDEX3(3)); + __m64 index2 = _mm_setr_pi8(RGB_INDEX3(4), RGB_INDEX3(5)); + __m64 index3 = _mm_setr_pi8(RGB_INDEX3(6), RGB_INDEX3(7)); + for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) { + ALIGN8_START(src, src) + PRAGMA_E2K("ivdep") + PRAGMA_E2K("unroll(2)") + for (x = 0; x < w; x += 8) { + ALIGN8_READ8(v0, src, 0) + ALIGN8_READ8(v1, src, 1) + ALIGN8_READ8(v2, src, 2) + src_ptr += 3; + v4 = _mm_shuffle2_pi8(v0, v1, index0); // rg0 + v5 = _mm_shuffle2_pi8(v0, v1, index1); // bx0 + v6 = _mm_shuffle2_pi8(v1, v2, index2); // rg1 + v7 = _mm_shuffle2_pi8(v1, v2, index3); // bx1 + *(__m64*)(dsta + x) = _mm_unpacklo_pi32(v4, v6); + *(__m64*)(dstb + x) = _mm_unpackhi_pi32(v4, v6); + *(__m64*)(dstc + x) = _mm_unpacklo_pi32(v5, v7); + } + } + } else { + __m64 index0 = _mm_setr_pi8(RGB_INDEX4(0), RGB_INDEX4(1)); + __m64 index1 = _mm_setr_pi8(RGB_INDEX4(2), RGB_INDEX4(3)); + + for (y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src) { + ALIGN8_START(src, src) + PRAGMA_E2K("ivdep") + PRAGMA_E2K("unroll(2)") + for (x = 0; x < w; x += 8) { + ALIGN8_READ8(v0, src, 0) + ALIGN8_READ8(v1, src, 1) + ALIGN8_READ8(v2, src, 2) + ALIGN8_READ8(v3, src, 3) + src_ptr += 4; + v4 = _mm_shuffle2_pi8(v0, v1, index0); // rg0 + v5 = _mm_shuffle2_pi8(v0, v1, index1); // bx0 + v6 = _mm_shuffle2_pi8(v2, v3, index0); // rg1 + v7 = _mm_shuffle2_pi8(v2, v3, index1); // bx1 + *(__m64*)(dsta + x) = _mm_unpacklo_pi32(v4, v6); + *(__m64*)(dstb + x) = _mm_unpackhi_pi32(v4, v6); + *(__m64*)(dstc + x) = _mm_unpacklo_pi32(v5, v7); + } + } + } +} +#endif + +static void mc_luma_e2k(uint8_t *dst, intptr_t i_dst, + uint8_t *src[4], intptr_t i_src, + int mvx, int mvy, + int i_width, int i_height, const x264_weight_t *weight) { + int qpel_idx = ((mvy & 3) << 2) + (mvx & 3); + intptr_t offset = (mvy >> 2) * i_src + (mvx >> 2); + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy & 3) == 3) * i_src; + if (qpel_idx & 5) { /* qpel interpolation needed */ + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx & 3) == 3); + + switch (i_width) { + case 4: + pixel_avg2_w4_e2k(dst, i_dst, src1, i_src, src2, i_src, i_height); + break; + case 8: + pixel_avg2_w8_e2k(dst, i_dst, src1, i_src, src2, i_src, i_height); + break; + case 16: + 
default: + pixel_avg2_w16_e2k(dst, i_dst, src1, i_src, src2, i_src, i_height); + } + if (weight->weightfn) + weight->weightfn[i_width>>2](dst, i_dst, dst, i_dst, weight, i_height); + } + else if (weight->weightfn) + weight->weightfn[i_width>>2](dst, i_dst, src1, i_src, weight, i_height); + else { + switch (i_width) { + case 4: + mc_copy_w4_e2k(dst, i_dst, src1, i_src, i_height); + break; + case 8: + mc_copy_w8_e2k(dst, i_dst, src1, i_src, i_height); + break; + case 16: + mc_copy_w16_e2k(dst, i_dst, src1, i_src, i_height); + break; + } + } +} + +static uint8_t *get_ref_e2k(uint8_t *dst, intptr_t *i_dst, + uint8_t *src[4], intptr_t i_src, + int mvx, int mvy, + int i_width, int i_height, const x264_weight_t *weight) { + int qpel_idx = ((mvy & 3) << 2) + (mvx & 3); + intptr_t offset = (mvy >> 2) * i_src + (mvx >> 2); + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy & 3) == 3) * i_src; + if (qpel_idx & 5) { /* qpel interpolation needed */ + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx & 3) == 3); + switch (i_width) { + case 4: + pixel_avg2_w4_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height); + break; + case 8: + pixel_avg2_w8_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height); + break; + case 12: + case 16: + default: + pixel_avg2_w16_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height); + break; + case 20: + pixel_avg2_w20_e2k(dst, *i_dst, src1, i_src, src2, i_src, i_height); + break; + } + if (weight->weightfn) + weight->weightfn[i_width >> 2](dst, *i_dst, dst, *i_dst, weight, i_height); + return dst; + } + else if (weight->weightfn) { + weight->weightfn[i_width >> 2](dst, *i_dst, src1, i_src, weight, i_height); + return dst; + } else { + *i_dst = i_src; + return src1; + } +} + +#define VSLD(a, b, n) _mm_alignr_epi8(b, a, n) + +static void mc_chroma_2xh(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst, + uint8_t *src, intptr_t i_src, + int mvx, int mvy, int i_height) { + int y, d8x = mvx & 7, d8y = mvy & 7; + __m128i t0, t1, c01, c23, k32v = _mm_set1_epi16(32); + __m64 index = _mm_setr_pi8(0, 2, 2, 4, 1, 3, 3, 5); + __m64 h1, h2, h3; + + c01 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * (8 - d8y)); + c23 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * d8y); + + src += (mvy >> 3) * i_src + (mvx >> 3) * 2; + h3 = *(__m64*)src; src += i_src; + h3 = _mm_shuffle_pi8(h3, index); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y += 2) { + h1 = h3; + h2 = *(__m64*)src; src += i_src; + h3 = *(__m64*)src; src += i_src; + h2 = _mm_shuffle_pi8(h2, index); + h3 = _mm_shuffle_pi8(h3, index); + + t0 = _mm_unpacklo_epi64(_mm_movpi64_epi64(h1), _mm_movpi64_epi64(h2)); + t1 = _mm_unpacklo_epi64(_mm_movpi64_epi64(h2), _mm_movpi64_epi64(h3)); + t0 = _mm_maddubs_epi16(t0, c01); + t1 = _mm_maddubs_epi16(t1, c23); + t0 = _mm_add_epi16(_mm_add_epi16(t0, t1), k32v); + t0 = _mm_srli_epi16(t0, 6); + + t0 = _mm_packus_epi16(t0, t0); + *(uint16_t*)dstu = _mm_extract_epi16(t0, 0); dstu += i_dst; + *(uint16_t*)dstv = _mm_extract_epi16(t0, 1); dstv += i_dst; + *(uint16_t*)dstu = _mm_extract_epi16(t0, 2); dstu += i_dst; + *(uint16_t*)dstv = _mm_extract_epi16(t0, 3); dstv += i_dst; + } +} + +static void mc_chroma_4xh_e2k(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst, + uint8_t *src, intptr_t i_src, + int mvx, int mvy, int i_height) { + int y, d8x = mvx & 7, d8y = mvy & 7; + __m128i c01, c23, v01, v23, v2, dstuv; + __m128i d0, d1, k32v = _mm_set1_epi16(32); + __m128i perm0v = _mm_setr_epi8(1, 5, 9, 13, 3, 7, 11, 15, + 1, 5, 9, 13, 3, 7, 11, 15); + + c01 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * (8 
- d8y)); + c23 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * d8y); + + src += (mvy >> 3) * i_src + (mvx >> 3) * 2; + v2 = VEC_LD(src); src += i_src; + v23 = _mm_unpacklo_epi8(v2, _mm_bsrli_si128(v2, 2)); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y += 2) { + v01 = v23; + v2 = VEC_LD(src); src += i_src; + v23 = _mm_unpacklo_epi8(v2, _mm_bsrli_si128(v2, 2)); + + d0 = _mm_maddubs_epi16(v01, c01); + d1 = _mm_maddubs_epi16(v23, c23); + d0 = _mm_add_epi16(_mm_add_epi16(d0, d1), k32v); + d0 = _mm_slli_epi16(d0, 2); + + dstuv = _mm_shuffle_epi8(d0, perm0v); + *(uint32_t*)dstu = _mm_extract_epi32(dstuv, 0); dstu += i_dst; + *(uint32_t*)dstv = _mm_extract_epi32(dstuv, 1); dstv += i_dst; + + v01 = v23; + v2 = VEC_LD(src); src += i_src; + v23 = _mm_unpacklo_epi8(v2, _mm_bsrli_si128(v2, 2)); + + d0 = _mm_maddubs_epi16(v01, c01); + d1 = _mm_maddubs_epi16(v23, c23); + d0 = _mm_add_epi16(_mm_add_epi16(d0, d1), k32v); + d0 = _mm_slli_epi16(d0, 2); + + dstuv = _mm_shuffle_epi8(d0, perm0v); + *(uint32_t*)dstu = _mm_extract_epi32(dstuv, 0); dstu += i_dst; + *(uint32_t*)dstv = _mm_extract_epi32(dstuv, 1); dstv += i_dst; + } +} + +static void mc_chroma_8xh_e2k(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst, + uint8_t *src, intptr_t i_src, + int mvx, int mvy, int i_height) { + int y, d8x = mvx & 7, d8y = mvy & 7; + __m128i c01, c23, v01l, v23l, v01h, v23h, v2, v3; + __m128i d0, d1, d2, d3, k32v = _mm_set1_epi16(32); + __m128i index = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, + 1, 3, 5, 7, 9, 11, 13, 15); + + c01 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * (8 - d8y)); + c23 = _mm_set1_epi16(((8 - d8x) | d8x << 8) * d8y); + + src += (mvy >> 3) * i_src + (mvx >> 3) * 2; + v2 = VEC_LD(src); v3 = VEC_LD8(src + 16); src += i_src; + v3 = _mm_alignr_epi8(v3, v2, 2); + v23l = _mm_unpacklo_epi8(v2, v3); + v23h = _mm_unpackhi_epi8(v2, v3); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y += 2) { + v01l = v23l; v01h = v23h; + v2 = VEC_LD(src); v3 = VEC_LD8(src + 16); src += i_src; + v3 = _mm_alignr_epi8(v3, v2, 2); + v23l = _mm_unpacklo_epi8(v2, v3); + v23h = _mm_unpackhi_epi8(v2, v3); + + d0 = _mm_maddubs_epi16(v01l, c01); + d1 = _mm_maddubs_epi16(v01h, c01); + d2 = _mm_maddubs_epi16(v23l, c23); + d3 = _mm_maddubs_epi16(v23h, c23); + d0 = _mm_add_epi16(_mm_add_epi16(d0, d2), k32v); + d1 = _mm_add_epi16(_mm_add_epi16(d1, d3), k32v); + d0 = _mm_srli_epi16(d0, 6); + d1 = _mm_srli_epi16(d1, 6); + d0 = _mm_packus_epi16(d0, d1); + d0 = _mm_shuffle_epi8(d0, index); + + VEC_STL(dstu, d0); dstu += i_dst; + VEC_STH(dstv, d0); dstv += i_dst; + + v01l = v23l; v01h = v23h; + v2 = VEC_LD(src); v3 = VEC_LD8(src + 16); src += i_src; + v3 = _mm_alignr_epi8(v3, v2, 2); + v23l = _mm_unpacklo_epi8(v2, v3); + v23h = _mm_unpackhi_epi8(v2, v3); + + d0 = _mm_maddubs_epi16(v01l, c01); + d1 = _mm_maddubs_epi16(v01h, c01); + d2 = _mm_maddubs_epi16(v23l, c23); + d3 = _mm_maddubs_epi16(v23h, c23); + d0 = _mm_add_epi16(_mm_add_epi16(d0, d2), k32v); + d1 = _mm_add_epi16(_mm_add_epi16(d1, d3), k32v); + d0 = _mm_srli_epi16(d0, 6); + d1 = _mm_srli_epi16(d1, 6); + d0 = _mm_packus_epi16(d0, d1); + d0 = _mm_shuffle_epi8(d0, index); + + VEC_STL(dstu, d0); dstu += i_dst; + VEC_STH(dstv, d0); dstv += i_dst; + } +} + +static void mc_chroma_e2k(uint8_t *dstu, uint8_t *dstv, intptr_t i_dst, + uint8_t *src, intptr_t i_src, + int mvx, int mvy, int i_width, int i_height) { + + if (i_width == 8) + mc_chroma_8xh_e2k(dstu, dstv, i_dst, src, i_src, mvx, mvy, i_height); + else if (i_width == 4) + mc_chroma_4xh_e2k(dstu, dstv, i_dst, src, i_src, mvx, mvy, i_height); + 
else + mc_chroma_2xh(dstu, dstv, i_dst, src, i_src, mvx, mvy, i_height); +} + +#if 1 /* 3 unpack + 3 mul + 2 add/sub */ +#define HPEL_FILTER_1_INIT __m128i \ + madd1v = _mm_set1_epi8(-1), madd2v = _mm_set1_epi16(20 - 5 * 256); + +#define HPEL_FILTER_1(lo, dest) { \ + temp1v = _mm_unpack##lo##_epi8(src1v, src6v); \ + temp2v = _mm_unpack##lo##_epi8(src3v, src2v); \ + temp3v = _mm_unpack##lo##_epi8(src4v, src5v); \ + /* a-5*b+20*c */ \ + temp1v = _mm_maddubs_epi16(temp1v, madd1v); \ + temp2v = _mm_maddubs_epi16(temp2v, madd2v); \ + temp3v = _mm_maddubs_epi16(temp3v, madd2v); \ + temp2v = _mm_sub_epi16(temp2v, temp1v); \ + dest = _mm_add_epi16(temp2v, temp3v); \ +} +#else /* 6 unpack + 7 add/sub + 2 shl */ +#define HPEL_FILTER_1_INIT __m128i zerov = _mm_setzero_si128(); + +#define HPEL_FILTER_1(lo, dest) { \ + temp1v = _mm_unpack##lo##_epi8(src1v, zerov); \ + temp2v = _mm_unpack##lo##_epi8(src2v, zerov); \ + temp3v = _mm_unpack##lo##_epi8(src3v, zerov); \ + temp4v = _mm_unpack##lo##_epi8(src4v, zerov); \ + temp5v = _mm_unpack##lo##_epi8(src5v, zerov); \ + temp6v = _mm_unpack##lo##_epi8(src6v, zerov); \ + \ + temp1v = _mm_add_epi16(temp1v, temp6v); \ + temp2v = _mm_add_epi16(temp2v, temp5v); \ + temp3v = _mm_add_epi16(temp3v, temp4v); \ + \ + temp1v = _mm_sub_epi16(temp1v, temp2v); /* (a-b) */ \ + temp2v = _mm_sub_epi16(temp2v, temp3v); /* (b-c) */ \ + temp2v = _mm_slli_epi16(temp2v, 2); /* (b-c)*4 */ \ + temp1v = _mm_sub_epi16(temp1v, temp2v); /* a-5*b+4*c */ \ + temp3v = _mm_slli_epi16(temp3v, 4); /* 16*c */ \ + dest = _mm_add_epi16(temp1v, temp3v); /* a-5*b+20*c */ \ +} +#endif + +#define HPEL_FILTER_2(t1v, t2v, t3v, t4v, t5v, t6v) { \ + t1v = _mm_add_epi16(t1v, t6v); \ + t2v = _mm_add_epi16(t2v, t5v); \ + t3v = _mm_add_epi16(t3v, t4v); \ + \ + t1v = _mm_sub_epi16(t1v, t2v); /* (a-b) */ \ + t1v = _mm_srai_epi16(t1v, 2); /* (a-b)/4 */ \ + t1v = _mm_sub_epi16(t1v, t2v); /* (a-b)/4-b */ \ + t1v = _mm_add_epi16(t1v, t3v); /* (a-b)/4-b+c */ \ + t1v = _mm_srai_epi16(t1v, 2); /* ((a-b)/4-b+c)/4 */ \ + t1v = _mm_add_epi16(t1v, t3v); /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \ +} + +#define HPEL_FILTER_HORIZONTAL() { \ + src2v = VEC_LD8(src + x + i_stride * y - 8); \ + src2v = _mm_unpacklo_epi64(src2v, src2v); \ + src3v = VEC_LD(src + x + i_stride * y); \ + src6v = VEC_LD8(src + x + i_stride * y + 16); \ + \ + src1v = VSLD(src2v, src3v, 14); \ + src2v = VSLD(src2v, src3v, 15); \ + src4v = VSLD(src3v, src6v, 1); \ + src5v = VSLD(src3v, src6v, 2); \ + src6v = VSLD(src3v, src6v, 3); \ + \ + HPEL_FILTER_1(lo, temp1v); \ + \ + dest1v = _mm_add_epi16(temp1v, sixteenv); \ + dest1v = _mm_srai_epi16(dest1v, 5); \ + \ + HPEL_FILTER_1(hi, temp1v); \ + \ + dest2v = _mm_add_epi16(temp1v, sixteenv); \ + dest2v = _mm_srai_epi16(dest2v, 5); \ + \ + destv = _mm_packus_epi16(dest1v, dest2v); \ + VEC_ST(dsth + x + i_stride * y, destv); \ +} + +#define HPEL_FILTER_VERTICAL() { \ + src1v = VEC_LD(src + x + i_stride * (y - 2)); \ + src2v = VEC_LD(src + x + i_stride * (y - 1)); \ + /* src3v = VEC_LD(src + x + i_stride * y); */ \ + src4v = VEC_LD(src + x + i_stride * (y + 1)); \ + src5v = VEC_LD(src + x + i_stride * (y + 2)); \ + src6v = VEC_LD(src + x + i_stride * (y + 3)); \ + \ + HPEL_FILTER_1(lo, temp7v); \ + \ + dest1v = _mm_add_epi16(temp7v, sixteenv); \ + dest1v = _mm_srai_epi16(dest1v, 5); \ + \ + HPEL_FILTER_1(hi, temp1v); \ + \ + dest2v = _mm_add_epi16(temp1v, sixteenv); \ + dest2v = _mm_srai_epi16(dest2v, 5); \ + \ + destv = _mm_packus_epi16(dest1v, dest2v); \ + VEC_ST(dstv + x + i_stride * y, destv); \ +} + 
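+/* Note on the half-pel filters (illustrative summary): HPEL_FILTER_1 builds
+ * the 16-bit sum of the H.264 6-tap kernel (1,-5,20,20,-5,1),
+ *   src1 - 5*src2 + 20*src3 + 20*src4 - 5*src5 + src6,
+ * via _mm_maddubs_epi16 with the byte constants -1 and (20,-5); the H and V
+ * outputs are then rounded with +16 and shifted right by 5.  HPEL_FILTER_2
+ * and HPEL_FILTER_CENTRAL below apply the same kernel to the 16-bit vertical
+ * intermediates, with an approximate /16 folded into their two arithmetic
+ * shifts, then round with +32 and shift right by 6 (roughly (sum+512)>>10
+ * overall).  Scalar form of one horizontal output, saturated to 0..255:
+ *
+ *   dsth[x] = ( p[x-2] - 5*p[x-1] + 20*p[x] + 20*p[x+1] - 5*p[x+2] + p[x+3]
+ *               + 16 ) >> 5;
+ */
+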
+#define HPEL_FILTER_CENTRAL() { \ + temp1v = VSLD(tempav, tempbv, 12); \ + temp2v = VSLD(tempav, tempbv, 14); \ + temp3v = tempbv; \ + temp4v = VSLD(tempbv, tempcv, 2); \ + temp5v = VSLD(tempbv, tempcv, 4); \ + temp6v = VSLD(tempbv, tempcv, 6); \ + \ + HPEL_FILTER_2(temp1v, temp2v, temp3v, \ + temp4v, temp5v, temp6v); \ + \ + dest1v = _mm_add_epi16(temp1v, thirtytwov); \ + dest1v = _mm_srai_epi16(dest1v, 6); \ + \ + temp1v = VSLD(tempbv, tempcv, 12); \ + temp2v = VSLD(tempbv, tempcv, 14); \ + temp3v = tempcv; \ + temp4v = VSLD(tempcv, tempdv, 2); \ + temp5v = VSLD(tempcv, tempdv, 4); \ + temp6v = VSLD(tempcv, tempdv, 6); \ + \ + HPEL_FILTER_2(temp1v, temp2v, temp3v, \ + temp4v, temp5v, temp6v); \ + \ + dest2v = _mm_add_epi16(temp1v, thirtytwov); \ + dest2v = _mm_srai_epi16(dest2v, 6); \ + \ + destv = _mm_packus_epi16(dest1v, dest2v); \ + VEC_ST(dstc + x - 16 + i_stride * y, destv); \ +} + +static void hpel_filter_e2k(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + intptr_t i_stride, int i_width, int i_height, int16_t *buf) { + int y; + vec_u8_t destv; + vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v; + vec_s16_t dest1v, dest2v; + vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v; + vec_s16_t tempav, tempbv, tempcv, tempdv, tempev; + HPEL_FILTER_1_INIT; + + vec_s16_t sixteenv, thirtytwov; + sixteenv = _mm_set1_epi16(16); + thirtytwov = _mm_set1_epi16(32); + + for (y = 0; y < i_height; y++) { + int x = 0; + + /* horizontal_filter */ + HPEL_FILTER_HORIZONTAL(); + + /* vertical_filter */ + HPEL_FILTER_VERTICAL(); + + /* central_filter */ + tempav = tempcv; + tempbv = tempdv; + tempcv = VEC_SPLAT16(temp1v, 0); /* first only */ + tempdv = temp7v; + tempev = temp1v; + + PRAGMA_E2K("ivdep") + for (x = 16; x < i_width; x += 16) { + /* horizontal_filter */ + HPEL_FILTER_HORIZONTAL(); + + /* vertical_filter */ + HPEL_FILTER_VERTICAL(); + + /* central_filter */ + tempav = tempcv; + tempbv = tempdv; + tempcv = tempev; + tempdv = temp7v; + tempev = temp1v; + + HPEL_FILTER_CENTRAL(); + } + + /* Partial vertical filter */ + src1v = VEC_LD(src + x + i_stride * (y - 2)); + src2v = VEC_LD(src + x + i_stride * (y - 1)); + src3v = VEC_LD(src + x + i_stride * y); + src4v = VEC_LD(src + x + i_stride * (y + 1)); + src5v = VEC_LD(src + x + i_stride * (y + 2)); + src6v = VEC_LD(src + x + i_stride * (y + 3)); + + HPEL_FILTER_1(lo, temp1v); + + /* central_filter */ + tempav = tempcv; + tempbv = tempdv; + tempcv = tempev; + tempdv = temp1v; + /* tempev is not used */ + + HPEL_FILTER_CENTRAL(); + } +} + +static void frame_init_lowres_core_e2k(uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, + intptr_t src_stride, intptr_t dst_stride, int width, int height) { + int x, y, w = width >> 4, end = (width & 15); + vec_u8_t src0v, src1v, src2v; + vec_u8_t lv, hv, src1p1v; + vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv; + __m128i mask_even = _mm_set1_epi16(0xFF); + + for (y = 0; y < height; y++) { + uint8_t *src1 = src0 + src_stride; + uint8_t *src2 = src1 + src_stride; + + src0v = VEC_LD(src0); + src1v = VEC_LD(src1); + src2v = VEC_LD(src2); + + avg0v = _mm_avg_epu8(src0v, src1v); + avg1v = _mm_avg_epu8(src1v, src2v); + + PRAGMA_E2K("ivdep") + for (x = 0; x < w; x++) { + lv = VEC_LD(src0 + 16 * (x * 2 + 1)); + src1v = VEC_LD(src1 + 16 * (x * 2 + 1)); + avghv = _mm_avg_epu8(lv, src1v); + + lv = VEC_LD(src0 + 16 * (x * 2 + 2)); + src1p1v = VEC_LD(src1 + 16 * (x * 2 + 2)); + avghp1v = _mm_avg_epu8(lv, src1p1v); + + avgleftv = _mm_avg_epu8(VSLD(avg0v, avghv, 
1), avg0v); + avgrightv = _mm_avg_epu8(VSLD(avghv, avghp1v, 1), avghv); + + lv = _mm_packus_epi16(_mm_and_si128(avgleftv, mask_even), _mm_and_si128(avgrightv, mask_even)); + hv = _mm_packus_epi16(_mm_srli_epi16(avgleftv, 8), _mm_srli_epi16(avgrightv, 8)); + VEC_ST(dst0 + 16 * x, lv); + VEC_ST(dsth + 16 * x, hv); + + avg0v = avghp1v; + + hv = VEC_LD(src2 + 16 * (x * 2 + 1)); + avghv = _mm_avg_epu8(src1v, hv); + + hv = VEC_LD(src2 + 16 * (x * 2 + 2)); + avghp1v = _mm_avg_epu8(src1p1v, hv); + + avgleftv = _mm_avg_epu8(VSLD(avg1v, avghv, 1), avg1v); + avgrightv = _mm_avg_epu8(VSLD(avghv, avghp1v, 1), avghv); + + lv = _mm_packus_epi16(_mm_and_si128(avgleftv, mask_even), _mm_and_si128(avgrightv, mask_even)); + hv = _mm_packus_epi16(_mm_srli_epi16(avgleftv, 8), _mm_srli_epi16(avgrightv, 8)); + VEC_ST(dstv + 16 * x, lv); + VEC_ST(dstc + 16 * x, hv); + + avg1v = avghp1v; + + } + if (end) { + lv = VEC_LD(src0 + 16 * (x * 2 + 1)); + src1v = VEC_LD(src1 + 16 * (x * 2 + 1)); + avghv = _mm_avg_epu8(lv, src1v); + + lv = VEC_LD(src2 + 16 * (x * 2 + 1)); + avghp1v = _mm_avg_epu8(src1v, lv); + + avgleftv = _mm_avg_epu8(VSLD(avg0v, avghv, 1), avg0v); + avgrightv = _mm_avg_epu8(VSLD(avg1v, avghp1v, 1), avg1v); + + lv = _mm_packus_epi16(_mm_and_si128(avgleftv, mask_even), _mm_and_si128(avgrightv, mask_even)); + hv = _mm_packus_epi16(_mm_srli_epi16(avgleftv, 8), _mm_srli_epi16(avgrightv, 8)); + VEC_STL(dst0 + 16 * x, lv); + VEC_STL(dsth + 16 * x, hv); + + VEC_STH(dstv + 16 * x, lv); + VEC_STH(dstc + 16 * x, hv); + } + + src0 += src_stride*2; + dst0 += dst_stride; + dsth += dst_stride; + dstv += dst_stride; + dstc += dst_stride; + } +} + +static void mc_weight_w2_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, + const x264_weight_t *weight, int i_height) { + LOAD_ZERO; + vec_u8_t srcv; + vec_s16_t weightv, scalev, offsetv, roundv; + + int y, denom = weight->i_denom; + scalev = _mm_set1_epi16(weight->i_scale); + offsetv = _mm_set1_epi16(weight->i_offset); + + if (denom >= 1) { + roundv = _mm_set1_epi16(1 << (denom - 1)); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD8(src); + weightv = vec_u8_to_s16(srcv); + + weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), roundv); + weightv = _mm_srai_epi16(weightv, denom); + weightv = _mm_add_epi16(weightv, offsetv); + + srcv = _mm_packus_epi16(weightv, weightv); + *(uint16_t*)dst = _mm_extract_epi16(srcv, 0); + } + } else { + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD8(src); + weightv = vec_u8_to_s16(srcv); + + weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), offsetv); + + srcv = _mm_packus_epi16(weightv, weightv); + *(uint16_t*)dst = _mm_extract_epi16(srcv, 0); + } + } +} + +static void mc_weight_w4_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, + const x264_weight_t *weight, int i_height) { + LOAD_ZERO; + vec_u8_t srcv; + vec_s16_t weightv, scalev, offsetv, roundv; + + int y, denom = weight->i_denom; + scalev = _mm_set1_epi16(weight->i_scale); + offsetv = _mm_set1_epi16(weight->i_offset); + + if (denom >= 1) { + roundv = _mm_set1_epi16(1 << (denom - 1)); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD8(src); + weightv = vec_u8_to_s16(srcv); + + weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), roundv); + weightv = _mm_srai_epi16(weightv, denom); + weightv = _mm_add_epi16(weightv, offsetv); + + srcv = _mm_packus_epi16(weightv, weightv); + 
*(uint32_t*)dst = _mm_extract_epi32(srcv, 0); + } + } else { + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD8(src); + weightv = vec_u8_to_s16(srcv); + + weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), offsetv); + + srcv = _mm_packus_epi16(weightv, weightv); + *(uint32_t*)dst = _mm_extract_epi32(srcv, 0); + } + } +} + +static void mc_weight_w8_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, + const x264_weight_t *weight, int i_height) { + LOAD_ZERO; + vec_u8_t srcv; + vec_s16_t weightv, scalev, offsetv, roundv; + + int y, denom = weight->i_denom; + scalev = _mm_set1_epi16(weight->i_scale); + offsetv = _mm_set1_epi16(weight->i_offset); + + if (denom >= 1) { + roundv = _mm_set1_epi16(1 << (denom - 1)); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD(src); + weightv = vec_u8_to_s16(srcv); + + weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), roundv); + weightv = _mm_srai_epi16(weightv, denom); + weightv = _mm_add_epi16(weightv, offsetv); + + srcv = _mm_packus_epi16(weightv, weightv); + VEC_STL(dst, srcv); + } + } else { + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD(src); + weightv = vec_u8_to_s16(srcv); + + weightv = _mm_add_epi16(_mm_mullo_epi16(weightv, scalev), offsetv); + + srcv = _mm_packus_epi16(weightv, weightv); + VEC_STL(dst, srcv); + } + } +} + +static void mc_weight_w16_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, + const x264_weight_t *weight, int i_height) { + LOAD_ZERO; + vec_u8_t srcv; + vec_s16_t weight_lv, weight_hv, scalev, offsetv, roundv; + + int y, denom = weight->i_denom; + scalev = _mm_set1_epi16(weight->i_scale); + offsetv = _mm_set1_epi16(weight->i_offset); + + if (denom >= 1) { + roundv = _mm_set1_epi16(1 << (denom - 1)); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD(src); + weight_hv = vec_u8_to_s16_h(srcv); + weight_lv = vec_u8_to_s16_l(srcv); + + weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), roundv); + weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), roundv); + weight_hv = _mm_srai_epi16(weight_hv, denom); + weight_lv = _mm_srai_epi16(weight_lv, denom); + weight_hv = _mm_add_epi16(weight_hv, offsetv); + weight_lv = _mm_add_epi16(weight_lv, offsetv); + + srcv = _mm_packus_epi16(weight_hv, weight_lv); + VEC_ST(dst, srcv); + } + } else { + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD(src); + weight_hv = vec_u8_to_s16_h(srcv); + weight_lv = vec_u8_to_s16_l(srcv); + + weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), offsetv); + weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), offsetv); + + srcv = _mm_packus_epi16(weight_hv, weight_lv); + VEC_ST(dst, srcv); + } + } +} + +static void mc_weight_w20_e2k(uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, + const x264_weight_t *weight, int i_height) { + LOAD_ZERO; + vec_u8_t srcv, srcv2; + vec_s16_t weight_lv, weight_hv, weight_3v; + vec_s16_t scalev, offsetv, roundv; + + int y, denom = weight->i_denom; + scalev = _mm_set1_epi16(weight->i_scale); + offsetv = _mm_set1_epi16(weight->i_offset); + + if (denom >= 1) { + roundv = _mm_set1_epi16(1 << (denom - 1)); + + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD(src); + srcv2 = VEC_LD8(src + 16); + + weight_hv = vec_u8_to_s16_h(srcv); + weight_lv 
= vec_u8_to_s16_l(srcv); + weight_3v = vec_u8_to_s16_h(srcv2); + + weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), roundv); + weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), roundv); + weight_3v = _mm_add_epi16(_mm_mullo_epi16(weight_3v, scalev), roundv); + + weight_hv = _mm_srai_epi16(weight_hv, denom); + weight_lv = _mm_srai_epi16(weight_lv, denom); + weight_3v = _mm_srai_epi16(weight_3v, denom); + + weight_hv = _mm_add_epi16(weight_hv, offsetv); + weight_lv = _mm_add_epi16(weight_lv, offsetv); + weight_3v = _mm_add_epi16(weight_3v, offsetv); + + srcv = _mm_packus_epi16(weight_hv, weight_lv); + srcv2 = _mm_packus_epi16(weight_3v, weight_3v); + VEC_ST(dst, srcv); + *(uint32_t*)(dst + 16) = _mm_extract_epi32(srcv2, 0); + } + } else { + PRAGMA_E2K("ivdep") + for (y = 0; y < i_height; y++, dst += i_dst, src += i_src) { + srcv = VEC_LD(src); + srcv2 = VEC_LD8(src + 16); + + weight_hv = vec_u8_to_s16_h(srcv); + weight_lv = vec_u8_to_s16_l(srcv); + weight_3v = vec_u8_to_s16_h(srcv2); + + weight_hv = _mm_add_epi16(_mm_mullo_epi16(weight_hv, scalev), offsetv); + weight_lv = _mm_add_epi16(_mm_mullo_epi16(weight_lv, scalev), offsetv); + weight_3v = _mm_add_epi16(_mm_mullo_epi16(weight_3v, scalev), offsetv); + + srcv = _mm_packus_epi16(weight_hv, weight_lv); + srcv2 = _mm_packus_epi16(weight_3v, weight_3v); + VEC_ST(dst, srcv); + *(uint32_t*)(dst + 16) = _mm_extract_epi32(srcv2, 0); + } + } +} + +static weight_fn_t mc_weight_wtab_e2k[6] = { + mc_weight_w2_e2k, + mc_weight_w4_e2k, + mc_weight_w8_e2k, + mc_weight_w16_e2k, + mc_weight_w16_e2k, + mc_weight_w20_e2k, +}; + +PLANE_COPY_SWAP(16, e2k) +PLANE_INTERLEAVE(e2k) +#endif // !HIGH_BIT_DEPTH + +#if HIGH_BIT_DEPTH + +#define STORE_8(a, b, shl0, shl1) { \ + v3 = _mm_shuffle_epi8(v##a, idx##a); \ + v5 = _mm_shuffle_epi8(v##b, idx##b); \ + v4 = _mm_unpacklo_epi64(v3, v5); \ + v5 = _mm_unpackhi_epi64(v3, v5); \ + v4 = _mm_mullo_epi16(v4, shl0); \ + v5 = _mm_mullo_epi16(v5, shl1); \ + v4 = _mm_srli_epi16(v4, 6); \ + v5 = _mm_srli_epi16(v5, 6); \ + VEC_ST(dstc + offset, v4); \ + VEC_ST(dsty + offset, v5); \ + offset += 8; \ +} + +// v210 input is only compatible with bit-depth of 10 bits +static void plane_copy_deinterleave_v210_e2k(uint16_t *dsty, intptr_t i_dsty, + uint16_t *dstc, intptr_t i_dstc, + uint32_t *src, intptr_t i_src, int w, int h) { + __m128i idx0 = _mm_setr_epi8( + 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0A); + __m128i idx1 = _mm_setr_epi8( + 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D); + __m128i idx2 = _mm_setr_epi8( + 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0D, 0x0E, 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F); + + __m128i shl0 = _mm_setr_epi16(64, 4, 16, 64, 4, 16, 64, 4), + shl1 = _mm_setr_epi16(16, 64, 4, 16, 64, 4, 16, 64), + shl2 = _mm_setr_epi16(4, 16, 64, 4, 16, 64, 4, 16); + + __m128i v0, v1, v2, v3, v4, v5; + int i, j; + + for (i = 0; i < h; i++, dsty += i_dsty, dstc += i_dstc, src += i_src) { + int offset = 0, s = 0; + + PRAGMA_E2K("ivdep") + for (j = 0; j < w; j += 24, s += 16) { + v0 = VEC_LD(src + s); + v2 = VEC_LD(src + s + 4); + v1 = _mm_alignr_epi8(v2, v0, 8); + STORE_8(0, 1, shl0, shl1); + + v0 = VEC_LD(src + s + 8); + STORE_8(2, 0, shl1, shl2); + + v2 = VEC_LD(src + s + 12); + v1 = _mm_alignr_epi8(v2, v0, 8); + STORE_8(1, 2, shl2, shl0); + } + } +} +#endif // HIGH_BIT_DEPTH + +void x264_mc_init_e2k(x264_mc_functions_t *pf) { +#if HIGH_BIT_DEPTH + pf->plane_copy_deinterleave_v210 = 
plane_copy_deinterleave_v210_e2k; +#else // !HIGH_BIT_DEPTH + pf->mc_luma = mc_luma_e2k; + pf->get_ref = get_ref_e2k; + pf->mc_chroma = mc_chroma_e2k; + + pf->avg[PIXEL_16x16] = pixel_avg_16x16_e2k; + pf->avg[PIXEL_16x8] = pixel_avg_16x8_e2k; + pf->avg[PIXEL_8x16] = pixel_avg_8x16_e2k; + pf->avg[PIXEL_8x8] = pixel_avg_8x8_e2k; + pf->avg[PIXEL_8x4] = pixel_avg_8x4_e2k; + pf->avg[PIXEL_4x16] = pixel_avg_4x16_e2k; + pf->avg[PIXEL_4x8] = pixel_avg_4x8_e2k; + pf->avg[PIXEL_4x4] = pixel_avg_4x4_e2k; + pf->avg[PIXEL_4x2] = pixel_avg_4x2_e2k; + pf->avg[PIXEL_2x8] = pixel_avg_2x8_e2k; + pf->avg[PIXEL_2x4] = pixel_avg_2x4_e2k; + pf->avg[PIXEL_2x2] = pixel_avg_2x2_e2k; + + pf->copy_16x16_unaligned = mc_copy_w16_e2k; + pf->copy[PIXEL_16x16] = mc_copy_w16_aligned_e2k; + + pf->hpel_filter = hpel_filter_e2k; + pf->frame_init_lowres_core = frame_init_lowres_core_e2k; + + pf->weight = mc_weight_wtab_e2k; + + pf->plane_copy_swap = plane_copy_swap_e2k; + pf->plane_copy_interleave = plane_copy_interleave_e2k; + pf->store_interleave_chroma = store_interleave_chroma_e2k; + pf->plane_copy_deinterleave = plane_copy_deinterleave_e2k; + pf->plane_copy_deinterleave_rgb = plane_copy_deinterleave_rgb_e2k; + + pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc_e2k; + pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec_e2k; +#endif // !HIGH_BIT_DEPTH +} diff --git a/common/e2k/mc.h b/common/e2k/mc.h new file mode 100644 index 0000000..3e0c8e1 --- /dev/null +++ b/common/e2k/mc.h @@ -0,0 +1,30 @@ +/***************************************************************************** + * mc.h: e2k motion compensation + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_E2K_MC_H +#define X264_E2K_MC_H + +#define x264_mc_init_e2k x264_template(mc_init_e2k) +void x264_mc_init_e2k( x264_mc_functions_t *pf ); + +#endif diff --git a/common/e2k/pixel.c b/common/e2k/pixel.c new file mode 100644 index 0000000..e68fbda --- /dev/null +++ b/common/e2k/pixel.c @@ -0,0 +1,1169 @@ +/***************************************************************************** + * pixel.c: e2k pixel metrics + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * Copyright (C) 2003-2017 x264 project + * + * Authors: Eric Petit + * Guillaume Poirier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "e2kcommon.h" +#include "pixel.h" + +#if !HIGH_BIT_DEPTH +/*********************************************************************** + * SAD routines + **********************************************************************/ + +#define SAD_LD8(pix) *(__m64*)pix +#define SAD_LD16(pix) VEC_LD(pix) +#define SAD_NEXT8(sumv, fencv, pixv) sumv = _mm_add_pi32(sumv, _mm_sad_pu8(fencv, pixv)) +#define SAD_NEXT16(sumv, fencv, pixv) sumv = _mm_add_epi32(sumv, _mm_sad_epu8(fencv, pixv)) +#define SAD_SUM8(sumv) _mm_extract_pi32(sumv, 0) +#define SAD_SUM16(sumv) _mm_extract_epi32(sumv, 0) + _mm_extract_epi32(sumv, 2) + +#ifdef NEED_ALIGN8 +#define PIXEL_SAD_E2K(lx, ly, vec, zerov) \ +static int pixel_sad_##lx##x##ly##_e2k(uint8_t *pix1, intptr_t i_pix1, \ + uint8_t *pix2, intptr_t i_pix2) { \ + vec v0, v1, sumv = zerov; \ + int y; \ + for (y = 0; y < ly; y++) { \ + v0 = SAD_LD##lx(pix1); pix1 += i_pix1; \ + v1 = SAD_LD##lx(pix2); pix2 += i_pix2; \ + SAD_NEXT##lx(sumv, v0, v1); \ + } \ + return SAD_SUM##lx(sumv); \ +} \ +static void pixel_sad_x4_##lx##x##ly##_e2k(uint8_t *fenc, \ + uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, \ + intptr_t stride, int scores[4]) { \ + vec fencv, pix0v, pix1v, pix2v, pix3v; \ + vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v, sum3v = sum0v; \ + int y; \ + if (!(stride & 7) && !(FENC_STRIDE & 7)) { \ + ALIGN8A_COMMON \ + ALIGN8A_VARS(fenc) \ + ALIGN8A_VARS(pix0) \ + ALIGN8A_VARS(pix1) \ + ALIGN8A_VARS(pix2) \ + ALIGN8A_VARS(pix3) \ + stride >>= 3; \ + ALIGN8A_START(fenc, fenc) \ + ALIGN8A_START(pix0, pix0) \ + ALIGN8A_START(pix1, pix1) \ + ALIGN8A_START(pix2, pix2) \ + ALIGN8A_START(pix3, pix3) \ + for (y = 0; y < ly; y++) { \ + ALIGN8A_READ##lx(pix0v, pix0); pix0_ptr += stride; \ + ALIGN8A_READ##lx(pix1v, pix1); pix1_ptr += stride; \ + ALIGN8A_READ##lx(fencv, fenc); 
fenc_ptr += FENC_STRIDE / 8; \ + ALIGN8A_READ##lx(pix2v, pix2); pix2_ptr += stride; \ + ALIGN8A_READ##lx(pix3v, pix3); pix3_ptr += stride; \ + SAD_NEXT##lx(sum0v, fencv, pix0v); \ + SAD_NEXT##lx(sum1v, fencv, pix1v); \ + SAD_NEXT##lx(sum2v, fencv, pix2v); \ + SAD_NEXT##lx(sum3v, fencv, pix3v); \ + } \ + } else \ + for (y = 0; y < ly; y++) { \ + pix0v = SAD_LD##lx(pix0); pix0 += stride; \ + pix1v = SAD_LD##lx(pix1); pix1 += stride; \ + fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \ + pix2v = SAD_LD##lx(pix2); pix2 += stride; \ + pix3v = SAD_LD##lx(pix3); pix3 += stride; \ + SAD_NEXT##lx(sum0v, fencv, pix0v); \ + SAD_NEXT##lx(sum1v, fencv, pix1v); \ + SAD_NEXT##lx(sum2v, fencv, pix2v); \ + SAD_NEXT##lx(sum3v, fencv, pix3v); \ + } \ + scores[0] = SAD_SUM##lx(sum0v); \ + scores[1] = SAD_SUM##lx(sum1v); \ + scores[2] = SAD_SUM##lx(sum2v); \ + scores[3] = SAD_SUM##lx(sum3v); \ +} \ +static void pixel_sad_x3_##lx##x##ly##_e2k(uint8_t *fenc, \ + uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \ + intptr_t stride, int scores[4]) { \ + vec fencv, pix0v, pix1v, pix2v; \ + vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v; \ + int y; \ + if (!(stride & 7) && !(FENC_STRIDE & 7)) { \ + ALIGN8A_COMMON \ + ALIGN8A_VARS(fenc) \ + ALIGN8A_VARS(pix0) \ + ALIGN8A_VARS(pix1) \ + ALIGN8A_VARS(pix2) \ + stride >>= 3; \ + ALIGN8A_START(fenc, fenc) \ + ALIGN8A_START(pix0, pix0) \ + ALIGN8A_START(pix1, pix1) \ + ALIGN8A_START(pix2, pix2) \ + for (y = 0; y < ly; y++) { \ + ALIGN8A_READ##lx(pix0v, pix0); pix0_ptr += stride; \ + ALIGN8A_READ##lx(pix1v, pix1); pix1_ptr += stride; \ + ALIGN8A_READ##lx(fencv, fenc); fenc_ptr += FENC_STRIDE / 8; \ + ALIGN8A_READ##lx(pix2v, pix2); pix2_ptr += stride; \ + SAD_NEXT##lx(sum0v, fencv, pix0v); \ + SAD_NEXT##lx(sum1v, fencv, pix1v); \ + SAD_NEXT##lx(sum2v, fencv, pix2v); \ + } \ + } else \ + for (y = 0; y < ly; y++) { \ + pix0v = SAD_LD##lx(pix0); pix0 += stride; \ + pix1v = SAD_LD##lx(pix1); pix1 += stride; \ + fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \ + pix2v = SAD_LD##lx(pix2); pix2 += stride; \ + SAD_NEXT##lx(sum0v, fencv, pix0v); \ + SAD_NEXT##lx(sum1v, fencv, pix1v); \ + SAD_NEXT##lx(sum2v, fencv, pix2v); \ + } \ + scores[0] = SAD_SUM##lx(sum0v); \ + scores[1] = SAD_SUM##lx(sum1v); \ + scores[2] = SAD_SUM##lx(sum2v); \ +} + +#else + +#define PIXEL_SAD_E2K(lx, ly, vec, zerov) \ +static int pixel_sad_##lx##x##ly##_e2k(uint8_t *pix1, intptr_t i_pix1, \ + uint8_t *pix2, intptr_t i_pix2) { \ + vec v0, v1, sumv = zerov; \ + int y; \ + for (y = 0; y < ly; y++) { \ + v0 = SAD_LD##lx(pix1); pix1 += i_pix1; \ + v1 = SAD_LD##lx(pix2); pix2 += i_pix2; \ + SAD_NEXT##lx(sumv, v0, v1); \ + } \ + return SAD_SUM##lx(sumv); \ +} \ +static void pixel_sad_x4_##lx##x##ly##_e2k(uint8_t *fenc, \ + uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, \ + intptr_t stride, int scores[4]) { \ + vec fencv, pix0v, pix1v, pix2v, pix3v; \ + vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v, sum3v = sum0v; \ + int y; \ + for (y = 0; y < ly; y++) { \ + pix0v = SAD_LD##lx(pix0); pix0 += stride; \ + pix1v = SAD_LD##lx(pix1); pix1 += stride; \ + fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \ + pix2v = SAD_LD##lx(pix2); pix2 += stride; \ + pix3v = SAD_LD##lx(pix3); pix3 += stride; \ + SAD_NEXT##lx(sum0v, fencv, pix0v); \ + SAD_NEXT##lx(sum1v, fencv, pix1v); \ + SAD_NEXT##lx(sum2v, fencv, pix2v); \ + SAD_NEXT##lx(sum3v, fencv, pix3v); \ + } \ + scores[0] = SAD_SUM##lx(sum0v); \ + scores[1] = SAD_SUM##lx(sum1v); \ + scores[2] = SAD_SUM##lx(sum2v); \ + scores[3] = SAD_SUM##lx(sum3v); \ +} \ +static 
void pixel_sad_x3_##lx##x##ly##_e2k(uint8_t *fenc, \ + uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \ + intptr_t stride, int scores[4]) { \ + vec fencv, pix0v, pix1v, pix2v; \ + vec sum0v = zerov, sum1v = sum0v, sum2v = sum0v; \ + int y; \ + for (y = 0; y < ly; y++) { \ + pix0v = SAD_LD##lx(pix0); pix0 += stride; \ + pix1v = SAD_LD##lx(pix1); pix1 += stride; \ + fencv = SAD_LD##lx(fenc); fenc += FENC_STRIDE; \ + pix2v = SAD_LD##lx(pix2); pix2 += stride; \ + SAD_NEXT##lx(sum0v, fencv, pix0v); \ + SAD_NEXT##lx(sum1v, fencv, pix1v); \ + SAD_NEXT##lx(sum2v, fencv, pix2v); \ + } \ + scores[0] = SAD_SUM##lx(sum0v); \ + scores[1] = SAD_SUM##lx(sum1v); \ + scores[2] = SAD_SUM##lx(sum2v); \ +} + +#endif + +PIXEL_SAD_E2K(8, 8, __m64, _mm_setzero_si64()) +PIXEL_SAD_E2K(8, 16, __m64, _mm_setzero_si64()) +PIXEL_SAD_E2K(16, 8, __m128i, _mm_setzero_si128()) +PIXEL_SAD_E2K(16, 16, __m128i, _mm_setzero_si128()) + +/*********************************************************************** + * SATD routines + **********************************************************************/ + +/*********************************************************************** + * VEC_HADAMAR + *********************************************************************** + * b[0] = a[0] + a[1] + a[2] + a[3] + * b[1] = a[0] + a[1] - a[2] - a[3] + * b[2] = a[0] - a[1] - a[2] + a[3] + * b[3] = a[0] - a[1] + a[2] - a[3] + **********************************************************************/ +#define VEC_HADAMAR(a0, a1, a2, a3, b0, b1, b2, b3) \ + b2 = _mm_add_epi16(a0, a1); \ + b3 = _mm_add_epi16(a2, a3); \ + a0 = _mm_sub_epi16(a0, a1); \ + a2 = _mm_sub_epi16(a2, a3); \ + b0 = _mm_add_epi16(b2, b3); \ + b1 = _mm_sub_epi16(b2, b3); \ + b2 = _mm_sub_epi16(a0, a2); \ + b3 = _mm_add_epi16(a0, a2) + +/*********************************************************************** + * VEC_ABS + *********************************************************************** + * a: s16v + * + * a = abs(a) + * + * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs() + * actually also calls vec_splat(0), but we already have a null vector. 
+ **********************************************************************/ +#define VEC_ABSOLUTE(a) _mm_abs_epi16(a) + +#define VEC_TRANSPOSE_8X4(a0, a1, a2, a3, b0, b1, b2, b3) \ + b0 = _mm_unpacklo_epi16(a0, a2); \ + b1 = _mm_unpackhi_epi16(a0, a2); \ + b2 = _mm_unpacklo_epi16(a1, a3); \ + b3 = _mm_unpackhi_epi16(a1, a3); \ + a0 = _mm_unpacklo_epi16(b0, b2); \ + a1 = _mm_unpackhi_epi16(b0, b2); \ + a2 = _mm_unpacklo_epi16(b1, b3); \ + a3 = _mm_unpackhi_epi16(b1, b3); \ + b0 = _mm_unpacklo_epi64(a0, a2); \ + b1 = _mm_unpackhi_epi64(a0, a2); \ + b2 = _mm_unpacklo_epi64(a1, a3); \ + b3 = _mm_unpackhi_epi64(a1, a3); + +#define PREP_DIFF \ + LOAD_ZERO; \ + vec_s16_t pix1v, pix2v; + +#define VEC_DIFF8X1_H(p1,i1,p2,i2,d) \ + pix1v = VEC_LD8(p1); \ + pix2v = VEC_LD8(p2); \ + d = _mm_sub_epi16(vec_u8_to_s16(pix1v), vec_u8_to_s16(pix2v)); \ + p1 += i1; p2 += i2 + +#define VEC_DIFF4X1_H(p1,i1,p2,i2,d) \ + pix1v = _mm_cvtsi32_si128(*(uint32_t*)p1); \ + pix2v = _mm_cvtsi32_si128(*(uint32_t*)p2); \ + d = _mm_sub_epi16(vec_u8_to_s16(pix1v), vec_u8_to_s16(pix2v)); \ + p1 += i1; p2 += i2 + +#define VEC_DIFF4X2_H(p1,i1,p2,i2,d) \ + pix1v = _mm_setr_epi32(*(uint32_t*)p1, *(uint32_t*)(p1 + i1 * 4), 0, 0); \ + pix2v = _mm_setr_epi32(*(uint32_t*)p2, *(uint32_t*)(p2 + i2 * 4), 0, 0); \ + d = _mm_sub_epi16(vec_u8_to_s16(pix1v), vec_u8_to_s16(pix2v)); \ + p1 += i1; p2 += i2 + +/*********************************************************************** + * SATD 4x4 + **********************************************************************/ +static int pixel_satd_4x4_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + PREP_DIFF; + vec_s16_t diff0v, diff1v, diff2v, diff3v; + vec_s16_t temp0v, temp1v, temp2v, temp3v; + vec_u16_t satdv; __m64 halfv; + + VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff0v); + VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff1v); + VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff2v); + VEC_DIFF4X1_H(pix1, i_pix1, pix2, i_pix2, diff3v); + + /* Hadamar H */ + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + + VEC_TRANSPOSE_4(temp0v, temp1v, temp2v, temp3v, + diff0v, diff1v, diff2v, diff3v); + /* Hadamar V */ + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + satdv = _mm_add_epi16(temp0v, temp2v); + + halfv = _mm_movepi64_pi64(satdv); + halfv = _mm_hadd_pi16(halfv, halfv); + halfv = _mm_hadd_pi16(halfv, halfv); + return _mm_extract_pi16(halfv, 0) >> 1; +} + +/*********************************************************************** + * SATD 4x8 + **********************************************************************/ +static int pixel_satd_4x8_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + PREP_DIFF; + vec_s16_t diff0v, diff1v, diff2v, diff3v; + vec_s16_t temp0v, temp1v, temp2v, temp3v; + vec_u16_t satdv; __m64 halfv; + + VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff0v); + VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff1v); + VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff2v); + VEC_DIFF4X2_H(pix1, i_pix1, pix2, i_pix2, diff3v); + + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_TRANSPOSE_8X4(temp0v, temp1v, temp2v, temp3v, + diff0v, diff1v, diff2v, diff3v); + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + 
temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + satdv = _mm_add_epi16(temp0v, temp2v); + satdv = _mm_hadd_epi16(satdv, satdv); + + halfv = _mm_movepi64_pi64(satdv); + halfv = _mm_hadd_pi16(halfv, halfv); + return (_mm_extract_pi16(halfv, 0) + _mm_extract_pi16(halfv, 1)) >> 1; +} + +/*********************************************************************** + * SATD 8x4 + **********************************************************************/ +static int pixel_satd_8x4_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + PREP_DIFF; + vec_s16_t diff0v, diff1v, diff2v, diff3v; + vec_s16_t temp0v, temp1v, temp2v, temp3v; + vec_u16_t satdv; __m64 halfv; + + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v); + + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_TRANSPOSE_8X4(temp0v, temp1v, temp2v, temp3v, + diff0v, diff1v, diff2v, diff3v); + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + satdv = _mm_add_epi16(temp0v, temp2v); + satdv = _mm_hadd_epi16(satdv, satdv); + + halfv = _mm_movepi64_pi64(satdv); + halfv = _mm_hadd_pi16(halfv, halfv); + return (_mm_extract_pi16(halfv, 0) + _mm_extract_pi16(halfv, 1)) >> 1; +} + +/*********************************************************************** + * SATD 8x8 + **********************************************************************/ +static int pixel_satd_8x8_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + PREP_DIFF; + vec_s16_t diff0v, diff1v, diff2v, diff3v, + diff4v, diff5v, diff6v, diff7v; + vec_s16_t temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v; + vec_u16_t satdv; + + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff6v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v); + + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v, + temp4v, temp5v, temp6v, temp7v); + + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diff0v, diff1v, diff2v, diff3v, + diff4v, diff5v, diff6v, diff7v); + + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(temp0v, temp4v); + satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov)); + + satdv = _mm_hadd_epi32(satdv, satdv); + return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1; +} + 
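[Editor's aside, not part of the patch: every pixel_satd_* routine in this file computes the same quantity as the scalar sketch below, just vectorized — form the difference block, apply a horizontal and a vertical Hadamard butterfly pass, sum the absolute values of the transformed coefficients, and halve the result. The helper name satd_4x4_ref and its standalone form are illustrative assumptions, not code from the patch; the butterfly output order differs slightly from VEC_HADAMAR, which does not affect the absolute sum.]

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar reference for one 4x4 SATD block (illustrative only). */
    static int satd_4x4_ref(const uint8_t *pix1, intptr_t i_pix1,
                            const uint8_t *pix2, intptr_t i_pix2)
    {
        int d[4][4], t[4][4], sum = 0;

        /* difference block */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = pix1[y * i_pix1 + x] - pix2[y * i_pix2 + x];

        /* horizontal 4-point Hadamard (same add/sub butterflies as VEC_HADAMAR) */
        for (int y = 0; y < 4; y++) {
            int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
            int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
            t[y][0] = a0 + a2; t[y][1] = a1 + a3;
            t[y][2] = a0 - a2; t[y][3] = a1 - a3;
        }

        /* vertical 4-point Hadamard, then sum of absolute values */
        for (int x = 0; x < 4; x++) {
            int a0 = t[0][x] + t[1][x], a1 = t[0][x] - t[1][x];
            int a2 = t[2][x] + t[3][x], a3 = t[2][x] - t[3][x];
            sum += abs(a0 + a2) + abs(a1 + a3) + abs(a0 - a2) + abs(a1 - a3);
        }
        return sum >> 1;
    }

[The wider e2k routines amortise this over whole vector registers: 8-wide rows carry two 4x4 sub-blocks per register, and the 16-wide variants additionally split each row into high and low halves via VEC_DIFF_HL.]
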
+/*********************************************************************** + * SATD 8x16 + **********************************************************************/ +static int pixel_satd_8x16_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + PREP_DIFF; + vec_s16_t diff0v, diff1v, diff2v, diff3v, + diff4v, diff5v, diff6v, diff7v; + vec_s16_t temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v; + vec_u16_t satdv; + + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff6v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v); + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v, + temp4v, temp5v, temp6v, temp7v); + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diff0v, diff1v, diff2v, diff3v, + diff4v, diff5v, diff6v, diff7v); + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(temp0v, temp4v); + + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff6v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v); + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v, + temp4v, temp5v, temp6v, temp7v); + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diff0v, diff1v, diff2v, diff3v, + diff4v, diff5v, diff6v, diff7v); + VEC_HADAMAR(diff0v, diff1v, diff2v, diff3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diff4v, diff5v, diff6v, diff7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v)); + satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov)); + + satdv = _mm_hadd_epi32(satdv, satdv); + return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1; +} + +/*********************************************************************** + * SATD 16x8 + **********************************************************************/ +static int pixel_satd_16x8_e2k(uint8_t *pix1, intptr_t i_pix1, + 
uint8_t *pix2, intptr_t i_pix2) { + LOAD_ZERO; + vec_u16_t satdv; + vec_s16_t pix1v, pix2v; + vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v, + diffh4v, diffh5v, diffh6v, diffh7v; + vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v, + diffl4v, diffl5v, diffl6v, diffl7v; + vec_s16_t temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v; + + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v); + + VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v, + temp4v, temp5v, temp6v, temp7v); + + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diffh0v, diffh1v, diffh2v, diffh3v, + diffh4v, diffh5v, diffh6v, diffh7v); + + VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(temp0v, temp4v); + + VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v, + temp4v, temp5v, temp6v, temp7v); + + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diffl0v, diffl1v, diffl2v, diffl3v, + diffl4v, diffl5v, diffl6v, diffl7v); + + VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v)); + satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov)); + + satdv = _mm_hadd_epi32(satdv, satdv); + return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1; +} + +/*********************************************************************** + * SATD 16x16 + **********************************************************************/ +static int pixel_satd_16x16_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + LOAD_ZERO; + vec_u16_t satdv; + vec_s16_t pix1v, pix2v; + vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v, + diffh4v, diffh5v, diffh6v, diffh7v; + vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v, + diffl4v, diffl5v, diffl6v, diffl7v; + vec_s16_t temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v; + + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v); + 
VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v); + VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v, + temp4v, temp5v, temp6v, temp7v); + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diffh0v, diffh1v, diffh2v, diffh3v, + diffh4v, diffh5v, diffh6v, diffh7v); + VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(temp0v, temp4v); + + VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v, + temp4v, temp5v, temp6v, temp7v); + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diffl0v, diffl1v, diffl2v, diffl3v, + diffl4v, diffl5v, diffl6v, diffl7v); + VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v)); + + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v); + VEC_DIFF_HL(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v); + VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v, + temp4v, temp5v, temp6v, temp7v); + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diffh0v, diffh1v, diffh2v, diffh3v, + diffh4v, diffh5v, diffh6v, diffh7v); + VEC_HADAMAR(diffh0v, diffh1v, diffh2v, diffh3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffh4v, diffh5v, diffh6v, diffh7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = 
_mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v)); + + VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v, + temp4v, temp5v, temp6v, temp7v); + VEC_TRANSPOSE_8(temp0v, temp1v, temp2v, temp3v, + temp4v, temp5v, temp6v, temp7v, + diffl0v, diffl1v, diffl2v, diffl3v, + diffl4v, diffl5v, diffl6v, diffl7v); + VEC_HADAMAR(diffl0v, diffl1v, diffl2v, diffl3v, + temp0v, temp1v, temp2v, temp3v); + VEC_HADAMAR(diffl4v, diffl5v, diffl6v, diffl7v, + temp4v, temp5v, temp6v, temp7v); + + temp0v = _mm_add_epi16(_mm_abs_epi16(temp0v), _mm_abs_epi16(temp1v)); + temp2v = _mm_add_epi16(_mm_abs_epi16(temp2v), _mm_abs_epi16(temp3v)); + temp0v = _mm_add_epi16(temp0v, temp2v); + temp4v = _mm_add_epi16(_mm_abs_epi16(temp4v), _mm_abs_epi16(temp5v)); + temp6v = _mm_add_epi16(_mm_abs_epi16(temp6v), _mm_abs_epi16(temp7v)); + temp4v = _mm_add_epi16(temp4v, temp6v); + satdv = _mm_add_epi16(satdv, _mm_add_epi16(temp0v, temp4v)); + satdv = _mm_add_epi32(_mm_unpacklo_epi16(satdv, zerov), _mm_unpackhi_epi16(satdv, zerov)); + + satdv = _mm_hadd_epi32(satdv, satdv); + return (_mm_extract_epi32(satdv, 0) + _mm_extract_epi32(satdv, 1)) >> 1; +} + + +/*********************************************************************** +* SSD routines +**********************************************************************/ + +static int pixel_ssd_16x16_e2k(uint8_t *pix1, intptr_t stride1, + uint8_t *pix2, intptr_t stride2) { + LOAD_ZERO; + __m128i v0, v1, v2, v3, sumv = zerov; + int y; + + for (y = 0; y < 16; y++) { + v0 = VEC_LD(pix1); pix1 += stride1; + v1 = VEC_LD(pix2); pix2 += stride2; + v2 = _mm_sub_epi16(_mm_unpacklo_epi8(v0, zerov), _mm_unpacklo_epi8(v1, zerov)); + v3 = _mm_sub_epi16(_mm_unpackhi_epi8(v0, zerov), _mm_unpackhi_epi8(v1, zerov)); + sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v2, v2)); + sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v3, v3)); + } + + sumv = _mm_hadd_epi32(sumv, sumv); + return _mm_extract_epi32(sumv, 0) + _mm_extract_epi32(sumv, 1); +} + +static int pixel_ssd_8x8_e2k(uint8_t *pix1, intptr_t stride1, + uint8_t *pix2, intptr_t stride2) { + LOAD_ZERO; + __m128i v0, v1, v2, v3, sumv = zerov; + int y; + + for (y = 0; y < 4; y++) { + v0 = VEC_LD8(pix1); pix1 += stride1; + v2 = VEC_LD8(pix2); pix2 += stride2; + v1 = VEC_LD8(pix1); pix1 += stride1; + v3 = VEC_LD8(pix2); pix2 += stride2; + v2 = _mm_sub_epi16(_mm_unpacklo_epi8(v0, zerov), _mm_unpacklo_epi8(v2, zerov)); + v3 = _mm_sub_epi16(_mm_unpacklo_epi8(v1, zerov), _mm_unpacklo_epi8(v3, zerov)); + sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v2, v2)); + sumv = _mm_add_epi32(sumv, _mm_madd_epi16(v3, v3)); + } + + sumv = _mm_hadd_epi32(sumv, sumv); + return _mm_extract_epi32(sumv, 0) + _mm_extract_epi32(sumv, 1); +} + +/**************************************************************************** + * variance + ****************************************************************************/ +static uint64_t pixel_var_16x16_e2k(uint8_t *pix, intptr_t stride) { + LOAD_ZERO; + __m128i v0, v1, sqrv = zerov, sumv = zerov; + int y, sum, sqr; + + for (y = 0; y < 16; y++, pix += stride) { + v1 = VEC_LD(pix); + sumv = _mm_add_epi16(sumv, _mm_sad_epu8(v1, zerov)); + v0 = _mm_unpacklo_epi8(v1, zerov); + v1 = _mm_unpackhi_epi8(v1, zerov); + sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v0, v0)); + sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v1, v1)); + } + sqrv = _mm_hadd_epi32(sqrv, sqrv); + sum = 
_mm_extract_epi16(sumv, 0) + _mm_extract_epi16(sumv, 4); + sqr = _mm_extract_epi32(sqrv, 0) + _mm_extract_epi32(sqrv, 1); + return sum | (uint64_t)sqr << 32; +} + +static uint64_t pixel_var_8x8_e2k(uint8_t *pix, intptr_t stride) { + LOAD_ZERO; + __m128i v0, v1, sqrv = zerov, sumv = zerov; + int y, sum, sqr; + + for (y = 0; y < 4; y++, pix += stride * 2) { + v0 = VEC_LD8(pix); + v1 = VEC_LD8(pix + stride); + sumv = _mm_add_epi16(sumv, _mm_sad_epu8(_mm_unpacklo_epi64(v0, v1), zerov)); + v0 = _mm_unpacklo_epi8(v0, zerov); + v1 = _mm_unpacklo_epi8(v1, zerov); + sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v0, v0)); + sqrv = _mm_add_epi32(sqrv, _mm_madd_epi16(v1, v1)); + } + sqrv = _mm_hadd_epi32(sqrv, sqrv); + sum = _mm_extract_epi16(sumv, 0) + _mm_extract_epi16(sumv, 4); + sqr = _mm_extract_epi32(sqrv, 0) + _mm_extract_epi32(sqrv, 1); + return sum | (uint64_t)sqr << 32; +} + +/********************************************************************** + * SA8D routines: sum of 8x8 Hadamard transformed differences + **********************************************************************/ +/* SA8D_1D unrolled by 8 */ +#define SA8D_1D_E2K(sa8d0v, sa8d1v, sa8d2v, sa8d3v, \ + sa8d4v, sa8d5v, sa8d6v, sa8d7v) { \ + /* int a0 = SRC(0) + SRC(4) */ \ + /* int a4 = SRC(0) - SRC(4) */ \ + /* int a1 = SRC(1) + SRC(5) */ \ + /* int a5 = SRC(1) - SRC(5) */ \ + /* int a2 = SRC(2) + SRC(6) */ \ + /* int a6 = SRC(2) - SRC(6) */ \ + /* int a3 = SRC(3) + SRC(7) */ \ + /* int a7 = SRC(3) - SRC(7) */ \ + vec_s16_t a0v = _mm_add_epi16(sa8d0v, sa8d4v); \ + vec_s16_t a4v = _mm_sub_epi16(sa8d0v, sa8d4v); \ + vec_s16_t a1v = _mm_add_epi16(sa8d1v, sa8d5v); \ + vec_s16_t a5v = _mm_sub_epi16(sa8d1v, sa8d5v); \ + vec_s16_t a2v = _mm_add_epi16(sa8d2v, sa8d6v); \ + vec_s16_t a6v = _mm_sub_epi16(sa8d2v, sa8d6v); \ + vec_s16_t a3v = _mm_add_epi16(sa8d3v, sa8d7v); \ + vec_s16_t a7v = _mm_sub_epi16(sa8d3v, sa8d7v); \ + \ + /* int b0 = a0 + a2; */ \ + /* int b2 = a0 - a2; */ \ + /* int b1 = a1 + a3; */ \ + /* int b3 = a1 - a3; */ \ + /* int b4 = a4 + a6; */ \ + /* int b6 = a4 - a6; */ \ + /* int b5 = a5 + a7; */ \ + /* int b7 = a5 - a7; */ \ + vec_s16_t b0v = _mm_add_epi16(a0v, a2v); \ + vec_s16_t b2v = _mm_sub_epi16(a0v, a2v); \ + vec_s16_t b1v = _mm_add_epi16(a1v, a3v); \ + vec_s16_t b3v = _mm_sub_epi16(a1v, a3v); \ + vec_s16_t b4v = _mm_add_epi16(a4v, a6v); \ + vec_s16_t b6v = _mm_sub_epi16(a4v, a6v); \ + vec_s16_t b5v = _mm_add_epi16(a5v, a7v); \ + vec_s16_t b7v = _mm_sub_epi16(a5v, a7v); \ + \ + /* DST(0, b0 + b1) */ \ + /* DST(1, b0 - b1) */ \ + /* DST(2, b2 + b3) */ \ + /* DST(3, b2 - b3) */ \ + /* DST(4, b4 + b5) */ \ + /* DST(5, b4 - b5) */ \ + /* DST(6, b6 + b7) */ \ + /* DST(7, b6 - b7) */ \ + sa8d0v = _mm_add_epi16(b0v, b1v); \ + sa8d1v = _mm_sub_epi16(b0v, b1v); \ + sa8d2v = _mm_add_epi16(b2v, b3v); \ + sa8d3v = _mm_sub_epi16(b2v, b3v); \ + sa8d4v = _mm_add_epi16(b4v, b5v); \ + sa8d5v = _mm_sub_epi16(b4v, b5v); \ + sa8d6v = _mm_add_epi16(b6v, b7v); \ + sa8d7v = _mm_sub_epi16(b6v, b7v); \ +} + +static int pixel_sa8d_8x8_core_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + PREP_DIFF; + vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v; + + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff0v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff1v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff2v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff3v); + + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff4v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff5v); + VEC_DIFF8X1_H(pix1, 
i_pix1, pix2, i_pix2, diff6v); + VEC_DIFF8X1_H(pix1, i_pix1, pix2, i_pix2, diff7v); + + vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v; + + SA8D_1D_E2K(diff0v, diff1v, diff2v, diff3v, + diff4v, diff5v, diff6v, diff7v); + + VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v, + diff4v, diff5v, diff6v, diff7v, + sa8d0v, sa8d1v, sa8d2v, sa8d3v, + sa8d4v, sa8d5v, sa8d6v, sa8d7v); + + SA8D_1D_E2K(sa8d0v, sa8d1v, sa8d2v, sa8d3v, + sa8d4v, sa8d5v, sa8d6v, sa8d7v); + + /* accumulation of the absolute value of all elements of the resulting bloc */ + vec_s16_t sum0v = _mm_add_epi16(_mm_abs_epi16(sa8d0v), _mm_abs_epi16(sa8d1v)); + vec_s16_t sum1v = _mm_add_epi16(_mm_abs_epi16(sa8d2v), _mm_abs_epi16(sa8d3v)); + vec_s16_t sum2v = _mm_add_epi16(_mm_abs_epi16(sa8d4v), _mm_abs_epi16(sa8d5v)); + vec_s16_t sum3v = _mm_add_epi16(_mm_abs_epi16(sa8d6v), _mm_abs_epi16(sa8d7v)); + sum0v = _mm_add_epi16(sum0v, sum1v); + sum2v = _mm_add_epi16(sum2v, sum3v); + sum0v = _mm_add_epi32(_mm_unpacklo_epi16(sum0v, zerov), _mm_unpackhi_epi16(sum0v, zerov)); + sum1v = _mm_add_epi32(_mm_unpacklo_epi16(sum2v, zerov), _mm_unpackhi_epi16(sum2v, zerov)); + sum0v = _mm_add_epi32(sum0v, sum1v); + + sum0v = _mm_hadd_epi32(sum0v, sum0v); + return _mm_extract_epi32(sum0v, 0) + _mm_extract_epi32(sum0v, 1); +} + +static int pixel_sa8d_8x8_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + return (pixel_sa8d_8x8_core_e2k(pix1, i_pix1, pix2, i_pix2) + 2) >> 2; +} + +static int pixel_sa8d_16x16_e2k(uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2) { + return (pixel_sa8d_8x8_core_e2k(&pix1[0], i_pix1, &pix2[0], i_pix2) + + pixel_sa8d_8x8_core_e2k(&pix1[8], i_pix1, &pix2[8], i_pix2) + + pixel_sa8d_8x8_core_e2k(&pix1[8 * i_pix1], i_pix1, &pix2[8 * i_pix2], i_pix2) + + pixel_sa8d_8x8_core_e2k(&pix1[8 * i_pix1 + 8], i_pix1, &pix2[8 * i_pix2 + 8], i_pix2) + 2) >> 2; +} + +#define HADAMARD4_E2K(d0,d1,d2,d3,s0,s1,s2,s3) { \ + vec_s16_t t0 = _mm_add_epi16(s0, s1); \ + vec_s16_t t1 = _mm_sub_epi16(s0, s1); \ + vec_s16_t t2 = _mm_add_epi16(s2, s3); \ + vec_s16_t t3 = _mm_sub_epi16(s2, s3); \ + d0 = _mm_add_epi16(t0, t2); \ + d2 = _mm_sub_epi16(t0, t2); \ + d1 = _mm_add_epi16(t1, t3); \ + d3 = _mm_sub_epi16(t1, t3); \ +} + +#define VEC_LOAD_HIGH(p, num) \ + vec_u8_t pix8_##num = VEC_LD8(p + stride * num); \ + vec_s16_t pix16_s##num = vec_u8_to_s16(pix8_##num); \ + vec_s16_t pix16_d##num; + +static uint64_t pixel_hadamard_ac_e2k(uint8_t *pix, intptr_t stride) { + LOAD_ZERO; + + VEC_LOAD_HIGH(pix, 0); + VEC_LOAD_HIGH(pix, 1); + VEC_LOAD_HIGH(pix, 2); + VEC_LOAD_HIGH(pix, 3); + HADAMARD4_E2K(pix16_d0, pix16_d1, pix16_d2, pix16_d3, + pix16_s0, pix16_s1, pix16_s2, pix16_s3); + + VEC_LOAD_HIGH(pix, 4); + VEC_LOAD_HIGH(pix, 5); + VEC_LOAD_HIGH(pix, 6); + VEC_LOAD_HIGH(pix, 7); + HADAMARD4_E2K(pix16_d4, pix16_d5, pix16_d6, pix16_d7, + pix16_s4, pix16_s5, pix16_s6, pix16_s7); + + VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3, + pix16_d4, pix16_d5, pix16_d6, pix16_d7, + pix16_s0, pix16_s1, pix16_s2, pix16_s3, + pix16_s4, pix16_s5, pix16_s6, pix16_s7); + + HADAMARD4_E2K(pix16_d0, pix16_d1, pix16_d2, pix16_d3, + pix16_s0, pix16_s1, pix16_s2, pix16_s3); + + HADAMARD4_E2K(pix16_d4, pix16_d5, pix16_d6, pix16_d7, + pix16_s4, pix16_s5, pix16_s6, pix16_s7); + + vec_u16_t addabs01 = _mm_add_epi16(_mm_abs_epi16(pix16_d0), _mm_abs_epi16(pix16_d1)); + vec_u16_t addabs23 = _mm_add_epi16(_mm_abs_epi16(pix16_d2), _mm_abs_epi16(pix16_d3)); + vec_u16_t addabs45 = _mm_add_epi16(_mm_abs_epi16(pix16_d4), 
_mm_abs_epi16(pix16_d5)); + vec_u16_t addabs67 = _mm_add_epi16(_mm_abs_epi16(pix16_d6), _mm_abs_epi16(pix16_d7)); + + vec_u16_t sum4_v = _mm_add_epi16(_mm_add_epi16(addabs01, addabs23), _mm_add_epi16(addabs45, addabs67)); + sum4_v = _mm_add_epi32(_mm_unpacklo_epi16(sum4_v, zerov), _mm_unpackhi_epi16(sum4_v, zerov)); + sum4_v = _mm_hadd_epi32(sum4_v, sum4_v); + int sum4 = _mm_extract_epi32(sum4_v, 0) + _mm_extract_epi32(sum4_v, 1); + + vec_s16_t tmpi0 = _mm_add_epi16(pix16_d0, pix16_d4); + vec_s16_t tmpi4 = _mm_sub_epi16(pix16_d0, pix16_d4); + vec_s16_t tmpi1 = _mm_add_epi16(pix16_d1, pix16_d5); + vec_s16_t tmpi5 = _mm_sub_epi16(pix16_d1, pix16_d5); + vec_s16_t tmpi2 = _mm_add_epi16(pix16_d2, pix16_d6); + vec_s16_t tmpi6 = _mm_sub_epi16(pix16_d2, pix16_d6); + vec_s16_t tmpi3 = _mm_add_epi16(pix16_d3, pix16_d7); + vec_s16_t tmpi7 = _mm_sub_epi16(pix16_d3, pix16_d7); + + VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3, + tmpi4, tmpi5, tmpi6, tmpi7, + pix16_d0, pix16_d1, pix16_d2, pix16_d3, + pix16_d4, pix16_d5, pix16_d6, pix16_d7); + + vec_u16_t addsum04 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d0, pix16_d4)), + _mm_abs_epi16(_mm_sub_epi16(pix16_d0, pix16_d4))); + vec_u16_t addsum15 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d1, pix16_d5)), + _mm_abs_epi16(_mm_sub_epi16(pix16_d1, pix16_d5))); + vec_u16_t addsum26 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d2, pix16_d6)), + _mm_abs_epi16(_mm_sub_epi16(pix16_d2, pix16_d6))); + vec_u16_t addsum37 = _mm_add_epi16(_mm_abs_epi16(_mm_add_epi16(pix16_d3, pix16_d7)), + _mm_abs_epi16(_mm_sub_epi16(pix16_d3, pix16_d7))); + + vec_u16_t sum8_v = _mm_add_epi16(_mm_add_epi16(addsum04, addsum15), _mm_add_epi16(addsum26, addsum37)); + sum8_v = _mm_add_epi32(_mm_unpacklo_epi16(sum8_v, zerov), _mm_unpackhi_epi16(sum8_v, zerov)); + sum8_v = _mm_hadd_epi32(sum8_v, sum8_v); + int sum8 = _mm_extract_epi32(sum8_v, 0) + _mm_extract_epi32(sum8_v, 1); + + int tmp0 = _mm_extract_epi16(_mm_add_epi16(pix16_d0, pix16_d4), 0); + return (uint64_t)(sum8 - tmp0) << 32 | (sum4 - tmp0); +} + +static uint64_t pixel_hadamard_ac_16x16_e2k(uint8_t *pix, intptr_t stride) { + uint64_t sum = pixel_hadamard_ac_e2k(pix, stride); + sum += pixel_hadamard_ac_e2k(pix + 8, stride); + sum += pixel_hadamard_ac_e2k(pix + 8 * stride, stride); + sum += pixel_hadamard_ac_e2k(pix + 8 * stride + 8, stride); + return (sum >> 34) << 32 | (uint32_t)sum >> 1; +} + +static uint64_t pixel_hadamard_ac_16x8_e2k(uint8_t *pix, intptr_t stride) { + uint64_t sum = pixel_hadamard_ac_e2k(pix, stride); + sum += pixel_hadamard_ac_e2k(pix + 8, stride); + return (sum >> 34) << 32 | (uint32_t)sum >> 1; +} + +static uint64_t pixel_hadamard_ac_8x16_e2k(uint8_t *pix, intptr_t stride) { + uint64_t sum = pixel_hadamard_ac_e2k(pix, stride); + sum += pixel_hadamard_ac_e2k(pix + 8 * stride, stride); + return (sum >> 34) << 32 | (uint32_t)sum >> 1; +} + +static uint64_t pixel_hadamard_ac_8x8_e2k(uint8_t *pix, intptr_t stride) { + uint64_t sum = pixel_hadamard_ac_e2k(pix, stride); + return (sum >> 34) << 32 | (uint32_t)sum >> 1; +} + + +/**************************************************************************** + * structural similarity metric + ****************************************************************************/ +static void ssim_4x4x2_core_e2k(const uint8_t *pix1, intptr_t stride1, + const uint8_t *pix2, intptr_t stride2, + int sums[2][4]) { + LOAD_ZERO; + __m128i v0, v1, s1v, s2v, ssv, s12v; + int y; + s1v = s2v = ssv = s12v = zerov; + + for (y = 0; y < 4; y++) { + v0 = VEC_LD8(pix1); pix1 += 
stride1; + v1 = VEC_LD8(pix2); pix2 += stride2; + v0 = _mm_unpacklo_epi8(v0, zerov); + v1 = _mm_unpacklo_epi8(v1, zerov); + + s1v = _mm_add_epi16(s1v, _mm_sad_epu8(v0, zerov)); + s2v = _mm_add_epi16(s2v, _mm_sad_epu8(v1, zerov)); + ssv = _mm_add_epi32(ssv, _mm_madd_epi16(v0, v0)); + ssv = _mm_add_epi32(ssv, _mm_madd_epi16(v1, v1)); + s12v = _mm_add_epi32(s12v, _mm_madd_epi16(v0, v1)); + } + ssv = _mm_hadd_epi32(ssv, ssv); + s12v = _mm_hadd_epi32(s12v, s12v); + + sums[0][0] = _mm_extract_epi16(s1v, 0); + sums[1][0] = _mm_extract_epi16(s1v, 4); + sums[0][1] = _mm_extract_epi16(s2v, 0); + sums[1][1] = _mm_extract_epi16(s2v, 4); + sums[0][2] = _mm_extract_epi32(ssv, 0); + sums[1][2] = _mm_extract_epi32(ssv, 1); + sums[0][3] = _mm_extract_epi32(s12v, 0); + sums[1][3] = _mm_extract_epi32(s12v, 1); +} + +#define SATD_X(size) \ +static void pixel_satd_x3_##size##_e2k(uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \ + intptr_t stride, int scores[3]) { \ + scores[0] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix0, stride); \ + scores[1] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix1, stride); \ + scores[2] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix2, stride); \ +} \ +static void pixel_satd_x4_##size##_e2k(uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \ + uint8_t *pix3, intptr_t stride, int scores[4]) { \ + scores[0] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix0, stride); \ + scores[1] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix1, stride); \ + scores[2] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix2, stride); \ + scores[3] = pixel_satd_##size##_e2k(fenc, FENC_STRIDE, pix3, stride); \ +} + +SATD_X(16x16) +SATD_X(16x8) +SATD_X(8x16) +SATD_X(8x8) +SATD_X(8x4) +SATD_X(4x8) +SATD_X(4x4) + +#define INTRA_MBCMP_8x8(mbcmp) \ +void intra_##mbcmp##_x3_8x8_e2k(uint8_t *fenc, uint8_t edge[36], int res[3]) { \ + uint8_t __attribute__((aligned(8))) pix[8*FDEC_STRIDE]; \ + x264_predict_8x8_v_c(pix, edge); \ + res[0] = pixel_##mbcmp##_8x8_e2k(pix, FDEC_STRIDE, fenc, FENC_STRIDE); \ + x264_predict_8x8_h_c(pix, edge); \ + res[1] = pixel_##mbcmp##_8x8_e2k(pix, FDEC_STRIDE, fenc, FENC_STRIDE); \ + x264_predict_8x8_dc_c(pix, edge); \ + res[2] = pixel_##mbcmp##_8x8_e2k(pix, FDEC_STRIDE, fenc, FENC_STRIDE); \ +} + +INTRA_MBCMP_8x8(sad) +INTRA_MBCMP_8x8(sa8d) + +#define INTRA_MBCMP(mbcmp, size, pred1, pred2, pred3, chroma) \ +void intra_##mbcmp##_x3_##size##x##size##chroma##_e2k(uint8_t *fenc, uint8_t *fdec, int res[3]) { \ + x264_predict_##size##x##size##chroma##_##pred1##_c(fdec); \ + res[0] = pixel_##mbcmp##_##size##x##size##_e2k(fdec, FDEC_STRIDE, fenc, FENC_STRIDE); \ + x264_predict_##size##x##size##chroma##_##pred2##_c(fdec); \ + res[1] = pixel_##mbcmp##_##size##x##size##_e2k(fdec, FDEC_STRIDE, fenc, FENC_STRIDE); \ + x264_predict_##size##x##size##chroma##_##pred3##_c(fdec); \ + res[2] = pixel_##mbcmp##_##size##x##size##_e2k(fdec, FDEC_STRIDE, fenc, FENC_STRIDE); \ +} + +INTRA_MBCMP(satd, 4, v, h, dc, ) +INTRA_MBCMP(sad, 8, dc, h, v, c) +INTRA_MBCMP(satd, 8, dc, h, v, c) +INTRA_MBCMP(sad, 16, v, h, dc, ) +INTRA_MBCMP(satd, 16, v, h, dc, ) +#endif // !HIGH_BIT_DEPTH + +/**************************************************************************** + * x264_pixel_init: + ****************************************************************************/ +void x264_pixel_init_e2k(x264_pixel_function_t *pixf) { +#if !HIGH_BIT_DEPTH + pixf->sad[PIXEL_16x16] = pixel_sad_16x16_e2k; + pixf->sad[PIXEL_8x16] = pixel_sad_8x16_e2k; + pixf->sad[PIXEL_16x8] = pixel_sad_16x8_e2k; + 
pixf->sad[PIXEL_8x8] = pixel_sad_8x8_e2k; + + pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_e2k; + pixf->sad_x3[PIXEL_8x16] = pixel_sad_x3_8x16_e2k; + pixf->sad_x3[PIXEL_16x8] = pixel_sad_x3_16x8_e2k; + pixf->sad_x3[PIXEL_8x8] = pixel_sad_x3_8x8_e2k; + + pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_e2k; + pixf->sad_x4[PIXEL_8x16] = pixel_sad_x4_8x16_e2k; + pixf->sad_x4[PIXEL_16x8] = pixel_sad_x4_16x8_e2k; + pixf->sad_x4[PIXEL_8x8] = pixel_sad_x4_8x8_e2k; + + pixf->satd[PIXEL_16x16] = pixel_satd_16x16_e2k; + pixf->satd[PIXEL_8x16] = pixel_satd_8x16_e2k; + pixf->satd[PIXEL_16x8] = pixel_satd_16x8_e2k; + pixf->satd[PIXEL_8x8] = pixel_satd_8x8_e2k; + pixf->satd[PIXEL_8x4] = pixel_satd_8x4_e2k; + pixf->satd[PIXEL_4x8] = pixel_satd_4x8_e2k; + pixf->satd[PIXEL_4x4] = pixel_satd_4x4_e2k; + + pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_e2k; + pixf->satd_x3[PIXEL_8x16] = pixel_satd_x3_8x16_e2k; + pixf->satd_x3[PIXEL_16x8] = pixel_satd_x3_16x8_e2k; + pixf->satd_x3[PIXEL_8x8] = pixel_satd_x3_8x8_e2k; + pixf->satd_x3[PIXEL_8x4] = pixel_satd_x3_8x4_e2k; + pixf->satd_x3[PIXEL_4x8] = pixel_satd_x3_4x8_e2k; + pixf->satd_x3[PIXEL_4x4] = pixel_satd_x3_4x4_e2k; + + pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_e2k; + pixf->satd_x4[PIXEL_8x16] = pixel_satd_x4_8x16_e2k; + pixf->satd_x4[PIXEL_16x8] = pixel_satd_x4_16x8_e2k; + pixf->satd_x4[PIXEL_8x8] = pixel_satd_x4_8x8_e2k; + pixf->satd_x4[PIXEL_8x4] = pixel_satd_x4_8x4_e2k; + pixf->satd_x4[PIXEL_4x8] = pixel_satd_x4_4x8_e2k; + pixf->satd_x4[PIXEL_4x4] = pixel_satd_x4_4x4_e2k; + + pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_e2k; + pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_e2k; + pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_e2k; + + pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_e2k; + pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_e2k; + pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_e2k; + + pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_e2k; + pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8_e2k; + + pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_e2k; + pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8_e2k; + + pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_e2k; + + pixf->var[PIXEL_16x16] = pixel_var_16x16_e2k; + pixf->var[PIXEL_8x8] = pixel_var_8x8_e2k; + + pixf->hadamard_ac[PIXEL_16x16] = pixel_hadamard_ac_16x16_e2k; + pixf->hadamard_ac[PIXEL_16x8] = pixel_hadamard_ac_16x8_e2k; + pixf->hadamard_ac[PIXEL_8x16] = pixel_hadamard_ac_8x16_e2k; + pixf->hadamard_ac[PIXEL_8x8] = pixel_hadamard_ac_8x8_e2k; + + pixf->ssim_4x4x2_core = ssim_4x4x2_core_e2k; +#endif // !HIGH_BIT_DEPTH +} diff --git a/common/e2k/pixel.h b/common/e2k/pixel.h new file mode 100644 index 0000000..d7c8a97 --- /dev/null +++ b/common/e2k/pixel.h @@ -0,0 +1,30 @@ +/***************************************************************************** + * pixel.h: e2k pixel metrics + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_E2K_PIXEL_H +#define X264_E2K_PIXEL_H + +#define x264_pixel_init_e2k x264_template(pixel_init_e2k) +void x264_pixel_init_e2k(x264_pixel_function_t *pixf); + +#endif diff --git a/common/e2k/predict.c b/common/e2k/predict.c new file mode 100644 index 0000000..0381e26 --- /dev/null +++ b/common/e2k/predict.c @@ -0,0 +1,187 @@ +/***************************************************************************** + * predict.c: e2k intra prediction + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * Copyright (C) 2007-2017 x264 project + * + * Authors: Guillaume Poirier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "e2kcommon.h" +#include "predict.h" +#include "pixel.h" + +#if !HIGH_BIT_DEPTH +static void predict_8x8c_p_e2k(uint8_t *src) { + int i, H = 0, V = 0; + + for (i = 0; i < 4; i++) { + H += (i + 1) * (src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE]); + V += (i + 1) * (src[(i + 4) * FDEC_STRIDE - 1] - src[(2 - i) * FDEC_STRIDE - 1]); + } + + int a = 16 * (src[7 * FDEC_STRIDE - 1] + src[7 - FDEC_STRIDE]); + int b = (17 * H + 16) >> 5; + int c = (17 * V + 16) >> 5; + int i00 = a - b * 3 - c * 3 + 16; + + vec_s16_t i00_v, b_v, c_v; + i00_v = _mm_set1_epi16(i00); + b_v = _mm_set1_epi16(b); + c_v = _mm_set1_epi16(c); + + vec_s16_t induc_v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + vec_s16_t add0 = _mm_add_epi16(_mm_mullo_epi16(induc_v, b_v), i00_v); + + for (i = 0; i < 8; ++i) { + vec_s16_t shiftv = _mm_srai_epi16(add0, 5); + VEC_STL(src, _mm_packus_epi16(shiftv, shiftv)); + src += FDEC_STRIDE; + add0 = _mm_adds_epi16(add0, c_v); + } +} + + +/**************************************************************************** + * 16x16 prediction for intra luma block + ****************************************************************************/ + +static void predict_16x16_p_e2k(uint8_t *src) { + int i, H = 0, V = 0; + + for (i = 1; i <= 8; i++) { + H += i * (src[7 + i - FDEC_STRIDE] - src[7 - i - FDEC_STRIDE]); + V += i * (src[(7 + i) * FDEC_STRIDE - 1] - src[(7 - i) * FDEC_STRIDE - 1]); + } + + int a = 16 * (src[15 * FDEC_STRIDE - 1] + src[15 - FDEC_STRIDE]); + int b = (5 * H + 32) >> 6; + int c = (5 * V + 32) >> 6; + int i00 = a - b * 7 - c * 7 + 16; + + vec_s16_t i00_v, b_v, c_v; + i00_v = _mm_set1_epi16(i00); + b_v = _mm_set1_epi16(b); + c_v = _mm_set1_epi16(c); + + vec_s16_t induc_v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + vec_s16_t b8_v = _mm_slli_epi16(b_v, 3); + vec_s16_t add0 = _mm_add_epi16(_mm_mullo_epi16(induc_v, b_v), i00_v); + vec_s16_t add1 = _mm_adds_epi16(b8_v, add0); + + for (i = 0; i < 16; i++) { + vec_u8_t outv = _mm_packus_epi16(_mm_srai_epi16(add0, 5), + _mm_srai_epi16(add1, 5)); + VEC_ST(src, outv); + src += FDEC_STRIDE; + add0 = _mm_adds_epi16(add0, c_v); + add1 = _mm_adds_epi16(add1, c_v); + } +} + +#define PREDICT_16x16_DC_e2k(v) \ +for (i = 0; i < 16; i += 2) { \ + VEC_ST(src, v); \ + VEC_ST(src + FDEC_STRIDE, v); \ + src += FDEC_STRIDE*2; \ +} + +static void predict_16x16_dc_e2k(uint8_t *src) { + int i; uint32_t dc = 0; + + for (i = 0; i < 16; i++) { + dc += src[i * FDEC_STRIDE - 1]; + dc += src[i - FDEC_STRIDE]; + } + vec_u8_t bc_v = _mm_set1_epi8((dc + 16) >> 5); + PREDICT_16x16_DC_e2k(bc_v); +} + +static void predict_16x16_dc_left_e2k(uint8_t *src) { + int i; uint32_t dc = 0; + + for (i = 0; i < 16; i++) + dc += src[i * FDEC_STRIDE - 1]; + + vec_u8_t bc_v = _mm_set1_epi8((dc + 8) >> 4); + PREDICT_16x16_DC_e2k(bc_v); +} + +static void predict_16x16_dc_top_e2k(uint8_t *src) { + int i; uint32_t dc = 0; + + for (i = 0; i < 16; i++) + dc += src[i - FDEC_STRIDE]; + + vec_u8_t bc_v = _mm_set1_epi8((dc + 8) >> 4); + PREDICT_16x16_DC_e2k(bc_v); +} + +static void predict_16x16_dc_128_e2k(uint8_t *src) { + int i; + vec_u8_t bc_v = _mm_set1_epi8(128); + PREDICT_16x16_DC_e2k(bc_v); +} + +#define PREDICT_16_H_LD(i) \ + vec_u8_t v##i = _mm_set1_epi8(src[FDEC_STRIDE * i - 1]); +#define PREDICT_16_H_ST(i) VEC_ST(src + FDEC_STRIDE * i, v##i); + +#define PREDICT_REP16(M) M(0) M(1) M(2) M(3) M(4) M(5) M(6) M(7) \ + M(8) M(9) M(10) M(11) M(12) M(13) M(14) M(15) + 
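[Editor's aside, not part of the patch: the PREDICT_16_H_LD/ST and PREDICT_REP16 macros above simply unroll the horizontal predictor defined just below — each of the 16 rows is filled with the pixel immediately to its left. A plain scalar equivalent, assuming only x264's FDEC_STRIDE constant from common.h, would look like the sketch below; the name predict_16x16_h_ref is hypothetical.]

    #include <stdint.h>

    /* Illustrative scalar form of the horizontal 16x16 predictor (not in the patch).
     * FDEC_STRIDE is x264's decoded-block stride; src points at the block origin. */
    static void predict_16x16_h_ref(uint8_t *src)
    {
        for (int y = 0; y < 16; y++) {
            const uint8_t left = src[y * FDEC_STRIDE - 1]; /* neighbour left of row y */
            for (int x = 0; x < 16; x++)
                src[y * FDEC_STRIDE + x] = left;
        }
    }

[The vectorized version performs all 16 splat loads before the 16 row stores, which is purely a scheduling choice; the stores never touch the left-neighbour column, so the result is the same.]
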
+static void predict_16x16_h_e2k(uint8_t *src) { + PREDICT_REP16(PREDICT_16_H_LD) + PREDICT_REP16(PREDICT_16_H_ST) +} + +static void predict_16x16_v_e2k(uint8_t *src) { + int i; + vec_u32_t v = VEC_LD(src - FDEC_STRIDE); + + for (i = 0; i < 16; i++) { + VEC_ST(src, v); + src += FDEC_STRIDE; + } +} +#endif // !HIGH_BIT_DEPTH + + +/**************************************************************************** + * Exported functions: + ****************************************************************************/ +void x264_predict_16x16_init_e2k(x264_predict_t pf[7]) { +#if !HIGH_BIT_DEPTH + pf[I_PRED_16x16_V ] = predict_16x16_v_e2k; + pf[I_PRED_16x16_H ] = predict_16x16_h_e2k; + pf[I_PRED_16x16_DC] = predict_16x16_dc_e2k; + pf[I_PRED_16x16_P ] = predict_16x16_p_e2k; + pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_e2k; + pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_e2k; + pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_e2k; +#endif // !HIGH_BIT_DEPTH +} + +void x264_predict_8x8c_init_e2k(x264_predict_t pf[7]) { +#if !HIGH_BIT_DEPTH + pf[I_PRED_CHROMA_P] = predict_8x8c_p_e2k; +#endif // !HIGH_BIT_DEPTH +} diff --git a/common/e2k/predict.h b/common/e2k/predict.h new file mode 100644 index 0000000..7b7f627 --- /dev/null +++ b/common/e2k/predict.h @@ -0,0 +1,32 @@ +/***************************************************************************** + * predict.h: e2k intra prediction + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_E2K_PREDICT_H +#define X264_E2K_PREDICT_H + +#define x264_predict_16x16_init_e2k x264_template(predict_16x16_init_e2k) +void x264_predict_16x16_init_e2k(x264_predict_t pf[7]); +#define x264_predict_8x8c_init_e2k x264_template(predict_8x8c_init_e2k) +void x264_predict_8x8c_init_e2k(x264_predict_t pf[7]); + +#endif diff --git a/common/e2k/quant.c b/common/e2k/quant.c new file mode 100644 index 0000000..76a778d --- /dev/null +++ b/common/e2k/quant.c @@ -0,0 +1,436 @@ +/***************************************************************************** + * quant.c: e2k quantization + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * Copyright (C) 2007-2017 x264 project + * + * Authors: Guillaume Poirier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "e2kcommon.h" +#include "quant.h" + +#if !HIGH_BIT_DEPTH +// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled" + +#define LOAD_MF_BIAS(i) \ + mfvA = VEC_LD(mf + i); \ + mfvB = VEC_LD(mf + i + 8); \ + biasvA = VEC_LD(bias + i); \ + biasvB = VEC_LD(bias + i + 8); \ + +#define QUANT_16_U(dct, b0, b1, mf0, mf1) { \ + temp1v = VEC_LD(dct); \ + temp2v = VEC_LD(dct + 8); \ + coefvA = _mm_abs_epi16(temp1v); \ + coefvB = _mm_abs_epi16(temp2v); \ + coefvA = _mm_add_epi16(coefvA, b0); \ + coefvB = _mm_add_epi16(coefvB, b1); \ + coefvA = _mm_mulhi_epu16(coefvA, mf0); \ + coefvB = _mm_mulhi_epu16(coefvB, mf1); \ + temp1v = _mm_sign_epi16(coefvA, temp1v); \ + temp2v = _mm_sign_epi16(coefvB, temp2v); \ + VEC_ST(dct, temp1v); \ + VEC_ST(dct + 8, temp2v); \ +} + +int x264_quant_2x2_dc_e2k(int16_t dct[4], int mf, int bias) { + __m64 v0, v1, mfv = _mm_set1_pi16(mf), biasv = _mm_set1_pi16(bias); + + v0 = *(__m64*)dct; + v1 = _mm_abs_pi16(v0); + v1 = _mm_add_pi16(v1, biasv); + v1 = _mm_mulhi_pu16(v1, mfv); + v1 = _mm_sign_pi16(v1, v0); + *(__m64*)dct = v1; + return _mm_cvtm64_si64(v1) != 0; +} + +int x264_quant_4x4_e2k(int16_t dct[16], uint16_t mf[16], uint16_t bias[16]) { + vec_u16_t coefvA, mfvA, biasvA, coefvB, mfvB, biasvB; + vec_s16_t nz, temp1v, temp2v; + + LOAD_MF_BIAS(0); + QUANT_16_U(dct, biasvA, biasvB, mfvA, mfvB); + nz = _mm_or_si128(temp1v, temp2v); + return _mm_cvtsi128_si64(_mm_packs_epi16(nz, nz)) != 0; +} + +int x264_quant_4x4_dc_e2k(int16_t dct[16], int mf, int bias) { + vec_u16_t coefvA, coefvB; + vec_s16_t nz, temp1v, temp2v; + vec_u16_t mfv, biasv; + + mfv = _mm_set1_epi16(mf); + biasv = _mm_set1_epi16(bias); + + QUANT_16_U(dct, biasv, biasv, mfv, mfv); + nz = _mm_or_si128(temp1v, temp2v); + return _mm_cvtsi128_si64(_mm_packs_epi16(nz, nz)) != 0; +} + +int x264_quant_4x4x4_e2k(int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16]) { + vec_u16_t coefvA, mfvA, biasvA, coefvB, mfvB, biasvB; + vec_s16_t nz0, nz1, nz2, nz3, temp1v, temp2v; + + LOAD_MF_BIAS(0); + QUANT_16_U(dct[0], biasvA, biasvB, mfvA, mfvB); + nz0 = _mm_or_si128(temp1v, temp2v); + QUANT_16_U(dct[1], biasvA, biasvB, mfvA, mfvB); + nz1 = _mm_or_si128(temp1v, temp2v); + QUANT_16_U(dct[2], biasvA, biasvB, mfvA, mfvB); + nz2 = _mm_or_si128(temp1v, temp2v); + QUANT_16_U(dct[3], biasvA, biasvB, mfvA, mfvB); + nz3 = _mm_or_si128(temp1v, temp2v); + + return !!_mm_cvtsi128_si64(_mm_packs_epi16(nz0, nz0)) * 1 | + !!_mm_cvtsi128_si64(_mm_packs_epi16(nz1, nz1)) * 2 | + !!_mm_cvtsi128_si64(_mm_packs_epi16(nz2, nz2)) * 4 | + !!_mm_cvtsi128_si64(_mm_packs_epi16(nz3, nz3)) * 8; +} + +int x264_quant_8x8_e2k(int16_t dct[64], uint16_t mf[64], uint16_t bias[64]) { + LOAD_ZERO; + vec_u16_t coefvA, mfvA, biasvA, coefvB, mfvB, biasvB; + vec_s16_t nz = zerov, temp1v, temp2v; + int i; + + PRAGMA_E2K("ivdep") + for (i = 0; i < 
64; i += 16) { + LOAD_MF_BIAS(i); + QUANT_16_U(dct + i, biasvA, biasvB, mfvA, mfvB); + nz = _mm_or_si128(nz, _mm_or_si128(temp1v, temp2v)); + } + return _mm_cvtsi128_si64(_mm_packs_epi16(nz, nz)) != 0; +} + +#define DEQUANT_SHL() { \ + dctv = VEC_LD(dct + y); \ + mf1v = VEC_LD(dequant_mf[i_mf] + y); \ + mf2v = VEC_LD(dequant_mf[i_mf] + y + 4); \ + mfv = _mm_packs_epi32(mf1v, mf2v); \ + dctv = _mm_mullo_epi16(dctv, mfv); \ + dctv = _mm_slli_epi16(dctv, i_qbits); \ + VEC_ST(dct + y, dctv); \ +} + +#define DEQUANT_SHR() { \ + dctv = VEC_LD(dct + y); \ + mf1v = VEC_LD(dequant_mf[i_mf] + y); \ + mf2v = VEC_LD(dequant_mf[i_mf] + y + 4); \ + mfv = _mm_packs_epi32(mf1v, mf2v); \ + mult1v = _mm_mulhi_epi16(dctv, mfv); \ + mult0v = _mm_mullo_epi16(dctv, mfv); \ + temp1v = _mm_unpacklo_epi16(mult0v, mult1v); \ + temp2v = _mm_unpackhi_epi16(mult0v, mult1v); \ + temp1v = _mm_add_epi32(temp1v, fv); \ + temp2v = _mm_add_epi32(temp2v, fv); \ + temp1v = _mm_srai_epi32(temp1v, -i_qbits); \ + temp2v = _mm_srai_epi32(temp2v, -i_qbits); \ + dctv = _mm_packs_epi32(temp1v, temp2v); \ + VEC_ST(dct + y, dctv); \ +} + +void x264_dequant_4x4_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp) { + int y, i_mf = i_qp % 6, i_qbits = i_qp / 6 - 4; + vec_s16_t dctv, mfv, mult0v, mult1v; + vec_s32_t mf1v, mf2v, temp1v, temp2v; + + if (i_qbits >= 0) { + PRAGMA_E2K("ivdep") + for (y = 0; y < 16; y += 8) DEQUANT_SHL(); + } else { + vec_s32_t fv = _mm_set1_epi32(1 << (-i_qbits-1)); + PRAGMA_E2K("ivdep") + for (y = 0; y < 16; y += 8) DEQUANT_SHR(); + } +} + +void x264_dequant_4x4_dc_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp) { + int y, i_mf = i_qp % 6, i_qbits = i_qp / 6 - 6; + vec_s16_t dctv, mfv, mult0v, mult1v; + vec_s32_t temp1v, temp2v; + + if (i_qbits >= 0) { + mfv = _mm_set1_epi16(dequant_mf[i_mf][0] << i_qbits); + PRAGMA_E2K("ivdep") + for (y = 0; y < 16; y += 8) { + dctv = VEC_LD(dct + y); + dctv = _mm_mullo_epi16(dctv, mfv); + VEC_ST(dct + y, dctv); + } + } else { + vec_s32_t fv = _mm_set1_epi32(1 << (-i_qbits-1)); + mfv = _mm_set1_epi32(dequant_mf[i_mf][0]); + mfv = _mm_packs_epi32(mfv, mfv); + PRAGMA_E2K("ivdep") + for (y = 0; y < 16; y += 8) { + dctv = VEC_LD(dct + y); + mult1v = _mm_mulhi_epi16(dctv, mfv); + mult0v = _mm_mullo_epi16(dctv, mfv); + temp1v = _mm_unpacklo_epi16(mult0v, mult1v); + temp2v = _mm_unpackhi_epi16(mult0v, mult1v); + temp1v = _mm_add_epi32(temp1v, fv); + temp2v = _mm_add_epi32(temp2v, fv); + temp1v = _mm_srai_epi32(temp1v, -i_qbits); + temp2v = _mm_srai_epi32(temp2v, -i_qbits); + dctv = _mm_packs_epi32(temp1v, temp2v); + VEC_ST(dct + y, dctv); + } + } +} + +void x264_dequant_8x8_e2k(int16_t dct[64], int dequant_mf[6][64], int i_qp) { + int y, i_mf = i_qp % 6, i_qbits = i_qp / 6 - 6; + vec_s16_t dctv, mfv, mult0v, mult1v; + vec_s32_t mf1v, mf2v, temp1v, temp2v; + + if (i_qbits >= 0) { + PRAGMA_E2K("ivdep") + for (y = 0; y < 64; y += 8) DEQUANT_SHL(); + } else { + vec_s32_t fv = _mm_set1_epi32(1 << (-i_qbits-1)); + PRAGMA_E2K("ivdep") + for (y = 0; y < 64; y += 8) DEQUANT_SHR(); + } +} + +void x264_denoise_dct_e2k(int16_t *dct, uint32_t *sum, uint16_t *offset, int size) { + /* size = 16, 64 */ + int i; + LOAD_ZERO; + + PRAGMA_E2K("ivdep") + for (i = 0; i < size; i += 8) { + __m128i v0, v1, v2, v3, v4; + v0 = VEC_LD(dct + i); + v2 = VEC_LD(sum + i); + v3 = VEC_LD(sum + i + 4); + v4 = VEC_LD(offset + i); + + v1 = _mm_abs_epi16(v0); + v2 = _mm_add_epi32(v2, _mm_unpacklo_epi16(v1, zerov)); + v3 = _mm_add_epi32(v3, _mm_unpackhi_epi16(v1, zerov)); + v1 = 
_mm_sign_epi16(_mm_subs_epu16(v1, v4), v0); + + VEC_ST(sum + i, v2); + VEC_ST(sum + i + 4, v3); + VEC_ST(dct + i, v1); + } +} + +static const uint8_t decimate_mask_table4[256] = { + 0, 3, 2, 6, 2, 5, 5, 9, 1, 5, 4, 8, 5, 8, 8, 12, + 1, 4, 4, 8, 4, 7, 7, 11, 4, 8, 7, 11, 8, 11, 11, 15, + 1, 4, 3, 7, 4, 7, 7, 11, 3, 7, 6, 10, 7, 10, 10, 14, + 4, 7, 7, 11, 7, 10, 10, 14, 7, 11, 10, 14, 11, 14, 14, 18, + 0, 4, 3, 7, 3, 6, 6, 10, 3, 7, 6, 10, 7, 10, 10, 14, + 3, 6, 6, 10, 6, 9, 9, 13, 6, 10, 9, 13, 10, 13, 13, 17, + 4, 7, 6, 10, 7, 10, 10, 14, 6, 10, 9, 13, 10, 13, 13, 17, + 7, 10, 10, 14, 10, 13, 13, 17, 10, 14, 13, 17, 14, 17, 17, 21, + 0, 3, 3, 7, 3, 6, 6, 10, 2, 6, 5, 9, 6, 9, 9, 13, + 3, 6, 6, 10, 6, 9, 9, 13, 6, 10, 9, 13, 10, 13, 13, 17, + 3, 6, 5, 9, 6, 9, 9, 13, 5, 9, 8, 12, 9, 12, 12, 16, + 6, 9, 9, 13, 9, 12, 12, 16, 9, 13, 12, 16, 13, 16, 16, 20, + 3, 7, 6, 10, 6, 9, 9, 13, 6, 10, 9, 13, 10, 13, 13, 17, + 6, 9, 9, 13, 9, 12, 12, 16, 9, 13, 12, 16, 13, 16, 16, 20, + 7, 10, 9, 13, 10, 13, 13, 17, 9, 13, 12, 16, 13, 16, 16, 20, + 10, 13, 13, 17, 13, 16, 16, 20, 13, 17, 16, 20, 17, 20, 20, 24 +}; + +#define DECIMATE_MASK_E2K(i, nz, a) \ + v0 = VEC_LD(dct + i); \ + v1 = VEC_LD(dct + i + 8); \ + v0 = _mm_abs_epi16(v0); \ + v1 = _mm_abs_epi16(v1); \ + v0 = _mm_packs_epi16(v0, v1); \ + v2 = _mm_cmpeq_epi8(c0, v0); \ + v0 = _mm_cmpgt_epi8(v0, c1); \ + nz = _mm_movemask_epi8(v2); \ + a = _mm_movemask_epi8(v0); + +static int ALWAYS_INLINE decimate_score_internal(int16_t *dct, int i_max) { + __m128i v0, v1, v2, c0 = _mm_setzero_si128(), c1 = _mm_set1_epi8(1); + int c, a, nz; + + DECIMATE_MASK_E2K(0, nz, a) + nz ^= 0xffff; + if (!nz) return 0; + if (a) return 9; + if (i_max == 15) nz >>= 1; + c = nz & 255; + a = decimate_mask_table4[c]; + if (nz < 256) return a; + nz >>= 32 - __builtin_clz(c | 1); + c = __builtin_ctz(nz); + nz >>= c + 1; + a += x264_decimate_table4[c]; + a += decimate_mask_table4[nz]; + return a; +} + +int x264_decimate_score15_e2k(dctcoef *dct) { + return decimate_score_internal(dct, 15); +} + +int x264_decimate_score16_e2k(dctcoef *dct) { + return decimate_score_internal(dct, 16); +} + +int x264_decimate_score64_e2k(dctcoef *dct) { + __m128i v0, v1, v2, c0 = _mm_setzero_si128(), c1 = _mm_set1_epi8(1); + int c, a, a1; + uint64_t nz, nz1; + + DECIMATE_MASK_E2K(0, nz, a) + if (a) return 9; + DECIMATE_MASK_E2K(16, nz1, a) nz |= nz1 << 16; + DECIMATE_MASK_E2K(32, nz1, a1) nz |= nz1 << 32; a |= a1; + DECIMATE_MASK_E2K(48, nz1, a1) nz |= nz1 << 48; a |= a1; + if (a) return 9; + nz = ~nz; + for (a = 0; nz; nz >>= c, nz >>= 1) { + c = __builtin_ctzll(nz); + a += x264_decimate_table8[c]; + if (a > 5) return 9; + } + return a; +} + +int x264_coeff_last4_e2k(dctcoef *dct) { + uint64_t a = *(uint64_t*)dct; + return a ? 3 - (__builtin_clzll(a) >> 4) : -1; +} + +int x264_coeff_last8_e2k(dctcoef *dct) { + __m64 v0 = *(__m64*)dct, v1 = *(__m64*)(dct + 4); + uint64_t a = _mm_cvtm64_si64(_mm_packs_pi16(v0, v1)); + return a ? 7 - (__builtin_clzll(a) >> 3) : -1; +} + +#define COEFF_LAST_E2K(i, nz) \ + v0 = VEC_LD(dct + i); \ + v1 = VEC_LD(dct + i + 8); \ + v0 = _mm_packs_epi16(v0, v1); \ + v0 = _mm_cmpeq_epi8(v0, c0); \ + nz = _mm_movemask_epi8(v0); + +int x264_coeff_last15_e2k(dctcoef *dct) { + __m128i v0, v1, c0 = _mm_setzero_si128(); + uint32_t nz; + COEFF_LAST_E2K(-1, nz) + nz ^= 0xffff; + return 30 - __builtin_clz(nz | 1); +} + +int x264_coeff_last16_e2k(dctcoef *dct) { + __m128i v0, v1, c0 = _mm_setzero_si128(); + uint32_t nz; + COEFF_LAST_E2K(0, nz) + nz ^= 0xffff; + return nz ? 
31 - __builtin_clz(nz) : -1; +} + +int x264_coeff_last64_e2k(dctcoef *dct) { + __m128i v0, v1, c0 = _mm_setzero_si128(); + uint32_t nz; + COEFF_LAST_E2K(48, nz) nz ^= 0xffff; + if (nz) return 79 - __builtin_clz(nz); + COEFF_LAST_E2K(32, nz) nz ^= 0xffff; + if (nz) return 63 - __builtin_clz(nz); + COEFF_LAST_E2K(16, nz) nz ^= 0xffff; + if (nz) return 47 - __builtin_clz(nz); + COEFF_LAST_E2K(0, nz) nz ^= 0xffff; + return nz ? 31 - __builtin_clz(nz) : -1; +} + +int x264_coeff_level_run4_e2k(dctcoef *dct, x264_run_level_t *runlevel) { + __m64 v0 = *(__m64*)dct; + uint32_t nz; int total = 0, n, last; + v0 = _mm_packs_pi16(v0, v0); + v0 = _mm_cmpeq_pi8(v0, _mm_setzero_si64()); + nz = ~_mm_movemask_pi8(v0) & 0xf; + runlevel->mask = nz; + nz = nz << 2 | 2; + n = __builtin_clz(nz); + nz <<= 1 + n; + runlevel->last = last = 29 - n; + do { + nz <<= n = __builtin_clz(nz) + 1; + runlevel->level[total++] = dct[last]; + } while ((last -= n) >= 0); + return total; +} + +int x264_coeff_level_run8_e2k(dctcoef *dct, x264_run_level_t *runlevel) { + __m64 v0 = *(__m64*)dct, v1 = *(__m64*)(dct + 4); + uint32_t nz; int total = 0, n, last; + v0 = _mm_packs_pi16(v0, v1); + v0 = _mm_cmpeq_pi8(v0, _mm_setzero_si64()); + nz = _mm_movemask_pi8(v0) ^ 0xff; + runlevel->mask = nz; + nz = nz << 2 | 2; + n = __builtin_clz(nz); + nz <<= 1 + n; + runlevel->last = last = 29 - n; + do { + nz <<= n = __builtin_clz(nz) + 1; + runlevel->level[total++] = dct[last]; + } while ((last -= n) >= 0); + return total; +} + +int x264_coeff_level_run15_e2k(dctcoef *dct, x264_run_level_t *runlevel) { + __m128i v0, v1, c0 = _mm_setzero_si128(); + uint32_t nz; int total = 0, n, last; + COEFF_LAST_E2K(-1, nz) + nz ^= 0xffff; + runlevel->mask = nz >> 1; + nz = nz << 2 | 4; + n = __builtin_clz(nz); + nz <<= 1 + n; + runlevel->last = last = 28 - n; + do { + nz <<= n = __builtin_clz(nz) + 1; + runlevel->level[total++] = dct[last]; + } while ((last -= n) >= 0); + return total; +} + +int x264_coeff_level_run16_e2k(dctcoef *dct, x264_run_level_t *runlevel) { + __m128i v0, v1, c0 = _mm_setzero_si128(); + uint32_t nz; int total = 0, n, last; + COEFF_LAST_E2K(0, nz) + nz ^= 0xffff; + runlevel->mask = nz; + nz = nz << 2 | 2; + n = __builtin_clz(nz); + nz <<= 1 + n; + runlevel->last = last = 29 - n; + do { + nz <<= n = __builtin_clz(nz) + 1; + runlevel->level[total++] = dct[last]; + } while ((last -= n) >= 0); + return total; +} + +#endif // !HIGH_BIT_DEPTH + diff --git a/common/e2k/quant.h b/common/e2k/quant.h new file mode 100644 index 0000000..1050f9d --- /dev/null +++ b/common/e2k/quant.h @@ -0,0 +1,73 @@ +/***************************************************************************** + * quant.h: e2k quantization + ***************************************************************************** + * Copyright (C) 2021, Ilya Kurdyukov for BaseALT, Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_E2K_QUANT_H +#define X264_E2K_QUANT_H + +#define x264_quant_2x2_dc_e2k x264_template(quant_2x2_dc_e2k) +int x264_quant_2x2_dc_e2k(int16_t dct[4], int mf, int bias); +#define x264_quant_4x4_e2k x264_template(quant_4x4_e2k) +int x264_quant_4x4_e2k(int16_t dct[16], uint16_t mf[16], uint16_t bias[16]); +#define x264_quant_4x4_dc_e2k x264_template(quant_4x4_dc_e2k) +int x264_quant_4x4_dc_e2k(int16_t dct[16], int mf, int bias); +#define x264_quant_4x4x4_e2k x264_template(quant_4x4x4_e2k) +int x264_quant_4x4x4_e2k(int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16]); +#define x264_quant_8x8_e2k x264_template(quant_8x8_e2k) +int x264_quant_8x8_e2k(int16_t dct[64], uint16_t mf[64], uint16_t bias[64]); + +#define x264_dequant_4x4_e2k x264_template(dequant_4x4_e2k) +void x264_dequant_4x4_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp); +#define x264_dequant_4x4_dc_e2k x264_template(dequant_4x4_dc_e2k) +void x264_dequant_4x4_dc_e2k(int16_t dct[16], int dequant_mf[6][16], int i_qp); +#define x264_dequant_8x8_e2k x264_template(dequant_8x8_e2k) +void x264_dequant_8x8_e2k(int16_t dct[64], int dequant_mf[6][64], int i_qp); + +#define x264_denoise_dct_e2k x264_template(denoise_dct_e2k) +void x264_denoise_dct_e2k(int16_t *dct, uint32_t *sum, uint16_t *offset, int size); +#define x264_decimate_score15_e2k x264_template(decimate_score15_e2k) +int x264_decimate_score15_e2k(dctcoef *dct); +#define x264_decimate_score16_e2k x264_template(decimate_score16_e2k) +int x264_decimate_score16_e2k(dctcoef *dct); +#define x264_decimate_score64_e2k x264_template(decimate_score64_e2k) +int x264_decimate_score64_e2k(dctcoef *dct); + +#define x264_coeff_last4_e2k x264_template(coeff_last4_e2k) +int x264_coeff_last4_e2k(dctcoef *dct); +#define x264_coeff_last8_e2k x264_template(coeff_last8_e2k) +int x264_coeff_last8_e2k(dctcoef *dct); +#define x264_coeff_last15_e2k x264_template(coeff_last15_e2k) +int x264_coeff_last15_e2k(dctcoef *dct); +#define x264_coeff_last16_e2k x264_template(coeff_last16_e2k) +int x264_coeff_last16_e2k(dctcoef *dct); +#define x264_coeff_last64_e2k x264_template(coeff_last64_e2k) +int x264_coeff_last64_e2k(dctcoef *dct); +#define x264_coeff_level_run4_e2k x264_template(coeff_level_run4_e2k) +int x264_coeff_level_run4_e2k(dctcoef *dct, x264_run_level_t *runlevel); +#define x264_coeff_level_run8_e2k x264_template(coeff_level_run8_e2k) +int x264_coeff_level_run8_e2k(dctcoef *dct, x264_run_level_t *runlevel); +#define x264_coeff_level_run15_e2k x264_template(coeff_level_run15_e2k) +int x264_coeff_level_run15_e2k(dctcoef *dct, x264_run_level_t *runlevel); +#define x264_coeff_level_run16_e2k x264_template(coeff_level_run16_e2k) +int x264_coeff_level_run16_e2k(dctcoef *dct, x264_run_level_t *runlevel); + +#endif diff --git a/common/mc.c b/common/mc.c index d5e6d55..eb6b7be 100644 --- a/common/mc.c +++ b/common/mc.c @@ -32,6 +32,9 @@ #if HAVE_ALTIVEC #include "ppc/mc.h" #endif +#if ARCH_E2K +#include "e2k/mc.h" +#endif #if HAVE_ARMV6 #include "arm/mc.h" #endif @@ -677,6 +680,10 @@ void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent ) 
if( cpu&X264_CPU_ALTIVEC ) x264_mc_init_altivec( pf ); #endif +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + x264_mc_init_e2k( pf ); +#endif #if HAVE_ARMV6 x264_mc_init_arm( cpu, pf ); #endif diff --git a/common/pixel.c b/common/pixel.c index 113df30..d4757e9 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -34,6 +34,9 @@ #if HAVE_ALTIVEC # include "ppc/pixel.h" #endif +#if ARCH_E2K +# include "e2k/pixel.h" +#endif #if HAVE_ARMV6 # include "arm/pixel.h" # include "arm/predict.h" @@ -1506,6 +1509,12 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ) x264_pixel_init_altivec( pixf ); } #endif +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + { + x264_pixel_init_e2k( pixf ); + } +#endif pixf->ads[PIXEL_8x16] = pixf->ads[PIXEL_8x4] = diff --git a/common/predict.c b/common/predict.c index 5a48198..f7e1f2f 100644 --- a/common/predict.c +++ b/common/predict.c @@ -37,6 +37,9 @@ #if HAVE_ALTIVEC # include "ppc/predict.h" #endif +#if ARCH_E2K +# include "e2k/predict.h" +#endif #if HAVE_ARMV6 # include "arm/predict.h" #endif @@ -902,6 +905,11 @@ void x264_predict_16x16_init( uint32_t cpu, x264_predict_t pf[7] ) x264_predict_16x16_init_altivec( pf ); #endif +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + x264_predict_16x16_init_e2k( pf ); +#endif + #if HAVE_ARMV6 x264_predict_16x16_init_arm( cpu, pf ); #endif diff --git a/common/quant.c b/common/quant.c index 6b71225..b5cefa0 100644 --- a/common/quant.c +++ b/common/quant.c @@ -34,6 +34,9 @@ #if HAVE_ALTIVEC # include "ppc/quant.h" #endif +#if ARCH_E2K +# include "e2k/quant.h" +#endif #if HAVE_ARMV6 # include "arm/quant.h" #endif @@ -745,6 +748,36 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf ) } #endif +#if HAVE_E2K + if( cpu&X264_CPU_E2K ) + { + pf->quant_2x2_dc = x264_quant_2x2_dc_e2k; + pf->quant_4x4 = x264_quant_4x4_e2k; + pf->quant_4x4_dc = x264_quant_4x4_dc_e2k; + pf->quant_4x4x4 = x264_quant_4x4x4_e2k; + pf->quant_8x8 = x264_quant_8x8_e2k; + + pf->dequant_4x4 = x264_dequant_4x4_e2k; + pf->dequant_4x4_dc = x264_dequant_4x4_dc_e2k; + pf->dequant_8x8 = x264_dequant_8x8_e2k; + + pf->denoise_dct = x264_denoise_dct_e2k; + pf->decimate_score15 = x264_decimate_score15_e2k; + pf->decimate_score16 = x264_decimate_score16_e2k; + pf->decimate_score64 = x264_decimate_score64_e2k; + + pf->coeff_last4 = x264_coeff_last4_e2k; + pf->coeff_last8 = x264_coeff_last8_e2k; + pf->coeff_last[DCT_LUMA_AC ] = x264_coeff_last15_e2k; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_e2k; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_e2k; + pf->coeff_level_run4 = x264_coeff_level_run4_e2k; + pf->coeff_level_run8 = x264_coeff_level_run8_e2k; + pf->coeff_level_run[DCT_LUMA_AC ] = x264_coeff_level_run15_e2k; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_e2k; + } +#endif + #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) { diff --git a/configure b/configure index e242e73..22e7583 100755 --- a/configure +++ b/configure @@ -409,7 +409,7 @@ NL=" " # list of all preprocessor HAVE values we can define -CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON AARCH64 BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ +CONFIG_HAVE="MALLOC_H E2K ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON AARCH64 BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ MSA MMAP WINRT VSX ARM_INLINE_ASM STRTOK_R CLOCK_GETTIME BITDEPTH8 BITDEPTH10" @@ -814,6 +814,15 @@ case $host_cpu in fi fi ;; + e2k) + ARCH="E2K" + if [ $asm = auto ] ; 
then
+            define HAVE_E2K
+            CFLAGS="$CFLAGS -msse4.1"
+            AS="${AS-${CC}}"
+            AS_EXT=".c"
+        fi
+        ;;
     sparc)
         ARCH="SPARC"
         ;;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 829f3e2..f857e43 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -100,6 +100,10 @@ static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;
 
 #define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )
 
+#if ARCH_E2K
+#include <x86intrin.h>
+#endif
+
 static inline uint32_t read_time(void)
 {
     uint32_t a = 0;
@@ -109,6 +113,9 @@ static inline uint32_t read_time(void)
                   : "=a"(a) :: "edx", "memory" );
 #elif ARCH_PPC
     asm volatile( "mftb %0" : "=r"(a) :: "memory" );
+#elif ARCH_E2K
+    unsigned aux;
+    a = __rdtscp(&aux);
 #elif HAVE_ARM_INLINE_ASM // ARMv7 only
     asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
 #elif ARCH_AARCH64
@@ -207,6 +214,8 @@ static void print_bench(void)
                     b->cpu&X264_CPU_MMX ? "mmx" :
 #elif ARCH_PPC
                     b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+#elif ARCH_E2K
+                    b->cpu&X264_CPU_E2K ? "e2k" :
 #elif ARCH_ARM
                     b->cpu&X264_CPU_NEON ? "neon" :
                     b->cpu&X264_CPU_ARMV6 ? "armv6" :
@@ -2959,6 +2968,12 @@ static int check_all_flags( void )
         fprintf( stderr, "x264: ALTIVEC against C\n" );
         ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
     }
+#elif ARCH_E2K
+    if( cpu_detect & X264_CPU_E2K )
+    {
+        fprintf( stderr, "x264: E2K against C\n" );
+        ret = check_all_funcs( 0, X264_CPU_E2K );
+    }
 #elif ARCH_ARM
     if( cpu_detect & X264_CPU_NEON )
         x264_checkasm_call = x264_checkasm_call_neon;
@@ -2989,7 +3004,7 @@ REALIGN_STACK int main( int argc, char **argv )
 
     if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
     {
-#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS
+#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_E2K && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS
         fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
         return 1;
 #endif
diff --git a/x264.h b/x264.h
index 5aa2a8d..ce501ef 100644
--- a/x264.h
+++ b/x264.h
@@ -172,6 +172,9 @@ typedef struct x264_nal_t
 /* PowerPC */
 #define X264_CPU_ALTIVEC         0x0000001U
 
+/* Elbrus */
+#define X264_CPU_E2K             0x0000001U
+
 /* ARM and AArch64 */
 #define X264_CPU_ARMV6           0x0000001U
 #define X264_CPU_NEON            0x0000002U /* ARM NEON */
-- 
2.34.1
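
Note on the helpers used by common/e2k/quant.c above: the kernels are written against SSE/SSSE3/SSE4.1 intrinsic names (hence the -msse4.1 added to CFLAGS in configure; the Elbrus lcc compiler provides x86-compatible intrinsic headers) plus a few wrappers (VEC_LD, VEC_ST, LOAD_ZERO, the vec_*_t typedefs, PRAGMA_E2K) that come from common/e2k/e2kcommon.h, which is not reproduced in this excerpt. A minimal sketch of what those wrappers are assumed to look like; the names are taken from the code above, but the bodies below are assumptions, not the actual contents of e2kcommon.h:

    /* Sketch only: assumed shapes of the e2kcommon.h helpers used by quant.c. */
    #include <x86intrin.h>                     /* MMX/SSE2/SSSE3/SSE4.1 intrinsics */

    typedef __m128i vec_u16_t;                 /* viewed as 8 x uint16 lanes */
    typedef __m128i vec_s16_t;                 /* viewed as 8 x int16 lanes  */
    typedef __m128i vec_s32_t;                 /* viewed as 4 x int32 lanes  */

    #define VEC_LD(p)     _mm_loadu_si128((const __m128i *)(p))   /* 128-bit load  */
    #define VEC_ST(p, v)  _mm_storeu_si128((__m128i *)(p), (v))   /* 128-bit store */
    #define LOAD_ZERO     const vec_s16_t zerov = _mm_setzero_si128()

    /* Loop hint for the optimizer: PRAGMA_E2K("ivdep") marks a loop as free of
       loop-carried dependencies so it can be software-pipelined aggressively. */
    #ifdef __e2k__
    #define PRAGMA_E2K(x) _Pragma(x)
    #else
    #define PRAGMA_E2K(x)
    #endif
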
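QUANT_16_U (and the MMX-width variant in x264_quant_2x2_dc_e2k) implements the usual x264 quant kernel: take |coef|, add the bias, keep the high 16 bits of the unsigned multiply by mf (_mm_mulhi_epu16), then restore the sign of the original coefficient with _mm_sign_epi16; the packs/cvtsi128 test at the end of each function only reports whether any quantized coefficient is nonzero. A scalar model of one lane, illustrative only (quant_one is a hypothetical name, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative scalar model of one lane of QUANT_16_U. */
    static int16_t quant_one(int16_t coef, uint16_t mf, uint16_t bias)
    {
        uint16_t a = (uint16_t)(abs(coef) + bias);      /* abs, then the packed 16-bit add */
        uint32_t q = ((uint32_t)a * mf) >> 16;          /* _mm_mulhi_epu16: high 16 bits   */
        if (coef == 0)
            return 0;                 /* _mm_sign_epi16 zeroes lanes where coef == 0 */
        return coef > 0 ? (int16_t)q : (int16_t)-(int32_t)q;   /* restore the sign  */
    }
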
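DEQUANT_SHL and DEQUANT_SHR mirror the two qbits cases of the generic C dequant in common/quant.c: a 16-bit multiply followed by a left shift when i_qbits >= 0, and a full 32-bit product plus the rounding constant fv followed by an arithmetic right shift when i_qbits < 0. The same per-coefficient computation in scalar form (dequant_4x4_scalar is a hypothetical name; the vector path additionally saturates through _mm_packs_epi32):

    #include <stdint.h>

    /* Illustrative scalar equivalent of x264_dequant_4x4_e2k. */
    static void dequant_4x4_scalar(int16_t dct[16], int dequant_mf[6][16], int i_qp)
    {
        int i_mf = i_qp % 6;
        int i_qbits = i_qp / 6 - 4;

        if (i_qbits >= 0)
            for (int i = 0; i < 16; i++)                       /* DEQUANT_SHL path */
                dct[i] = (int16_t)((dct[i] * dequant_mf[i_mf][i]) << i_qbits);
        else
        {
            int f = 1 << (-i_qbits - 1);                       /* rounding, same as fv */
            for (int i = 0; i < 16; i++)                       /* DEQUANT_SHR path */
                dct[i] = (int16_t)((dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits));
        }
    }
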
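The decimate, coeff_last and coeff_level_run kernels all start from the same reduction: pack the 16-bit coefficients to bytes, compare against zero, take the movemask and invert it, so that bit i of the mask is set exactly when dct[i] != 0; the index of the last nonzero coefficient is then the position of the highest set bit, and the zero runs needed for decimation fall out of clz/ctz on the same mask. A scalar model of x264_coeff_last16_e2k (coeff_last16_scalar is a hypothetical name, not part of the patch):

    #include <stdint.h>

    /* Illustrative scalar model of x264_coeff_last16_e2k. */
    static int coeff_last16_scalar(const int16_t dct[16])
    {
        uint32_t nz = 0;
        for (int i = 0; i < 16; i++)
            nz |= (uint32_t)(dct[i] != 0) << i;   /* packs + cmpeq + movemask, then ^0xffff */
        return nz ? 31 - __builtin_clz(nz) : -1;  /* highest set bit, or -1 if all zero */
    }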