287 lines
11 KiB
Diff
287 lines
11 KiB
Diff
--- a/common/common.h 2023-07-04 15:30:16.461348834 +0300
|
|
+++ b/common/common.h 2023-07-04 15:05:14.374641362 +0300
|
|
@@ -144,7 +144,11 @@
|
|
|
|
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
|
|
{
|
|
+#if ARCH_E2K /* E2K builds: clamp with two explicit comparisons instead of the mask/shift expression kept in the #else branch */
|
|
+ return ( (x > PIXEL_MAX) ? PIXEL_MAX : (x < 0) ? 0 : x ); /* x above PIXEL_MAX -> PIXEL_MAX, negative x -> 0, otherwise x is returned unchanged */
|
|
+#else
|
|
return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
|
|
+#endif /* ARCH_E2K */
|
|
}
|
|
|
|
/****************************************************************************
|
|
--- a/common/macroblock.c 2022-06-10 09:20:18.000000000 +0300
|
|
+++ b/common/macroblock.c 2023-07-04 14:20:32.987850779 +0300
|
|
@@ -28,6 +28,15 @@
|
|
|
|
#include "common.h"
|
|
|
|
+#if ARCH_E2K
|
|
+#define MEMCPY_FAST(dst,src,len) \
|
|
+ for( int x = 0; x < (len)/8; ++x ) \
|
|
+ ((long long *) (dst))[x] = ((long long *) (src))[x]
|
|
+#else /* ARCH_E2K */
|
|
+#define MEMCPY_FAST(dst,src,len) \
|
|
+ memcpy( dst, src, len )
|
|
+#endif /* ARCH_E2K */
|
|
+
|
|
#define MC_LUMA(list,p) \
|
|
h->mc.mc_luma( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
|
|
&h->mb.pic.p_fref[list][i_ref][p*4], h->mb.pic.i_stride[p], \
|
|
@@ -433,6 +442,7 @@
|
|
|
|
h->fdec->i_ref[0] = h->i_ref[0];
|
|
h->fdec->i_ref[1] = h->i_ref[1];
|
|
+#pragma loop count (1)
|
|
for( int i = 0; i < h->i_ref[0]; i++ )
|
|
h->fdec->ref_poc[0][i] = h->fref[0][i]->i_poc;
|
|
if( h->sh.i_type == SLICE_TYPE_B )
|
|
@@ -477,6 +487,7 @@
|
|
memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
|
|
|
|
if( h->i_ref[0] > 0 )
|
|
+#pragma loop count (1)
|
|
for( int field = 0; field <= SLICE_MBAFF; field++ )
|
|
{
|
|
int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
|
|
@@ -586,15 +597,15 @@
|
|
if( b_chroma )
|
|
{
|
|
h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
|
|
- memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL );
|
|
- memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL );
|
|
h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
|
|
h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
|
|
}
|
|
else
|
|
{
|
|
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
|
|
- memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL );
|
|
h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
|
|
}
|
|
if( b_mbaff || h->mb.b_reencode_mb )
|
|
@@ -609,6 +620,7 @@
|
|
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
|
|
}
|
|
pixel *plane_src, **filtered_src;
|
|
+#pragma loop count (1)
|
|
for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
|
|
{
|
|
// Interpolate between pixels in same field.
|
|
@@ -888,6 +900,7 @@
|
|
CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] );
|
|
|
|
/* Finish the prefetching */
|
|
+#pragma loop count (1)
|
|
for( int l = 0; l < lists; l++ )
|
|
{
|
|
x264_prefetch( &h->mb.mv[l][top_4x4-1] );
|
|
@@ -1046,6 +1059,7 @@
|
|
x264_prefetch_fenc( h, h->fenc, mb_x, mb_y );
|
|
|
|
/* load ref/mv/mvd */
|
|
+#pragma loop count (1)
|
|
for( int l = 0; l < lists; l++ )
|
|
{
|
|
int16_t (*mv)[2] = h->mb.mv[l];
|
|
@@ -1642,17 +1656,17 @@
|
|
* bottom row of each field. We also store samples needed for the next
|
|
* mbpair in intra_border_backup[2]. */
|
|
int backup_dst = !b_mbaff ? (mb_y&1) : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
|
|
- memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
|
|
if( CHROMA444 )
|
|
{
|
|
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
|
|
- memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
|
|
}
|
|
else if( CHROMA_FORMAT )
|
|
{
|
|
int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
|
|
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
|
|
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
|
|
}
|
|
if( b_mbaff )
|
|
{
|
|
@@ -1660,18 +1674,18 @@
|
|
{
|
|
int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
|
|
backup_dst = MB_INTERLACED ? 2 : 0;
|
|
- memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL );
|
|
if( CHROMA444 )
|
|
{
|
|
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL );
|
|
- memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL );
|
|
}
|
|
else if( CHROMA_FORMAT )
|
|
{
|
|
if( CHROMA_FORMAT == CHROMA_420 )
|
|
backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
|
|
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
|
|
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
|
|
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
|
|
}
|
|
}
|
|
}
|
|
--- a/common/osdep.h 2022-06-10 09:20:18.000000000 +0300
|
|
+++ b/common/osdep.h 2023-07-04 14:39:06.062582899 +0300
|
|
@@ -472,6 +472,11 @@
|
|
asm("rev %0, %0":"+r"(x));
|
|
return x;
|
|
}
|
|
+#elif ARCH_E2K /* tested by value, matching the '#if ARCH_E2K' checks used elsewhere in this patch; stays false if ARCH_E2K is defined as 0 */
|
|
+static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
|
|
+{
|
|
+ return __builtin_bswap32 (x); /* reverse the byte order of the 32-bit value via the compiler builtin */
|
|
+}
|
|
#else
|
|
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
|
|
{
|
|
@@ -484,6 +489,11 @@
|
|
asm("bswap %0":"+r"(x));
|
|
return x;
|
|
}
|
|
+#elif defined(ARCH_E2K)
|
|
+static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
|
|
+{
|
|
+ return __builtin_bswap64 (x);
|
|
+}
|
|
#else
|
|
static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
|
|
{
|
|
--- a/encoder/analyse.c 2022-06-10 09:20:18.000000000 +0300
|
|
+++ b/encoder/analyse.c 2023-07-04 15:26:12.424359907 +0300
|
|
@@ -626,6 +626,7 @@
|
|
satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
|
|
satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
|
|
|
|
+#pragma loop count(4)
|
|
for( ; *predict_mode >= 0; predict_mode++ )
|
|
{
|
|
int i_mode = *predict_mode;
|
|
@@ -637,6 +638,7 @@
|
|
}
|
|
else
|
|
{
|
|
+#pragma loop count(4)
|
|
for( ; *predict_mode >= 0; predict_mode++ )
|
|
{
|
|
int i_satd;
|
|
@@ -1265,6 +1267,7 @@
|
|
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
|
|
|
|
a->l0.me16x16.cost = INT_MAX;
|
|
+#pragma loop count (1)
|
|
for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
|
|
{
|
|
m.i_ref_cost = REF_COST( 0, i_ref );
|
|
--- a/encoder/macroblock.c 2022-06-10 09:20:18.000000000 +0300
|
|
+++ b/encoder/macroblock.c 2023-07-04 14:53:19.544020314 +0300
|
|
@@ -34,15 +34,23 @@
|
|
#define ZIG(i,y,x) level[i] = dct[x*2+y];
|
|
static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
|
|
{
|
|
+#if ARCH_E2K /* E2K: a single __builtin_e2k_pshufh on the first __di-sized word of dct replaces the four ZIG() assignments of the #else branch. NOTE(review): presumably __di is 64 bits and dctcoef is 16 bits so that one word holds all four coefficients -- confirm, especially for builds where dctcoef may be wider. */
|
|
+ ((__di *) level)[0] = __builtin_e2k_pshufh (((__di *) dct)[0], 0xd8); /* 3120 */ /* NOTE(review): 0xd8 = 0b11011000, presumably 2-bit source indices 3,1,2,0 read from the top, i.e. level = { dct[0], dct[2], dct[1], dct[3] } as produced by the ZIG() list -- confirm against the intrinsic's definition */
|
|
+#else /* ARCH_E2K */
|
|
ZIG(0,0,0)
|
|
ZIG(1,0,1)
|
|
ZIG(2,1,0)
|
|
ZIG(3,1,1)
|
|
+#endif /* ARCH_E2K */
|
|
}
|
|
#undef ZIG
|
|
|
|
static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
|
|
{
|
|
+#if ARCH_E2K
|
|
+ ((__di *) level)[0] = __builtin_e2k_pshufb (((__di *) dct)[1], ((__di *) dct)[0], 0x0908030205040100LL);
|
|
+ ((__di *) level)[1] = __builtin_e2k_pshufb (((__di *) dct)[1], ((__di *) dct)[0], 0x0f0e0b0a07060d0cLL);
|
|
+#else /* ARCH_E2K */
|
|
level[0] = dct[0];
|
|
level[1] = dct[2];
|
|
level[2] = dct[1];
|
|
@@ -51,6 +59,7 @@
|
|
level[5] = dct[3];
|
|
level[6] = dct[5];
|
|
level[7] = dct[7];
|
|
+#endif /* ARCH_E2K */
|
|
}
|
|
|
|
#define IDCT_DEQUANT_2X2_START \
|
|
@@ -622,6 +631,7 @@
|
|
int b_force_no_skip = 0;
|
|
int nz;
|
|
h->mb.i_cbp_luma = 0;
|
|
+#pragma loop count (1)
|
|
for( int p = 0; p < plane_count; p++ )
|
|
h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
|
|
|
|
@@ -706,7 +716,7 @@
|
|
if( h->mb.i_type == I_16x16 )
|
|
{
|
|
h->mb.b_transform_8x8 = 0;
|
|
-
|
|
+#pragma loop count (1)
|
|
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
|
|
mb_encode_i16x16( h, p, i_qp );
|
|
}
|
|
@@ -845,6 +855,7 @@
|
|
else
|
|
{
|
|
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
|
|
+#pragma loop count (1)
|
|
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
|
|
{
|
|
int quant_cat = p ? CQM_4PC : CQM_4PY;
|
|
@@ -991,7 +1002,7 @@
|
|
ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
|
|
ALIGNED_4( int16_t mvp[2] );
|
|
int i_qp = h->mb.i_qp;
|
|
-
|
|
+#pragma loop count (1)
|
|
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
|
|
{
|
|
int quant_cat = p ? CQM_4PC : CQM_4PY;
|
|
@@ -1087,7 +1098,7 @@
|
|
else
|
|
h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
|
|
}
|
|
-
|
|
+#pragma loop count (1)
|
|
for( int i = 0; i <= chroma422; i++ )
|
|
if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
|
|
h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
|
|
@@ -1098,6 +1109,7 @@
|
|
continue;
|
|
|
|
if( !h->mb.b_noise_reduction )
|
|
+#pragma loop count (1)
|
|
for( int i = 0; i <= chroma422; i++ )
|
|
{
|
|
h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
|
|
--- a/tools/checkasm.c 2023-07-04 15:30:16.477349162 +0300
|
|
+++ b/tools/checkasm.c 2023-07-04 15:22:14.275491480 +0300
|
|
@@ -126,6 +126,8 @@
|
|
asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" );
|
|
#elif ARCH_SPARC && defined(__sparc_v9__)
|
|
asm volatile( "rd %%tick, %0" : "=r" (a));
|
|
+#elif ARCH_E2K
|
|
+ asm volatile( "rrd %%clkr, %0" : "=r" (a) :: "memory");
|
|
#endif
|
|
return a;
|
|
}
|