mcst-linux-kernel/patches-2024.06.26/x264-0.164/0004.Add_optimization_for_e...

--- a/common/common.h 2023-07-04 15:30:16.461348834 +0300
+++ b/common/common.h 2023-07-04 15:05:14.374641362 +0300
@@ -144,7 +144,11 @@
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
+#if ARCH_E2K
+ return ( (x > PIXEL_MAX) ? PIXEL_MAX : (x < 0) ? 0 : x );
+#else
return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
+#endif /* ARCH_E2K */
}
/****************************************************************************
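The upstream clip folds both out-of-range cases into one test plus a sign trick that needs an arithmetic right shift of a negative value; the E2K variant spells the clamp as two plain compares, which the MCST lcc compiler can presumably lower to select operations. A minimal standalone check that the two forms agree, assuming an 8-bit build where PIXEL_MAX is 255:

#include <assert.h>

#define PIXEL_MAX 255

/* Upstream form: relies on >> of a negative int being arithmetic
 * (implementation-defined, but true on every relevant compiler). */
static int clip_mask( int x )
{
    return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
}

/* E2K form from the patch: two compares, no shift trick. */
static int clip_select( int x )
{
    return ( (x > PIXEL_MAX) ? PIXEL_MAX : (x < 0) ? 0 : x );
}

int main( void )
{
    for( int x = -1024; x <= 1024; x++ )
        assert( clip_mask( x ) == clip_select( x ) );
    return 0;
}
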
--- a/common/macroblock.c 2022-06-10 09:20:18.000000000 +0300
+++ b/common/macroblock.c 2023-07-04 14:20:32.987850779 +0300
@@ -28,6 +28,15 @@
#include "common.h"
+#if ARCH_E2K
+#define MEMCPY_FAST(dst,src,len) \
+ for( int x = 0; x < (len)/8; ++x ) \
+ ((long long *) (dst))[x] = ((long long *) (src))[x]
+#else /* ARCH_E2K */
+#define MEMCPY_FAST(dst,src,len) \
+ memcpy( dst, src, len )
+#endif /* ARCH_E2K */
+
#define MC_LUMA(list,p) \
h->mc.mc_luma( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
&h->mb.pic.p_fref[list][i_ref][p*4], h->mb.pic.i_stride[p], \
@@ -433,6 +442,7 @@
h->fdec->i_ref[0] = h->i_ref[0];
h->fdec->i_ref[1] = h->i_ref[1];
+#pragma loop count (1)
for( int i = 0; i < h->i_ref[0]; i++ )
h->fdec->ref_poc[0][i] = h->fref[0][i]->i_poc;
if( h->sh.i_type == SLICE_TYPE_B )
@@ -477,6 +487,7 @@
memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
if( h->i_ref[0] > 0 )
+#pragma loop count (1)
for( int field = 0; field <= SLICE_MBAFF; field++ )
{
int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
@@ -586,15 +597,15 @@
if( b_chroma )
{
h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
- memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL );
- memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL );
h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
}
else
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
- memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL );
+ MEMCPY_FAST( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL );
h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
}
if( b_mbaff || h->mb.b_reencode_mb )
@@ -609,6 +620,7 @@
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
pixel *plane_src, **filtered_src;
+#pragma loop count (1)
for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
{
// Interpolate between pixels in same field.
@@ -888,6 +900,7 @@
CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] );
/* Finish the prefetching */
+#pragma loop count (1)
for( int l = 0; l < lists; l++ )
{
x264_prefetch( &h->mb.mv[l][top_4x4-1] );
@@ -1046,6 +1059,7 @@
x264_prefetch_fenc( h, h->fenc, mb_x, mb_y );
/* load ref/mv/mvd */
+#pragma loop count (1)
for( int l = 0; l < lists; l++ )
{
int16_t (*mv)[2] = h->mb.mv[l];
@@ -1642,17 +1656,17 @@
* bottom row of each field. We also store samples needed for the next
* mbpair in intra_border_backup[2]. */
int backup_dst = !b_mbaff ? (mb_y&1) : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
- memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
if( CHROMA444 )
{
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
}
else if( CHROMA_FORMAT )
{
int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
}
if( b_mbaff )
{
@@ -1660,18 +1674,18 @@
{
int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
backup_dst = MB_INTERLACED ? 2 : 0;
- memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL );
if( CHROMA444 )
{
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL );
}
else if( CHROMA_FORMAT )
{
if( CHROMA_FORMAT == CHROMA_420 )
backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
}
}
}
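MEMCPY_FAST replaces small constant-size memcpy calls with 64-bit word copies that lcc can fully unroll. Every call site above copies 8, 16 or 24 bytes, and the cast to long long assumes the rows are 8-byte aligned and that the compiler tolerates the type pun. A self-contained sketch of that contract (the buffers here are illustrative, not x264's):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MEMCPY_FAST(dst,src,len) \
    for( int x = 0; x < (len)/8; ++x ) \
        ((long long *) (dst))[x] = ((long long *) (src))[x]

int main( void )
{
    _Alignas(8) uint8_t src[24], dst_a[24], dst_b[24];
    for( int i = 0; i < 24; i++ )
        src[i] = (uint8_t)i;

    memcpy( dst_a, src, 24 );      /* reference copy */
    MEMCPY_FAST( dst_b, src, 24 ); /* three 64-bit load/store pairs */

    assert( !memcmp( dst_a, dst_b, 24 ) );
    return 0;
}
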
--- a/common/osdep.h 2022-06-10 09:20:18.000000000 +0300
+++ b/common/osdep.h 2023-07-04 14:39:06.062582899 +0300
@@ -472,6 +472,11 @@
asm("rev %0, %0":"+r"(x));
return x;
}
+#elif defined(ARCH_E2K)
+static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
+{
+ return __builtin_bswap32 (x);
+}
#else
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
@@ -484,6 +489,11 @@
asm("bswap %0":"+r"(x));
return x;
}
+#elif defined(ARCH_E2K)
+static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
+{
+ return __builtin_bswap64 (x);
+}
#else
static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
{
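Where x86 and ARM reach for inline asm here, the E2K branch uses the GCC-style byte-swap builtins, which lcc also provides. A quick sanity check of the expected results:

#include <assert.h>
#include <stdint.h>

int main( void )
{
    assert( __builtin_bswap32( 0x11223344u ) == 0x44332211u );
    assert( __builtin_bswap64( 0x1122334455667788ull ) == 0x8877665544332211ull );
    return 0;
}
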
--- a/encoder/analyse.c 2022-06-10 09:20:18.000000000 +0300
+++ b/encoder/analyse.c 2023-07-04 15:26:12.424359907 +0300
@@ -626,6 +626,7 @@
satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
+#pragma loop count(4)
for( ; *predict_mode >= 0; predict_mode++ )
{
int i_mode = *predict_mode;
@@ -637,6 +638,7 @@
}
else
{
+#pragma loop count(4)
for( ; *predict_mode >= 0; predict_mode++ )
{
int i_satd;
@@ -1265,6 +1267,7 @@
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
a->l0.me16x16.cost = INT_MAX;
+#pragma loop count (1)
for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
{
m.i_ref_cost = REF_COST( 0, i_ref );
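The #pragma loop count (N) hints throughout the patch tell lcc the expected trip count so it can choose an unrolling and pipelining strategy: chroma intra analysis typically evaluates about four prediction modes, and most encodes search a single reference frame. GCC and Clang ignore the unknown pragma (at most a -Wunknown-pragmas warning), so the hint is harmless off E2K. A hypothetical standalone use of the hint; sum_ref_costs and its arguments are illustrative, not x264 code:

static int sum_ref_costs( const int *cost, int n_ref )
{
    int sum = 0;
#pragma loop count (1) /* one reference frame is the common case */
    for( int i = 0; i < n_ref; i++ )
        sum += cost[i];
    return sum;
}

int main( void )
{
    const int cost[1] = { 42 };
    return sum_ref_costs( cost, 1 ) == 42 ? 0 : 1;
}
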
--- a/encoder/macroblock.c 2022-06-10 09:20:18.000000000 +0300
+++ b/encoder/macroblock.c 2023-07-04 14:53:19.544020314 +0300
@@ -34,15 +34,23 @@
#define ZIG(i,y,x) level[i] = dct[x*2+y];
static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
{
+#if ARCH_E2K
+ ((__di *) level)[0] = __builtin_e2k_pshufh (((__di *) dct)[0], 0xd8); /* 3120 */
+#else /* ARCH_E2K */
ZIG(0,0,0)
ZIG(1,0,1)
ZIG(2,1,0)
ZIG(3,1,1)
+#endif /* ARCH_E2K */
}
#undef ZIG
static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
{
+#if ARCH_E2K
+ ((__di *) level)[0] = __builtin_e2k_pshufb (((__di *) dct)[1], ((__di *) dct)[0], 0x0908030205040100LL);
+ ((__di *) level)[1] = __builtin_e2k_pshufb (((__di *) dct)[1], ((__di *) dct)[0], 0x0f0e0b0a07060d0cLL);
+#else /* ARCH_E2K */
level[0] = dct[0];
level[1] = dct[2];
level[2] = dct[1];
@@ -51,6 +59,7 @@
level[5] = dct[3];
level[6] = dct[5];
level[7] = dct[7];
+#endif /* ARCH_E2K */
}
#define IDCT_DEQUANT_2X2_START \
@@ -622,6 +631,7 @@
int b_force_no_skip = 0;
int nz;
h->mb.i_cbp_luma = 0;
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++ )
h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
@@ -706,7 +716,7 @@
if( h->mb.i_type == I_16x16 )
{
h->mb.b_transform_8x8 = 0;
-
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
mb_encode_i16x16( h, p, i_qp );
}
@@ -845,6 +855,7 @@
else
{
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -991,7 +1002,7 @@
ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
-
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -1087,7 +1098,7 @@
else
h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
}
-
+#pragma loop count (1)
for( int i = 0; i <= chroma422; i++ )
if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
@@ -1098,6 +1109,7 @@
continue;
if( !h->mb.b_noise_reduction )
+#pragma loop count (1)
for( int i = 0; i <= chroma422; i++ )
{
h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
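The 2x2 DC zigzag only swaps the two middle coefficients, so on E2K it collapses into a single pshufh with selector 0xd8 (halfword order 0,2,1,3; the patch comment writes it 3120 high-to-low), and the 2x4 case becomes two pshufb byte shuffles. A scalar model of the 2x2 permutation, assuming dctcoef is int16_t as in 8-bit-depth builds:

#include <assert.h>
#include <stdint.h>

typedef int16_t dctcoef;

/* Reference permutation matching both the upstream ZIG() expansion and
 * the E2K pshufh( dct, 0xd8 ): level = dct[0], dct[2], dct[1], dct[3]. */
static void zigzag_scan_2x2_dc_ref( dctcoef level[4], const dctcoef dct[4] )
{
    level[0] = dct[0];
    level[1] = dct[2];
    level[2] = dct[1];
    level[3] = dct[3];
}

int main( void )
{
    const dctcoef dct[4] = { 10, 20, 30, 40 };
    dctcoef level[4];
    zigzag_scan_2x2_dc_ref( level, dct );
    assert( level[0] == 10 && level[1] == 30 && level[2] == 20 && level[3] == 40 );
    return 0;
}
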
--- a/tools/checkasm.c 2023-07-04 15:30:16.477349162 +0300
+++ b/tools/checkasm.c 2023-07-04 15:22:14.275491480 +0300
@@ -126,6 +126,8 @@
asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" );
#elif ARCH_SPARC && defined(__sparc_v9__)
asm volatile( "rd %%tick, %0" : "=r" (a));
+#elif ARCH_E2K
+ asm volatile( "rrd %%clkr, %0" : "=r" (a) :: "memory");
#endif
return a;
}
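
checkasm's benchmark timer gains an E2K arm next to the x86, ARM, MIPS and SPARC reads, using the %clkr clock register. A standalone sketch of the same read, guarded here by the __e2k__ macro the lcc toolchain predefines (an assumption; the patch itself keys off x264's ARCH_E2K):

#include <stdint.h>

static uint32_t read_time( void )
{
    uint32_t a = 0;
#if defined(__e2k__)
    asm volatile( "rrd %%clkr, %0" : "=r"(a) :: "memory" );
#endif
    return a;
}

int main( void )
{
    uint32_t t0 = read_time();
    uint32_t t1 = read_time();
    (void)( t1 - t0 ); /* only deltas are meaningful; the counter may wrap */
    return 0;
}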