mcst-linux-kernel/patches-2024.06.26/x264-0.164/0004.Add_optimization_for_e...

--- a/common/common.h 2023-07-04 15:30:16.461348834 +0300
+++ b/common/common.h 2023-07-04 15:05:14.374641362 +0300
@@ -144,7 +144,11 @@
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
+#if ARCH_E2K
+ return ( (x > PIXEL_MAX) ? PIXEL_MAX : (x < 0) ? 0 : x );
+#else
return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
+#endif /* ARCH_E2K */
}
/****************************************************************************
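The upstream clip folds both out-of-range cases into one test plus a sign trick that needs an arithmetic right shift of a negative value; the E2K variant spells the clamp as two plain compares, which the MCST lcc compiler can presumably lower to select operations. A minimal standalone check that the two forms agree, assuming an 8-bit build where PIXEL_MAX is 255:

#include <assert.h>

#define PIXEL_MAX 255

/* Upstream form: relies on >> of a negative int being arithmetic
 * (implementation-defined, but true on every relevant compiler). */
static int clip_mask( int x )
{
    return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
}

/* E2K form from the patch: two compares, no shift trick. */
static int clip_select( int x )
{
    return ( (x > PIXEL_MAX) ? PIXEL_MAX : (x < 0) ? 0 : x );
}

int main( void )
{
    for( int x = -1024; x <= 1024; x++ )
        assert( clip_mask( x ) == clip_select( x ) );
    return 0;
}
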
--- a/common/macroblock.c 2022-06-10 09:20:18.000000000 +0300
+++ b/common/macroblock.c 2023-07-04 14:20:32.987850779 +0300
@@ -28,6 +28,15 @@
#include "common.h"
+#if ARCH_E2K
+#define MEMCPY_FAST(dst,src,len) \
+ for( int x = 0; x < (len)/8; ++x ) \
+ ((long long *) (dst))[x] = ((long long *) (src))[x]
+#else /* ARCH_E2K */
+#define MEMCPY_FAST(dst,src,len) \
+ memcpy( dst, src, len )
+#endif /* ARCH_E2K */
+
#define MC_LUMA(list,p) \
h->mc.mc_luma( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
&h->mb.pic.p_fref[list][i_ref][p*4], h->mb.pic.i_stride[p], \
@@ -433,6 +442,7 @@
h->fdec->i_ref[0] = h->i_ref[0];
h->fdec->i_ref[1] = h->i_ref[1];
+#pragma loop count (1)
for( int i = 0; i < h->i_ref[0]; i++ )
h->fdec->ref_poc[0][i] = h->fref[0][i]->i_poc;
if( h->sh.i_type == SLICE_TYPE_B )
@@ -477,6 +487,7 @@
memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );
if( h->i_ref[0] > 0 )
+#pragma loop count (1)
for( int field = 0; field <= SLICE_MBAFF; field++ )
{
int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
@@ -586,15 +597,15 @@
if( b_chroma )
{
h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
- memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL );
- memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL );
h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
}
else
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
- memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL );
+ MEMCPY_FAST( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL );
h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
}
if( b_mbaff || h->mb.b_reencode_mb )
@@ -609,6 +620,7 @@
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
}
pixel *plane_src, **filtered_src;
+#pragma loop count (1)
for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
{
// Interpolate between pixels in same field.
@@ -888,6 +900,7 @@
CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] );
/* Finish the prefetching */
+#pragma loop count (1)
for( int l = 0; l < lists; l++ )
{
x264_prefetch( &h->mb.mv[l][top_4x4-1] );
@@ -1046,6 +1059,7 @@
x264_prefetch_fenc( h, h->fenc, mb_x, mb_y );
/* load ref/mv/mvd */
+#pragma loop count (1)
for( int l = 0; l < lists; l++ )
{
int16_t (*mv)[2] = h->mb.mv[l];
@@ -1642,17 +1656,17 @@
* bottom row of each field. We also store samples needed for the next
* mbpair in intra_border_backup[2]. */
int backup_dst = !b_mbaff ? (mb_y&1) : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
- memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
if( CHROMA444 )
{
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
}
else if( CHROMA_FORMAT )
{
int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
}
if( b_mbaff )
{
@@ -1660,18 +1674,18 @@
{
int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
backup_dst = MB_INTERLACED ? 2 : 0;
- memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL );
if( CHROMA444 )
{
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL );
}
else if( CHROMA_FORMAT )
{
if( CHROMA_FORMAT == CHROMA_420 )
backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
+ MEMCPY_FAST( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
}
}
}
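MEMCPY_FAST replaces small constant-size memcpy calls with 64-bit word copies that lcc can fully unroll. Every call site above copies 8, 16 or 24 bytes, and the cast to long long assumes the rows are 8-byte aligned and that the compiler tolerates the type pun. A self-contained sketch of that contract (the buffers here are illustrative, not x264's):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MEMCPY_FAST(dst,src,len) \
    for( int x = 0; x < (len)/8; ++x ) \
        ((long long *) (dst))[x] = ((long long *) (src))[x]

int main( void )
{
    _Alignas(8) uint8_t src[24], dst_a[24], dst_b[24];
    for( int i = 0; i < 24; i++ )
        src[i] = (uint8_t)i;

    memcpy( dst_a, src, 24 );      /* reference copy */
    MEMCPY_FAST( dst_b, src, 24 ); /* three 64-bit load/store pairs */

    assert( !memcmp( dst_a, dst_b, 24 ) );
    return 0;
}
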
--- a/common/osdep.h 2022-06-10 09:20:18.000000000 +0300
+++ b/common/osdep.h 2023-07-04 14:39:06.062582899 +0300
@@ -472,6 +472,11 @@
asm("rev %0, %0":"+r"(x));
return x;
}
+#elif defined(ARCH_E2K)
+static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
+{
+ return __builtin_bswap32 (x);
+}
#else
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
@@ -484,6 +489,11 @@
asm("bswap %0":"+r"(x));
return x;
}
+#elif defined(ARCH_E2K)
+static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
+{
+ return __builtin_bswap64 (x);
+}
#else
static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
{
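Where x86 and ARM reach for inline asm here, the E2K branch uses the GCC-style byte-swap builtins, which lcc also provides. A quick sanity check of the expected results:

#include <assert.h>
#include <stdint.h>

int main( void )
{
    assert( __builtin_bswap32( 0x11223344u ) == 0x44332211u );
    assert( __builtin_bswap64( 0x1122334455667788ull ) == 0x8877665544332211ull );
    return 0;
}
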
--- a/encoder/analyse.c 2022-06-10 09:20:18.000000000 +0300
+++ b/encoder/analyse.c 2023-07-04 15:26:12.424359907 +0300
@@ -626,6 +626,7 @@
satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
+#pragma loop count(4)
for( ; *predict_mode >= 0; predict_mode++ )
{
int i_mode = *predict_mode;
@@ -637,6 +638,7 @@
}
else
{
+#pragma loop count(4)
for( ; *predict_mode >= 0; predict_mode++ )
{
int i_satd;
@@ -1265,6 +1267,7 @@
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
a->l0.me16x16.cost = INT_MAX;
+#pragma loop count (1)
for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
{
m.i_ref_cost = REF_COST( 0, i_ref );
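The #pragma loop count (N) hints throughout the patch tell lcc the expected trip count so it can choose an unrolling and pipelining strategy: chroma intra analysis typically evaluates about four prediction modes, and most encodes search a single reference frame. GCC and Clang ignore the unknown pragma (at most a -Wunknown-pragmas warning), so the hint is harmless off E2K. A hypothetical standalone use of the hint; sum_ref_costs and its arguments are illustrative, not x264 code:

static int sum_ref_costs( const int *cost, int n_ref )
{
    int sum = 0;
#pragma loop count (1) /* one reference frame is the common case */
    for( int i = 0; i < n_ref; i++ )
        sum += cost[i];
    return sum;
}

int main( void )
{
    const int cost[1] = { 42 };
    return sum_ref_costs( cost, 1 ) == 42 ? 0 : 1;
}
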
--- a/encoder/macroblock.c 2022-06-10 09:20:18.000000000 +0300
+++ b/encoder/macroblock.c 2023-07-04 14:53:19.544020314 +0300
@@ -34,15 +34,23 @@
#define ZIG(i,y,x) level[i] = dct[x*2+y];
static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
{
+#if ARCH_E2K
+ ((__di *) level)[0] = __builtin_e2k_pshufh (((__di *) dct)[0], 0xd8); /* 3120 */
+#else /* ARCH_E2K */
ZIG(0,0,0)
ZIG(1,0,1)
ZIG(2,1,0)
ZIG(3,1,1)
+#endif /* ARCH_E2K */
}
#undef ZIG
static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
{
+#if ARCH_E2K
+ ((__di *) level)[0] = __builtin_e2k_pshufb (((__di *) dct)[1], ((__di *) dct)[0], 0x0908030205040100LL);
+ ((__di *) level)[1] = __builtin_e2k_pshufb (((__di *) dct)[1], ((__di *) dct)[0], 0x0f0e0b0a07060d0cLL);
+#else /* ARCH_E2K */
level[0] = dct[0];
level[1] = dct[2];
level[2] = dct[1];
@@ -51,6 +59,7 @@
level[5] = dct[3];
level[6] = dct[5];
level[7] = dct[7];
+#endif /* ARCH_E2K */
}
#define IDCT_DEQUANT_2X2_START \
@@ -622,6 +631,7 @@
int b_force_no_skip = 0;
int nz;
h->mb.i_cbp_luma = 0;
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++ )
h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
@@ -706,7 +716,7 @@
if( h->mb.i_type == I_16x16 )
{
h->mb.b_transform_8x8 = 0;
-
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
mb_encode_i16x16( h, p, i_qp );
}
@@ -845,6 +855,7 @@
else
{
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -991,7 +1002,7 @@
ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
-
+#pragma loop count (1)
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -1087,7 +1098,7 @@
else
h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
}
-
+#pragma loop count (1)
for( int i = 0; i <= chroma422; i++ )
if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
@@ -1098,6 +1109,7 @@
continue;
if( !h->mb.b_noise_reduction )
+#pragma loop count (1)
for( int i = 0; i <= chroma422; i++ )
{
h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
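The 2x2 DC zigzag only swaps the two middle coefficients, so on E2K it collapses into a single pshufh with selector 0xd8 (halfword order 0,2,1,3; the patch comment writes it 3120 high-to-low), and the 2x4 case becomes two pshufb byte shuffles. A scalar model of the 2x2 permutation, assuming dctcoef is int16_t as in 8-bit-depth builds:

#include <assert.h>
#include <stdint.h>

typedef int16_t dctcoef;

/* Reference permutation matching both the upstream ZIG() expansion and
 * the E2K pshufh( dct, 0xd8 ): level = dct[0], dct[2], dct[1], dct[3]. */
static void zigzag_scan_2x2_dc_ref( dctcoef level[4], const dctcoef dct[4] )
{
    level[0] = dct[0];
    level[1] = dct[2];
    level[2] = dct[1];
    level[3] = dct[3];
}

int main( void )
{
    const dctcoef dct[4] = { 10, 20, 30, 40 };
    dctcoef level[4];
    zigzag_scan_2x2_dc_ref( level, dct );
    assert( level[0] == 10 && level[1] == 30 && level[2] == 20 && level[3] == 40 );
    return 0;
}
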
--- a/tools/checkasm.c 2023-07-04 15:30:16.477349162 +0300
+++ b/tools/checkasm.c 2023-07-04 15:22:14.275491480 +0300
@@ -126,6 +126,8 @@
asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" );
#elif ARCH_SPARC && defined(__sparc_v9__)
asm volatile( "rd %%tick, %0" : "=r" (a));
+#elif ARCH_E2K
+ asm volatile( "rrd %%clkr, %0" : "=r" (a) :: "memory");
#endif
return a;
}
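
checkasm's benchmark timer gains an E2K arm next to the x86, ARM, MIPS and SPARC reads, using the %clkr clock register. A standalone sketch of the same read, guarded here by the __e2k__ macro the lcc toolchain predefines (an assumption; the patch itself keys off x264's ARCH_E2K):

#include <stdint.h>

static uint32_t read_time( void )
{
    uint32_t a = 0;
#if defined(__e2k__)
    asm volatile( "rrd %%clkr, %0" : "=r"(a) :: "memory" );
#endif
    return a;
}

int main( void )
{
    uint32_t t0 = read_time();
    uint32_t t1 = read_time();
    (void)( t1 - t0 ); /* only deltas are meaningful; the counter may wrap */
    return 0;
}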