diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c new file mode 100644 index 00000000..4df3aee9 --- /dev/null +++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_neon.c @@ -0,0 +1,70 @@ + int i; + float64x2_t sum0 = vdupq_n_f64(0.0f); + float64x2_t sum1 = vdupq_n_f64(0.0f); + float64x2_t sum2 = vdupq_n_f64(0.0f); + float64x2_t sum3 = vdupq_n_f64(0.0f); + float64x2_t d0 = vdupq_n_f64(0.0f); + float64x2_t d1 = vdupq_n_f64(0.0f); + float64x2_t d2 = vdupq_n_f64(0.0f); + float64x2_t d3 = vdupq_n_f64(0.0f); +#if MAX_LAG > 8 + float64x2_t sum4 = vdupq_n_f64(0.0f); + float64x2_t d4 = vdupq_n_f64(0.0f); +#endif +#if MAX_LAG > 10 + float64x2_t sum5 = vdupq_n_f64(0.0f); + float64x2_t sum6 = vdupq_n_f64(0.0f); + float64x2_t d5 = vdupq_n_f64(0.0f); + float64x2_t d6 = vdupq_n_f64(0.0f); +#endif + float64x2_t d; + + (void)lag; + FLAC__ASSERT(lag <= MAX_LAG); + + // Loop backwards through samples from data_len to 0 + for (i = data_len - 1; i >= 0; i--) + { + d = vdupq_n_f64(data[i]); // Create vector with 2 entries data[i] + + // The next 6 lines of code right-shift the elements through the 7 vectors d0..d6. + // The 7th line adds the newly loaded element to d0. This works like a stack, where + // data[i] is pushed onto the stack every time and the 9th element falls off +#if MAX_LAG > 10 + d6 = vextq_f64(d5,d6,1); + d5 = vextq_f64(d4,d5,1); +#endif +#if MAX_LAG > 8 + d4 = vextq_f64(d3,d4,1); +#endif + d3 = vextq_f64(d2,d3,1); + d2 = vextq_f64(d1,d2,1); + d1 = vextq_f64(d0,d1,1); + d0 = vextq_f64(d,d0,1); + + // Fused multiply-add sum += d * d0..d6 + sum0 = vfmaq_f64(sum0, d, d0); + sum1 = vfmaq_f64(sum1, d, d1); + sum2 = vfmaq_f64(sum2, d, d2); + sum3 = vfmaq_f64(sum3, d, d3); +#if MAX_LAG > 8 + sum4 = vfmaq_f64(sum4, d, d4); +#endif +#if MAX_LAG > 10 + sum5 = vfmaq_f64(sum5, d, d5); + sum6 = vfmaq_f64(sum6, d, d6); +#endif + } + + // Store sum0..sum6 in autoc[0..14] + vst1q_f64(autoc, sum0); + vst1q_f64(autoc + 2, sum1); + vst1q_f64(autoc + 4, sum2); + vst1q_f64(autoc + 6, sum3); +#if MAX_LAG > 8 + vst1q_f64(autoc + 8, sum4); +#endif +#if MAX_LAG > 10 + vst1q_f64(autoc + 10, sum5); + vst1q_f64(autoc + 12, sum6); +#endif diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index a9076903..0e619c1d 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -89,7 +89,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); #endif #endif +#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN && FLAC__HAS_A64NEONINTRIN +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); #endif +#endif /* FLAC__NO_ASM */ /* * FLAC__lpc_compute_lp_coefficients() diff --git a/src/libFLAC/lpc_intrin_neon.c b/src/libFLAC/lpc_intrin_neon.c index ab8f71ea..eedc2f69 100644 --- a/src/libFLAC/lpc_intrin_neon.c +++ b/src/libFLAC/lpc_intrin_neon.c @@ -41,6 +41,30 @@ #include "private/macros.h" #include +#ifdef FLAC__HAS_A64NEONINTRIN +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +{ +#undef MAX_LAG +#define MAX_LAG 14 +#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c" +} + +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +{ +#undef MAX_LAG +#define MAX_LAG 10 +#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c" +} + +void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +{ +#undef MAX_LAG +#define MAX_LAG 8 +#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c" +} + +#endif /* ifdef FLAC__HAS_A64NEONINTRIN */ + #define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \ summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \ @@ -57,11 +81,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in { int i; FLAC__int32 sum; + int32x4_t tmp_vec[20]; + FLAC__ASSERT(order > 0); FLAC__ASSERT(order <= 32); - int32x4_t tmp_vec[20]; - // Using prologue reads is valid as encoder->private_->local_lpc_compute_residual_from_qlp_coefficients(signal+order,....) if(order <= 12) { if(order > 8) { diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 38b19486..0aa98a21 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -906,6 +906,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; } #endif +#endif +#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN && FLAC__HAS_A64NEONINTRIN + if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8; + else if(encoder->protected_->max_lpc_order < 10) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10; + else if(encoder->protected_->max_lpc_order < 14) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14; + else + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; #endif if(encoder->private_->cpuinfo.use_asm) { # ifdef FLAC__CPU_IA32 @@ -1004,7 +1014,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( # endif /* FLAC__HAS_X86INTRIN */ # endif /* FLAC__CPU_... */ - #if defined FLAC__CPU_ARM64 + #if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN + encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;