Add ARM64 NEON intrinsics lpc_compute_autocorrelation routines
This commit is contained in:
parent
bfe5ff9455
commit
ef4ad99231
@ -0,0 +1,70 @@
|
||||
int i;
|
||||
float64x2_t sum0 = vdupq_n_f64(0.0f);
|
||||
float64x2_t sum1 = vdupq_n_f64(0.0f);
|
||||
float64x2_t sum2 = vdupq_n_f64(0.0f);
|
||||
float64x2_t sum3 = vdupq_n_f64(0.0f);
|
||||
float64x2_t d0 = vdupq_n_f64(0.0f);
|
||||
float64x2_t d1 = vdupq_n_f64(0.0f);
|
||||
float64x2_t d2 = vdupq_n_f64(0.0f);
|
||||
float64x2_t d3 = vdupq_n_f64(0.0f);
|
||||
#if MAX_LAG > 8
|
||||
float64x2_t sum4 = vdupq_n_f64(0.0f);
|
||||
float64x2_t d4 = vdupq_n_f64(0.0f);
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
float64x2_t sum5 = vdupq_n_f64(0.0f);
|
||||
float64x2_t sum6 = vdupq_n_f64(0.0f);
|
||||
float64x2_t d5 = vdupq_n_f64(0.0f);
|
||||
float64x2_t d6 = vdupq_n_f64(0.0f);
|
||||
#endif
|
||||
float64x2_t d;
|
||||
|
||||
(void)lag;
|
||||
FLAC__ASSERT(lag <= MAX_LAG);
|
||||
|
||||
// Loop backwards through samples from data_len to 0
|
||||
for (i = data_len - 1; i >= 0; i--)
|
||||
{
|
||||
d = vdupq_n_f64(data[i]); // Create vector with 2 entries data[i]
|
||||
|
||||
// The next 6 lines of code right-shift the elements through the 7 vectors d0..d6.
|
||||
// The 7th line adds the newly loaded element to d0. This works like a stack, where
|
||||
// data[i] is pushed onto the stack every time and the 9th element falls off
|
||||
#if MAX_LAG > 10
|
||||
d6 = vextq_f64(d5,d6,1);
|
||||
d5 = vextq_f64(d4,d5,1);
|
||||
#endif
|
||||
#if MAX_LAG > 8
|
||||
d4 = vextq_f64(d3,d4,1);
|
||||
#endif
|
||||
d3 = vextq_f64(d2,d3,1);
|
||||
d2 = vextq_f64(d1,d2,1);
|
||||
d1 = vextq_f64(d0,d1,1);
|
||||
d0 = vextq_f64(d,d0,1);
|
||||
|
||||
// Fused multiply-add sum += d * d0..d6
|
||||
sum0 = vfmaq_f64(sum0, d, d0);
|
||||
sum1 = vfmaq_f64(sum1, d, d1);
|
||||
sum2 = vfmaq_f64(sum2, d, d2);
|
||||
sum3 = vfmaq_f64(sum3, d, d3);
|
||||
#if MAX_LAG > 8
|
||||
sum4 = vfmaq_f64(sum4, d, d4);
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
sum5 = vfmaq_f64(sum5, d, d5);
|
||||
sum6 = vfmaq_f64(sum6, d, d6);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Store sum0..sum6 in autoc[0..14]
|
||||
vst1q_f64(autoc, sum0);
|
||||
vst1q_f64(autoc + 2, sum1);
|
||||
vst1q_f64(autoc + 4, sum2);
|
||||
vst1q_f64(autoc + 6, sum3);
|
||||
#if MAX_LAG > 8
|
||||
vst1q_f64(autoc + 8, sum4);
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
vst1q_f64(autoc + 10, sum5);
|
||||
vst1q_f64(autoc + 12, sum6);
|
||||
#endif
|
@ -89,7 +89,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
#endif
|
||||
#endif
|
||||
#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN && FLAC__HAS_A64NEONINTRIN
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
#endif
|
||||
#endif /* FLAC__NO_ASM */
|
||||
|
||||
/*
|
||||
* FLAC__lpc_compute_lp_coefficients()
|
||||
|
@ -41,6 +41,30 @@
|
||||
#include "private/macros.h"
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef FLAC__HAS_A64NEONINTRIN
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 14
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
|
||||
}
|
||||
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 10
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
|
||||
}
|
||||
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 8
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_neon.c"
|
||||
}
|
||||
|
||||
#endif /* ifdef FLAC__HAS_A64NEONINTRIN */
|
||||
|
||||
|
||||
#define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
|
||||
summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
|
||||
@ -57,11 +81,11 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
|
||||
{
|
||||
int i;
|
||||
FLAC__int32 sum;
|
||||
int32x4_t tmp_vec[20];
|
||||
|
||||
FLAC__ASSERT(order > 0);
|
||||
FLAC__ASSERT(order <= 32);
|
||||
|
||||
int32x4_t tmp_vec[20];
|
||||
|
||||
// Using prologue reads is valid as encoder->private_->local_lpc_compute_residual_from_qlp_coefficients(signal+order,....)
|
||||
if(order <= 12) {
|
||||
if(order > 8) {
|
||||
|
@ -906,6 +906,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN && FLAC__HAS_A64NEONINTRIN
|
||||
if(encoder->protected_->max_lpc_order < 8)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8;
|
||||
else if(encoder->protected_->max_lpc_order < 10)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_10;
|
||||
else if(encoder->protected_->max_lpc_order < 14)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_neon_lag_14;
|
||||
else
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
|
||||
#endif
|
||||
if(encoder->private_->cpuinfo.use_asm) {
|
||||
# ifdef FLAC__CPU_IA32
|
||||
@ -1004,7 +1014,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
||||
# endif /* FLAC__HAS_X86INTRIN */
|
||||
# endif /* FLAC__CPU_... */
|
||||
|
||||
#if defined FLAC__CPU_ARM64
|
||||
#if defined FLAC__CPU_ARM64 && FLAC__HAS_NEONINTRIN
|
||||
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
|
||||
|
Loading…
Reference in New Issue
Block a user