diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am index fbba34ed..0203429a 100644 --- a/src/libFLAC/Makefile.am +++ b/src/libFLAC/Makefile.am @@ -82,7 +82,9 @@ EXTRA_DIST = \ libFLAC_static.vcxproj \ libFLAC_static.vcxproj.filters \ libFLAC.m4 \ - windows_unicode_filenames.c + windows_unicode_filenames.c \ + deduplication/lpc_compute_autocorrelation_intrin_sse2.c \ + deduplication/lpc_compute_autocorrelation_intrin_vsx.c if OS_IS_WINDOWS windows_unicode_compat = windows_unicode_filenames.c diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c new file mode 100644 index 00000000..607b42f4 --- /dev/null +++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_sse2.c @@ -0,0 +1,81 @@ +/* This code is imported several times in lpc_intrin_sse2.c with different + * values for MAX_LAG. Comments are for MAX_LAG == 14 */ + int i; + __m128d sum0, sum1, sum2, sum3; + __m128d d0, d1, d2, d3; +#if MAX_LAG > 8 + __m128d d4; + __m128d sum4; +#endif +#if MAX_LAG > 10 + __m128d d5, d6; + __m128d sum5, sum6; +#endif + + (void) lag; + FLAC__ASSERT(lag <= MAX_LAG); + + /* Initialize all sum vectors with zero */ + sum0 = _mm_setzero_pd(); + sum1 = _mm_setzero_pd(); + sum2 = _mm_setzero_pd(); + sum3 = _mm_setzero_pd(); + d0 = _mm_setzero_pd(); + d1 = _mm_setzero_pd(); + d2 = _mm_setzero_pd(); + d3 = _mm_setzero_pd(); +#if MAX_LAG > 8 + sum4 = _mm_setzero_pd(); + d4 = _mm_setzero_pd(); +#endif +#if MAX_LAG > 10 + sum5 = _mm_setzero_pd(); + sum6 = _mm_setzero_pd(); + d5 = _mm_setzero_pd(); + d6 = _mm_setzero_pd(); +#endif + + /* Loop backwards through samples from data_len to limit */ + for(i = data_len-1; i >= 0; i--) { + __m128d d = _mm_set1_pd(data[i]); + + /* The next lines of code work like a queue. For more + * information see the lag8 version of this function */ +#if MAX_LAG > 10 + d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1)); + d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1)); +#endif +#if MAX_LAG > 8 + d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1)); +#endif + d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1)); + d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1)); + d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1)); + d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1)); + + /* sumn += d*dn */ + sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0)); + sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1)); + sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2)); + sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3)); +#if MAX_LAG > 8 + sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4)); +#endif +#if MAX_LAG > 10 + sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5)); + sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6)); +#endif + } + + /* Store sum0..sum6 in autoc[0..14] */ + _mm_storeu_pd(autoc, sum0); + _mm_storeu_pd(autoc+2, sum1); + _mm_storeu_pd(autoc+4, sum2); + _mm_storeu_pd(autoc+6 ,sum3); +#if MAX_LAG > 8 + _mm_storeu_pd(autoc+8, sum4); +#endif +#if MAX_LAG > 10 + _mm_storeu_pd(autoc+10,sum5); + _mm_storeu_pd(autoc+12,sum6); +#endif diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_vsx.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_vsx.c new file mode 100644 index 00000000..721d2a0a --- /dev/null +++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin_vsx.c @@ -0,0 +1,179 @@ +/* This code is imported several times in lpc_intrin_vsx.c with different + * values for MAX_LAG. Comments are for MAX_LAG == 14 */ + +long i; +long limit = (long)data_len - MAX_LAG; +const FLAC__real *base; +vector double d0, d1, d2, d3; +vector double sum0 = { 0.0f, 0.0f}; +vector double sum10 = { 0.0f, 0.0f}; +vector double sum1 = { 0.0f, 0.0f}; +vector double sum11 = { 0.0f, 0.0f}; +vector double sum2 = { 0.0f, 0.0f}; +vector double sum12 = { 0.0f, 0.0f}; +vector double sum3 = { 0.0f, 0.0f}; +vector double sum13 = { 0.0f, 0.0f}; +#if MAX_LAG > 8 +vector double d4; +vector double sum4 = { 0.0f, 0.0f}; +vector double sum14 = { 0.0f, 0.0f}; +#endif +#if MAX_LAG > 10 +vector double d5, d6; +vector double sum5 = { 0.0f, 0.0f}; +vector double sum15 = { 0.0f, 0.0f}; +vector double sum6 = { 0.0f, 0.0f}; +vector double sum16 = { 0.0f, 0.0f}; +#endif + +vector float dtemp; + +#if WORDS_BIGENDIAN +vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 }; +vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }; +#else +vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; +vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }; +#endif + +(void) lag; +FLAC__ASSERT(lag <= MAX_LAG); + +base = data; + +/* First, check whether it is possible to load + * 16 elements at once */ +if(limit > 2){ + /* Convert all floats to doubles */ + dtemp = vec_vsx_ld(0, base); + d0 = vec_doubleh(dtemp); + d1 = vec_doublel(dtemp); + dtemp = vec_vsx_ld(16, base); + d2 = vec_doubleh(dtemp); + d3 = vec_doublel(dtemp); +#if MAX_LAG > 8 + dtemp = vec_vsx_ld(32, base); + d4 = vec_doubleh(dtemp); +#endif +#if MAX_LAG > 10 + d5 = vec_doublel(dtemp); + dtemp = vec_vsx_ld(48, base); + d6 = vec_doubleh(dtemp); +#endif + + base += MAX_LAG; + + /* Loop until nearing data_len */ + for (i = 0; i <= (limit-2); i += 2) { + vector double d, dnext; + + /* Load next 2 datapoints and convert to double + * for lag 14 that is data[i+14] and data[i+15] */ + dtemp = vec_vsx_ld(0, base); + dnext = vec_doubleh(dtemp); + base += 2; + + /* Create vector d with both elements set to the first + * element of d0, so both elements data[i] */ + d = vec_splat(d0, 0); + sum0 += d0 * d; // Multiply data[i] with data[i] and data[i+1] + sum1 += d1 * d; // Multiply data[i] with data[i+2] and data[i+3] + sum2 += d2 * d; // Multiply data[i] with data[i+4] and data[i+5] + sum3 += d3 * d; // Multiply data[i] with data[i+6] and data[i+7] +#if MAX_LAG > 8 + sum4 += d4 * d; // Multiply data[i] with data[i+8] and data[i+9] +#endif +#if MAX_LAG > 10 + sum5 += d5 * d; // Multiply data[i] with data[i+10] and data[i+11] + sum6 += d6 * d; // Multiply data[i] with data[i+12] and data[i+13] +#endif + + /* Set both elements of d to data[i+1] */ + d = vec_splat(d0, 1); + + /* Set d0 to data[i+14] and data[i+1] */ + d0 = vec_sel(d0, dnext, vsel); + sum10 += d0 * d; /* Multiply data[i+1] with data[i+14] and data[i+1] */ + sum11 += d1 * d; /* Multiply data[i+1] with data[i+2] and data[i+3] */ + sum12 += d2 * d; + sum13 += d3 * d; +#if MAX_LAG > 8 + sum14 += d4 * d; +#endif +#if MAX_LAG > 10 + sum15 += d5 * d; + sum16 += d6 * d; /* Multiply data[i+1] with data[i+12] and data[i+13] */ +#endif + + /* Shift all loaded values one vector (2 elements) so the next + * iterations aligns again */ + d0 = d1; + d1 = d2; + d2 = d3; +#if MAX_LAG > 8 + d3 = d4; +#endif +#if MAX_LAG > 10 + d4 = d5; + d5 = d6; +#endif + +#if MAX_LAG == 8 + d3 = dnext; +#elif MAX_LAG == 10 + d4 = dnext; +#elif MAX_LAG == 14 + d6 = dnext; +#else +#error "Unsupported lag"; +#endif + } + + /* Because the values in sum10..sum16 do not align with + * the values in sum0..sum6, these need to be 'left-rotated' + * before adding them to sum0..sum6 */ + sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm); + sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm); + sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm); +#if MAX_LAG > 8 + sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm); +#endif +#if MAX_LAG > 10 + sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm); + sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm); +#endif + +#if MAX_LAG == 8 + sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm); +#elif MAX_LAG == 10 + sum4 += vec_perm(sum14, sum10, (vector unsigned char)vperm); +#elif MAX_LAG == 14 + sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm); +#else +#error "Unsupported lag"; +#endif +}else{ + i = 0; +} + +/* Store result */ +vec_vsx_st(sum0, 0, autoc); +vec_vsx_st(sum1, 16, autoc); +vec_vsx_st(sum2, 32, autoc); +vec_vsx_st(sum3, 48, autoc); +#if MAX_LAG > 8 +vec_vsx_st(sum4, 64, autoc); +#endif +#if MAX_LAG > 10 +vec_vsx_st(sum5, 80, autoc); +vec_vsx_st(sum6, 96, autoc); +#endif + +/* Process remainder of samples in a non-VSX way */ +for (; i < (long)data_len; i++) { + uint32_t coeff; + + FLAC__real d = data[i]; + for (coeff = 0; coeff < data_len - i; coeff++) + autoc[coeff] += d * data[i+coeff]; +} diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 01624149..c6fe2f8f 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -80,12 +80,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[ #if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX) #ifdef FLAC__HAS_TARGET_POWER9 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); -void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); #endif #ifdef FLAC__HAS_TARGET_POWER8 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); -void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); #endif #endif diff --git a/src/libFLAC/lpc_intrin_sse2.c b/src/libFLAC/lpc_intrin_sse2.c index 2b320161..ec61ecd9 100644 --- a/src/libFLAC/lpc_intrin_sse2.c +++ b/src/libFLAC/lpc_intrin_sse2.c @@ -54,166 +54,26 @@ FLAC__SSE_TARGET("sse2") void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with SSE2 - // vector functions up to a lag of 10 (or max LPC order of 9) - int i; - __m128d sum0, sum1, sum2, sum3; - __m128d d0, d1, d2, d3; - - (void) lag; - FLAC__ASSERT(lag <= 8); - - // Initialize all sum vectors with zero - sum0 = _mm_setzero_pd(); - sum1 = _mm_setzero_pd(); - sum2 = _mm_setzero_pd(); - sum3 = _mm_setzero_pd(); - d0 = _mm_setzero_pd(); - d1 = _mm_setzero_pd(); - d2 = _mm_setzero_pd(); - d3 = _mm_setzero_pd(); - - // Loop backwards through samples from data_len to limit - for(i = data_len-1; i >= 0; i--) { - __m128d d = _mm_set1_pd(data[i]); // both elements of d are set to data[i] - - // The next lines of code work like a queue. The queue - // is spread over vectors d0..d3. All items are shifted - // one position, the last item (data[i+9]) is dequeued - // and a new first item is added (data[i]) - d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1)); // d3 is made of second element of d2 and first element of d3 - d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1)); // d2 is made of second element of d1 and first element of d2 - d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1)); // d1 is made of second element of d0 and first element of d1 - d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1)); // d0 is made of second element of d and first element of d0 - - // sumn += d*dn - sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0)); - sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1)); - sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2)); - sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3)); - } - - // Store sum0..sum6 in autoc[0..14] - _mm_storeu_pd(autoc, sum0); - _mm_storeu_pd(autoc+2, sum1); - _mm_storeu_pd(autoc+4, sum2); - _mm_storeu_pd(autoc+6 ,sum3); +#undef MAX_LAG +#define MAX_LAG 8 +#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c" } FLAC__SSE_TARGET("sse2") void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with SSE2 - // vector functions up to a lag of 10 (or max LPC order of 9) - int i; - __m128d sum0, sum1, sum2, sum3, sum4; - __m128d d0, d1, d2, d3, d4; - - (void) lag; - FLAC__ASSERT(lag <= 10); - - // Initialize all sum vectors with zero - sum0 = _mm_setzero_pd(); - sum1 = _mm_setzero_pd(); - sum2 = _mm_setzero_pd(); - sum3 = _mm_setzero_pd(); - sum4 = _mm_setzero_pd(); - d0 = _mm_setzero_pd(); - d1 = _mm_setzero_pd(); - d2 = _mm_setzero_pd(); - d3 = _mm_setzero_pd(); - d4 = _mm_setzero_pd(); - - // Loop backwards through samples from data_len to limit - for(i = data_len-1; i >= 0; i--) { - __m128d d = _mm_set1_pd(data[i]); - - // The next lines of code work like a queue. For more - // information see the lag8 version of this function - d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1)); - d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1)); - d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1)); - d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1)); - d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1)); - - // sumn += d*dn - sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0)); - sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1)); - sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2)); - sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3)); - sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4)); - } - - // Store sum0..sum6 in autoc[0..14] - _mm_storeu_pd(autoc, sum0); - _mm_storeu_pd(autoc+2, sum1); - _mm_storeu_pd(autoc+4, sum2); - _mm_storeu_pd(autoc+6 ,sum3); - _mm_storeu_pd(autoc+8, sum4); +#undef MAX_LAG +#define MAX_LAG 10 +#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c" } FLAC__SSE_TARGET("sse2") void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with SSE2 - // vector functions up to a lag of 14 (or max LPC order of 13) - int i; - __m128d sum0, sum1, sum2, sum3, sum4, sum5, sum6; - __m128d d0, d1, d2, d3, d4, d5, d6; - - (void) lag; - FLAC__ASSERT(lag <= 14); - - // Initialize all sum vectors with zero - sum0 = _mm_setzero_pd(); - sum1 = _mm_setzero_pd(); - sum2 = _mm_setzero_pd(); - sum3 = _mm_setzero_pd(); - sum4 = _mm_setzero_pd(); - sum5 = _mm_setzero_pd(); - sum6 = _mm_setzero_pd(); - d0 = _mm_setzero_pd(); - d1 = _mm_setzero_pd(); - d2 = _mm_setzero_pd(); - d3 = _mm_setzero_pd(); - d4 = _mm_setzero_pd(); - d5 = _mm_setzero_pd(); - d6 = _mm_setzero_pd(); - - // Loop backwards through samples from data_len to limit - for(i = data_len-1; i >= 0; i--) { - __m128d d = _mm_set1_pd(data[i]); - - // The next lines of code work like a queue. For more - // information see the lag8 version of this function - d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1)); - d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1)); - d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1)); - d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1)); - d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1)); - d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1)); - d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1)); - - // sumn += d*dn - sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0)); - sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1)); - sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2)); - sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3)); - sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4)); - sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5)); - sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6)); - - } - - // Store sum0..sum6 in autoc[0..14] - _mm_storeu_pd(autoc, sum0); - _mm_storeu_pd(autoc+2, sum1); - _mm_storeu_pd(autoc+4, sum2); - _mm_storeu_pd(autoc+6 ,sum3); - _mm_storeu_pd(autoc+8, sum4); - _mm_storeu_pd(autoc+10,sum5); - _mm_storeu_pd(autoc+12,sum6); +#undef MAX_LAG +#define MAX_LAG 14 +#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c" } FLAC__SSE_TARGET("sse2") diff --git a/src/libFLAC/lpc_intrin_vsx.c b/src/libFLAC/lpc_intrin_vsx.c index 40dfa35b..1e8560b3 100644 --- a/src/libFLAC/lpc_intrin_vsx.c +++ b/src/libFLAC/lpc_intrin_vsx.c @@ -49,330 +49,25 @@ __attribute__((target("cpu=power8"))) void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with POWERPC-specific - // vector functions up to a lag of 14 (or max LPC order of 13) - long i; - long limit = (long)data_len - 14; - const FLAC__real *base; - vector double sum0 = { 0.0f, 0.0f}; - vector double sum1 = { 0.0f, 0.0f}; - vector double sum2 = { 0.0f, 0.0f}; - vector double sum3 = { 0.0f, 0.0f}; - vector double sum4 = { 0.0f, 0.0f}; - vector double sum5 = { 0.0f, 0.0f}; - vector double sum6 = { 0.0f, 0.0f}; - vector double sum10 = { 0.0f, 0.0f}; - vector double sum11 = { 0.0f, 0.0f}; - vector double sum12 = { 0.0f, 0.0f}; - vector double sum13 = { 0.0f, 0.0f}; - vector double sum14 = { 0.0f, 0.0f}; - vector double sum15 = { 0.0f, 0.0f}; - vector double sum16 = { 0.0f, 0.0f}; - vector float dtemp; - vector double d0, d1, d2, d3, d4, d5, d6; -#if WORDS_BIGENDIAN - vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 }; - vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }; -#else - vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; - vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }; -#endif - - (void) lag; - FLAC__ASSERT(lag <= 14); - - base = data; - - // First, check whether it is possible to load - // 16 elements at once - if(limit > 2){ - // Convert all floats to doubles - dtemp = vec_vsx_ld(0, base); - d0 = vec_doubleh(dtemp); - d1 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(16, base); - d2 = vec_doubleh(dtemp); - d3 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(32, base); - d4 = vec_doubleh(dtemp); - d5 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(48, base); - d6 = vec_doubleh(dtemp); - - base += 14; - - // Loop until nearing data_len - for (i = 0; i <= (limit-2); i += 2) { - vector double d, d7; - - // Load next 2 datapoints and convert to double - // data[i+14] and data[i+15] - dtemp = vec_vsx_ld(0, base); - d7 = vec_doubleh(dtemp); - base += 2; - - // Create vector d with both elements set to the first - // element of d0, so both elements data[i] - d = vec_splat(d0, 0); - sum0 += d0 * d; // Multiply data[i] with data[i] and data[i+1] - sum1 += d1 * d; // Multiply data[i] with data[i+2] and data[i+3] - sum2 += d2 * d; // Multiply data[i] with data[i+4] and data[i+5] - sum3 += d3 * d; // Multiply data[i] with data[i+6] and data[i+7] - sum4 += d4 * d; // Multiply data[i] with data[i+8] and data[i+9] - sum5 += d5 * d; // Multiply data[i] with data[i+10] and data[i+11] - sum6 += d6 * d; // Multiply data[i] with data[i+12] and data[i+13] - - // Set both elements of d to data[i+1] - d = vec_splat(d0, 1); - - // Set d0 to data[i+14] and data[i+1] - d0 = vec_sel(d0, d7, vsel); - sum10 += d0 * d; // Multiply data[i+1] with data[i+14] and data[i+1] - sum11 += d1 * d; // Multiply data[i+1] with data[i+2] and data[i+3] - sum12 += d2 * d; - sum13 += d3 * d; - sum14 += d4 * d; - sum15 += d5 * d; - sum16 += d6 * d; // Multiply data[i+1] with data[i+12] and data[i+13] - - // Shift all loaded values one vector (2 elements) so the next - // iterations aligns again - d0 = d1; - d1 = d2; - d2 = d3; - d3 = d4; - d4 = d5; - d5 = d6; - d6 = d7; - } - - // Because the values in sum10..sum16 do not align with - // the values in sum0..sum6, these need to be 'left-rotated' - // before adding them to sum0..sum6 - sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm); - sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm); - sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm); - sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm); - sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm); - sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm); - sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm); - }else{ - i = 0; - } - - // Store result - vec_vsx_st(sum0, 0, autoc); - vec_vsx_st(sum1, 16, autoc); - vec_vsx_st(sum2, 32, autoc); - vec_vsx_st(sum3, 48, autoc); - vec_vsx_st(sum4, 64, autoc); - vec_vsx_st(sum5, 80, autoc); - vec_vsx_st(sum6, 96, autoc); - - // Process remainder of samples in a non-VSX way - for (; i < (long)data_len; i++) { - uint32_t coeff; - - FLAC__real d = data[i]; - for (coeff = 0; coeff < data_len - i; coeff++) - autoc[coeff] += d * data[i+coeff]; - } +#undef MAX_LAG +#define MAX_LAG 14 +#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c" } __attribute__((target("cpu=power8"))) -void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with POWERPC-specific - // vector functions up to a lag of 12 (or max LPC order of 11) - // For explanation, please see the lag_14 version of this function - long i; - long limit = (long)data_len - 12; - const FLAC__real *base; - vector double sum0 = { 0.0f, 0.0f}; - vector double sum1 = { 0.0f, 0.0f}; - vector double sum2 = { 0.0f, 0.0f}; - vector double sum3 = { 0.0f, 0.0f}; - vector double sum4 = { 0.0f, 0.0f}; - vector double sum5 = { 0.0f, 0.0f}; - vector double sum10 = { 0.0f, 0.0f}; - vector double sum11 = { 0.0f, 0.0f}; - vector double sum12 = { 0.0f, 0.0f}; - vector double sum13 = { 0.0f, 0.0f}; - vector double sum14 = { 0.0f, 0.0f}; - vector double sum15 = { 0.0f, 0.0f}; - vector float dtemp; - vector double d0, d1, d2, d3, d4, d5; -#if WORDS_BIGENDIAN - vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 }; - vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }; -#else - vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; - vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }; -#endif - - (void) lag; - FLAC__ASSERT(lag <= 12); - - base = data; - if(limit > 0){ - dtemp = vec_vsx_ld(0, base); - d0 = vec_doubleh(dtemp); - d1 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(16, base); - d2 = vec_doubleh(dtemp); - d3 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(32, base); - d4 = vec_doubleh(dtemp); - d5 = vec_doublel(dtemp); - - base += 12; - - for (i = 0; i <= (limit-2); i += 2) { - vector double d, d6; - - dtemp = vec_vsx_ld(0, base); - d6 = vec_doubleh(dtemp); - base += 2; - - d = vec_splat(d0, 0); - sum0 += d0 * d; - sum1 += d1 * d; - sum2 += d2 * d; - sum3 += d3 * d; - sum4 += d4 * d; - sum5 += d5 * d; - - d = vec_splat(d0, 1); - d0 = vec_sel(d0, d6, vsel); - sum10 += d0 * d; - sum11 += d1 * d; - sum12 += d2 * d; - sum13 += d3 * d; - sum14 += d4 * d; - sum15 += d5 * d; - - d0 = d1; - d1 = d2; - d2 = d3; - d3 = d4; - d4 = d5; - d5 = d6; - } - - sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm); - sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm); - sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm); - sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm); - sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm); - sum5 += vec_perm(sum15, sum10, (vector unsigned char)vperm); - }else{ - i = 0; - } - - vec_vsx_st(sum0, 0, autoc); - vec_vsx_st(sum1, 16, autoc); - vec_vsx_st(sum2, 32, autoc); - vec_vsx_st(sum3, 48, autoc); - vec_vsx_st(sum4, 64, autoc); - vec_vsx_st(sum5, 80, autoc); - - for (; i < (long)data_len; i++) { - uint32_t coeff; - - FLAC__real d = data[i]; - for (coeff = 0; coeff < data_len - i; coeff++) - autoc[coeff] += d * data[i+coeff]; - } +#undef MAX_LAG +#define MAX_LAG 10 +#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c" } __attribute__((target("cpu=power8"))) void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with POWERPC-specific - // vector functions up to a lag of 8 (or max LPC order of 7) - // For explanation, please see the lag_14 version of this function - long i; - long limit = (long)data_len - 8; - const FLAC__real *base; - vector double sum0 = { 0.0f, 0.0f}; - vector double sum1 = { 0.0f, 0.0f}; - vector double sum2 = { 0.0f, 0.0f}; - vector double sum3 = { 0.0f, 0.0f}; - vector double sum10 = { 0.0f, 0.0f}; - vector double sum11 = { 0.0f, 0.0f}; - vector double sum12 = { 0.0f, 0.0f}; - vector double sum13 = { 0.0f, 0.0f}; - vector float dtemp; - vector double d0, d1, d2, d3; -#if WORDS_BIGENDIAN - vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 }; - vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }; -#else - vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; - vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }; -#endif - - (void) lag; - FLAC__ASSERT(lag <= 8); - - base = data; - if(limit > 0){ - dtemp = vec_vsx_ld(0, base); - d0 = vec_doubleh(dtemp); - d1 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(16, base); - d2 = vec_doubleh(dtemp); - d3 = vec_doublel(dtemp); - - base += 8; - - for (i = 0; i <= (limit-2); i += 2) { - vector double d, d4; - - dtemp = vec_vsx_ld(0, base); - d4 = vec_doubleh(dtemp); - base += 2; - - d = vec_splat(d0, 0); - sum0 += d0 * d; - sum1 += d1 * d; - sum2 += d2 * d; - sum3 += d3 * d; - - d = vec_splat(d0, 1); - d0 = vec_sel(d0, d4, vsel); - sum10 += d0 * d; - sum11 += d1 * d; - sum12 += d2 * d; - sum13 += d3 * d; - - d0 = d1; - d1 = d2; - d2 = d3; - d3 = d4; - } - - sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm); - sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm); - sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm); - sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm); - - }else{ - i = 0; - } - - vec_vsx_st(sum0, 0, autoc); - vec_vsx_st(sum1, 16, autoc); - vec_vsx_st(sum2, 32, autoc); - vec_vsx_st(sum3, 48, autoc); - - for (; i < (long)data_len; i++) { - uint32_t coeff; - - FLAC__real d = data[i]; - for (coeff = 0; coeff < data_len - i; coeff++) - autoc[coeff] += d * data[i+coeff]; - } +#undef MAX_LAG +#define MAX_LAG 8 +#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c" } #endif /* FLAC__HAS_TARGET_POWER8 */ @@ -380,312 +75,25 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real __attribute__((target("cpu=power9"))) void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with POWERPC-specific - // vector functions up to a lag of 14 (or max LPC order of 13) - // For explanation, please see the power8 version of this function - long i; - long limit = (long)data_len - 14; - const FLAC__real *base; - vector double sum0 = { 0.0f, 0.0f}; - vector double sum1 = { 0.0f, 0.0f}; - vector double sum2 = { 0.0f, 0.0f}; - vector double sum3 = { 0.0f, 0.0f}; - vector double sum4 = { 0.0f, 0.0f}; - vector double sum5 = { 0.0f, 0.0f}; - vector double sum6 = { 0.0f, 0.0f}; - vector double sum10 = { 0.0f, 0.0f}; - vector double sum11 = { 0.0f, 0.0f}; - vector double sum12 = { 0.0f, 0.0f}; - vector double sum13 = { 0.0f, 0.0f}; - vector double sum14 = { 0.0f, 0.0f}; - vector double sum15 = { 0.0f, 0.0f}; - vector double sum16 = { 0.0f, 0.0f}; - vector float dtemp; - vector double d0, d1, d2, d3, d4, d5, d6; -#if WORDS_BIGENDIAN - vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 }; - vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }; -#else - vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; - vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }; -#endif - - (void) lag; - FLAC__ASSERT(lag <= 14); - - base = data; - if(limit > 2){ - dtemp = vec_vsx_ld(0, base); - d0 = vec_doubleh(dtemp); - d1 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(16, base); - d2 = vec_doubleh(dtemp); - d3 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(32, base); - d4 = vec_doubleh(dtemp); - d5 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(48, base); - d6 = vec_doubleh(dtemp); - - base += 14; - - for (i = 0; i <= (limit-2); i += 2) { - vector double d, d7; - - dtemp = vec_vsx_ld(0, base); - d7 = vec_doubleh(dtemp); - base += 2; - - d = vec_splat(d0, 0); - sum0 += d0 * d; - sum1 += d1 * d; - sum2 += d2 * d; - sum3 += d3 * d; - sum4 += d4 * d; - sum5 += d5 * d; - sum6 += d6 * d; - - d = vec_splat(d0, 1); - d0 = vec_sel(d0, d7, vsel); - sum10 += d0 * d; - sum11 += d1 * d; - sum12 += d2 * d; - sum13 += d3 * d; - sum14 += d4 * d; - sum15 += d5 * d; - sum16 += d6 * d; - - d0 = d1; - d1 = d2; - d2 = d3; - d3 = d4; - d4 = d5; - d5 = d6; - d6 = d7; - } - - sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm); - sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm); - sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm); - sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm); - sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm); - sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm); - sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm); - }else{ - i = 0; - } - - vec_vsx_st(sum0, 0, autoc); - vec_vsx_st(sum1, 16, autoc); - vec_vsx_st(sum2, 32, autoc); - vec_vsx_st(sum3, 48, autoc); - vec_vsx_st(sum4, 64, autoc); - vec_vsx_st(sum5, 80, autoc); - vec_vsx_st(sum6, 96, autoc); - - for (; i < (long)data_len; i++) { - uint32_t coeff; - - FLAC__real d = data[i]; - for (coeff = 0; coeff < data_len - i; coeff++) - autoc[coeff] += d * data[i+coeff]; - } +#undef MAX_LAG +#define MAX_LAG 14 +#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c" } __attribute__((target("cpu=power9"))) -void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with POWERPC-specific - // vector functions up to a lag of 12 (or max LPC order of 11) - // For explanation, please see the power9, lag_14 version of this function - long i; - long limit = (long)data_len - 12; - const FLAC__real *base; - vector double sum0 = { 0.0f, 0.0f}; - vector double sum1 = { 0.0f, 0.0f}; - vector double sum2 = { 0.0f, 0.0f}; - vector double sum3 = { 0.0f, 0.0f}; - vector double sum4 = { 0.0f, 0.0f}; - vector double sum5 = { 0.0f, 0.0f}; - vector double sum10 = { 0.0f, 0.0f}; - vector double sum11 = { 0.0f, 0.0f}; - vector double sum12 = { 0.0f, 0.0f}; - vector double sum13 = { 0.0f, 0.0f}; - vector double sum14 = { 0.0f, 0.0f}; - vector double sum15 = { 0.0f, 0.0f}; - vector float dtemp; - vector double d0, d1, d2, d3, d4, d5; -#if WORDS_BIGENDIAN - vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 }; - vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }; -#else - vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; - vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }; -#endif - - (void) lag; - FLAC__ASSERT(lag <= 12); - - base = data; - if(limit > 0){ - dtemp = vec_vsx_ld(0, base); - d0 = vec_doubleh(dtemp); - d1 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(16, base); - d2 = vec_doubleh(dtemp); - d3 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(32, base); - d4 = vec_doubleh(dtemp); - d5 = vec_doublel(dtemp); - - base += 12; - - for (i = 0; i <= (limit-2); i += 2) { - vector double d, d6; - - dtemp = vec_vsx_ld(0, base); - d6 = vec_doubleh(dtemp); - base += 2; - - d = vec_splat(d0, 0); - sum0 += d0 * d; - sum1 += d1 * d; - sum2 += d2 * d; - sum3 += d3 * d; - sum4 += d4 * d; - sum5 += d5 * d; - - d = vec_splat(d0, 1); - d0 = vec_sel(d0, d6, vsel); - sum10 += d0 * d; - sum11 += d1 * d; - sum12 += d2 * d; - sum13 += d3 * d; - sum14 += d4 * d; - sum15 += d5 * d; - - d0 = d1; - d1 = d2; - d2 = d3; - d3 = d4; - d4 = d5; - d5 = d6; - } - - sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm); - sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm); - sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm); - sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm); - sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm); - sum5 += vec_perm(sum15, sum10, (vector unsigned char)vperm); - }else{ - i = 0; - } - - vec_vsx_st(sum0, 0, autoc); - vec_vsx_st(sum1, 16, autoc); - vec_vsx_st(sum2, 32, autoc); - vec_vsx_st(sum3, 48, autoc); - vec_vsx_st(sum4, 64, autoc); - vec_vsx_st(sum5, 80, autoc); - - for (; i < (long)data_len; i++) { - uint32_t coeff; - - FLAC__real d = data[i]; - for (coeff = 0; coeff < data_len - i; coeff++) - autoc[coeff] += d * data[i+coeff]; - } +#undef MAX_LAG +#define MAX_LAG 10 +#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c" } __attribute__((target("cpu=power9"))) void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) { - // This function calculates autocorrelation with POWERPC-specific - // vector functions up to a lag of 8 (or max LPC order of 7) - // For explanation, please see the power9, lag_14 version of this function - long i; - long limit = (long)data_len - 8; - const FLAC__real *base; - vector double sum0 = { 0.0f, 0.0f}; - vector double sum1 = { 0.0f, 0.0f}; - vector double sum2 = { 0.0f, 0.0f}; - vector double sum3 = { 0.0f, 0.0f}; - vector double sum10 = { 0.0f, 0.0f}; - vector double sum11 = { 0.0f, 0.0f}; - vector double sum12 = { 0.0f, 0.0f}; - vector double sum13 = { 0.0f, 0.0f}; - vector float dtemp; - vector double d0, d1, d2, d3; -#if WORDS_BIGENDIAN - vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 }; - vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF }; -#else - vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; - vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 }; -#endif - - (void) lag; - FLAC__ASSERT(lag <= 8); - - base = data; - if(limit > 0){ - dtemp = vec_vsx_ld(0, base); - d0 = vec_doubleh(dtemp); - d1 = vec_doublel(dtemp); - dtemp = vec_vsx_ld(16, base); - d2 = vec_doubleh(dtemp); - d3 = vec_doublel(dtemp); - - base += 8; - - for (i = 0; i <= (limit-2); i += 2) { - vector double d, d4; - - dtemp = vec_vsx_ld(0, base); - d4 = vec_doubleh(dtemp); - base += 2; - - d = vec_splat(d0, 0); - sum0 += d0 * d; - sum1 += d1 * d; - sum2 += d2 * d; - sum3 += d3 * d; - - d = vec_splat(d0, 1); - d0 = vec_sel(d0, d4, vsel); - sum10 += d0 * d; - sum11 += d1 * d; - sum12 += d2 * d; - sum13 += d3 * d; - - d0 = d1; - d1 = d2; - d2 = d3; - d3 = d4; - } - - sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm); - sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm); - sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm); - sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm); - - }else{ - i = 0; - } - - vec_vsx_st(sum0, 0, autoc); - vec_vsx_st(sum1, 16, autoc); - vec_vsx_st(sum2, 32, autoc); - vec_vsx_st(sum3, 48, autoc); - - for (; i < (long)data_len; i++) { - uint32_t coeff; - - FLAC__real d = data[i]; - for (coeff = 0; coeff < data_len - i; coeff++) - autoc[coeff] += d * data[i+coeff]; - } +#undef MAX_LAG +#define MAX_LAG 8 +#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c" } #endif /* FLAC__HAS_TARGET_POWER9 */ diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 0ac8cd60..f6021e11 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -887,8 +887,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( if (encoder->private_->cpuinfo.ppc.arch_3_00) { if(encoder->protected_->max_lpc_order < 8) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8; - else if(encoder->protected_->max_lpc_order < 12) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12; + else if(encoder->protected_->max_lpc_order < 10) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10; else if(encoder->protected_->max_lpc_order < 14) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14; else @@ -898,8 +898,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( if (encoder->private_->cpuinfo.ppc.arch_2_07) { if(encoder->protected_->max_lpc_order < 8) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8; - else if(encoder->protected_->max_lpc_order < 12) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12; + else if(encoder->protected_->max_lpc_order < 10) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10; else if(encoder->protected_->max_lpc_order < 14) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14; else