mirror of https://github.com/xiph/flac
Deduplicate VSX and SSE2 autocorrelation calculation code
This commit is contained in:
parent
ee18d1b892
commit
b48ed95fcb
|
@ -82,7 +82,9 @@ EXTRA_DIST = \
|
|||
libFLAC_static.vcxproj \
|
||||
libFLAC_static.vcxproj.filters \
|
||||
libFLAC.m4 \
|
||||
windows_unicode_filenames.c
|
||||
windows_unicode_filenames.c \
|
||||
deduplication/lpc_compute_autocorrelation_intrin_sse2.c \
|
||||
deduplication/lpc_compute_autocorrelation_intrin_vsx.c
|
||||
|
||||
if OS_IS_WINDOWS
|
||||
windows_unicode_compat = windows_unicode_filenames.c
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
/* This code is imported several times in lpc_intrin_sse2.c with different
|
||||
* values for MAX_LAG. Comments are for MAX_LAG == 14 */
|
||||
int i;
|
||||
__m128d sum0, sum1, sum2, sum3;
|
||||
__m128d d0, d1, d2, d3;
|
||||
#if MAX_LAG > 8
|
||||
__m128d d4;
|
||||
__m128d sum4;
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
__m128d d5, d6;
|
||||
__m128d sum5, sum6;
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= MAX_LAG);
|
||||
|
||||
/* Initialize all sum vectors with zero */
|
||||
sum0 = _mm_setzero_pd();
|
||||
sum1 = _mm_setzero_pd();
|
||||
sum2 = _mm_setzero_pd();
|
||||
sum3 = _mm_setzero_pd();
|
||||
d0 = _mm_setzero_pd();
|
||||
d1 = _mm_setzero_pd();
|
||||
d2 = _mm_setzero_pd();
|
||||
d3 = _mm_setzero_pd();
|
||||
#if MAX_LAG > 8
|
||||
sum4 = _mm_setzero_pd();
|
||||
d4 = _mm_setzero_pd();
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
sum5 = _mm_setzero_pd();
|
||||
sum6 = _mm_setzero_pd();
|
||||
d5 = _mm_setzero_pd();
|
||||
d6 = _mm_setzero_pd();
|
||||
#endif
|
||||
|
||||
/* Loop backwards through samples from data_len-1 down to 0 */
|
||||
for(i = data_len-1; i >= 0; i--) {
|
||||
__m128d d = _mm_set1_pd(data[i]);
|
||||
|
||||
/* The next lines of code work like a queue spread over the dN vectors:
|
||||
* all items shift one position, the oldest is dequeued and data[i] is added first */
|
||||
#if MAX_LAG > 10
|
||||
d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1));
|
||||
d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1));
|
||||
#endif
|
||||
#if MAX_LAG > 8
|
||||
d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
|
||||
#endif
|
||||
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
|
||||
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
|
||||
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
|
||||
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1));
|
||||
|
||||
/* sumn += d*dn */
|
||||
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
|
||||
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
|
||||
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
|
||||
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
|
||||
#if MAX_LAG > 8
|
||||
sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5));
|
||||
sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6));
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Store sum0..sum6 in autoc[0..13] */
|
||||
_mm_storeu_pd(autoc, sum0);
|
||||
_mm_storeu_pd(autoc+2, sum1);
|
||||
_mm_storeu_pd(autoc+4, sum2);
|
||||
_mm_storeu_pd(autoc+6 ,sum3);
|
||||
#if MAX_LAG > 8
|
||||
_mm_storeu_pd(autoc+8, sum4);
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
_mm_storeu_pd(autoc+10,sum5);
|
||||
_mm_storeu_pd(autoc+12,sum6);
|
||||
#endif
|
|
@ -0,0 +1,179 @@
|
|||
/* This code is imported several times in lpc_intrin_vsx.c with different
|
||||
* values for MAX_LAG. Comments are for MAX_LAG == 14 */
|
||||
|
||||
long i;
|
||||
long limit = (long)data_len - MAX_LAG;
|
||||
const FLAC__real *base;
|
||||
vector double d0, d1, d2, d3;
|
||||
vector double sum0 = { 0.0f, 0.0f};
|
||||
vector double sum10 = { 0.0f, 0.0f};
|
||||
vector double sum1 = { 0.0f, 0.0f};
|
||||
vector double sum11 = { 0.0f, 0.0f};
|
||||
vector double sum2 = { 0.0f, 0.0f};
|
||||
vector double sum12 = { 0.0f, 0.0f};
|
||||
vector double sum3 = { 0.0f, 0.0f};
|
||||
vector double sum13 = { 0.0f, 0.0f};
|
||||
#if MAX_LAG > 8
|
||||
vector double d4;
|
||||
vector double sum4 = { 0.0f, 0.0f};
|
||||
vector double sum14 = { 0.0f, 0.0f};
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
vector double d5, d6;
|
||||
vector double sum5 = { 0.0f, 0.0f};
|
||||
vector double sum15 = { 0.0f, 0.0f};
|
||||
vector double sum6 = { 0.0f, 0.0f};
|
||||
vector double sum16 = { 0.0f, 0.0f};
|
||||
#endif
|
||||
|
||||
vector float dtemp;
|
||||
|
||||
#if WORDS_BIGENDIAN
|
||||
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
|
||||
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
|
||||
#else
|
||||
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
|
||||
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= MAX_LAG);
|
||||
|
||||
base = data;
|
||||
|
||||
/* First, check whether it is possible to load
|
||||
* 16 elements at once */
|
||||
if(limit > 2){
|
||||
/* Convert all floats to doubles */
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d0 = vec_doubleh(dtemp);
|
||||
d1 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(16, base);
|
||||
d2 = vec_doubleh(dtemp);
|
||||
d3 = vec_doublel(dtemp);
|
||||
#if MAX_LAG > 8
|
||||
dtemp = vec_vsx_ld(32, base);
|
||||
d4 = vec_doubleh(dtemp);
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
d5 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(48, base);
|
||||
d6 = vec_doubleh(dtemp);
|
||||
#endif
|
||||
|
||||
base += MAX_LAG;
|
||||
|
||||
/* Loop until nearing data_len */
|
||||
for (i = 0; i <= (limit-2); i += 2) {
|
||||
vector double d, dnext;
|
||||
|
||||
/* Load next 2 datapoints and convert to double
|
||||
* for lag 14 that is data[i+14] and data[i+15] */
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
dnext = vec_doubleh(dtemp);
|
||||
base += 2;
|
||||
|
||||
/* Create vector d with both elements set to the first
|
||||
* element of d0, so both elements data[i] */
|
||||
d = vec_splat(d0, 0);
|
||||
sum0 += d0 * d; // Multiply data[i] with data[i] and data[i+1]
|
||||
sum1 += d1 * d; // Multiply data[i] with data[i+2] and data[i+3]
|
||||
sum2 += d2 * d; // Multiply data[i] with data[i+4] and data[i+5]
|
||||
sum3 += d3 * d; // Multiply data[i] with data[i+6] and data[i+7]
|
||||
#if MAX_LAG > 8
|
||||
sum4 += d4 * d; // Multiply data[i] with data[i+8] and data[i+9]
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
sum5 += d5 * d; // Multiply data[i] with data[i+10] and data[i+11]
|
||||
sum6 += d6 * d; // Multiply data[i] with data[i+12] and data[i+13]
|
||||
#endif
|
||||
|
||||
/* Set both elements of d to data[i+1] */
|
||||
d = vec_splat(d0, 1);
|
||||
|
||||
/* Set d0 to data[i+14] and data[i+1] */
|
||||
d0 = vec_sel(d0, dnext, vsel);
|
||||
sum10 += d0 * d; /* Multiply data[i+1] with data[i+14] and data[i+1] */
|
||||
sum11 += d1 * d; /* Multiply data[i+1] with data[i+2] and data[i+3] */
|
||||
sum12 += d2 * d;
|
||||
sum13 += d3 * d;
|
||||
#if MAX_LAG > 8
|
||||
sum14 += d4 * d;
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
sum15 += d5 * d;
|
||||
sum16 += d6 * d; /* Multiply data[i+1] with data[i+12] and data[i+13] */
|
||||
#endif
|
||||
|
||||
/* Shift all loaded values one vector (2 elements) so the next
|
||||
* iteration aligns again */
|
||||
d0 = d1;
|
||||
d1 = d2;
|
||||
d2 = d3;
|
||||
#if MAX_LAG > 8
|
||||
d3 = d4;
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
d4 = d5;
|
||||
d5 = d6;
|
||||
#endif
|
||||
|
||||
#if MAX_LAG == 8
|
||||
d3 = dnext;
|
||||
#elif MAX_LAG == 10
|
||||
d4 = dnext;
|
||||
#elif MAX_LAG == 14
|
||||
d6 = dnext;
|
||||
#else
|
||||
#error "Unsupported lag";
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Because the values in sum10..sum16 do not align with
|
||||
* the values in sum0..sum6, these need to be 'left-rotated'
|
||||
* before adding them to sum0..sum6 */
|
||||
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
|
||||
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
|
||||
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
|
||||
#if MAX_LAG > 8
|
||||
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
|
||||
sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm);
|
||||
#endif
|
||||
|
||||
#if MAX_LAG == 8
|
||||
sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm);
|
||||
#elif MAX_LAG == 10
|
||||
sum4 += vec_perm(sum14, sum10, (vector unsigned char)vperm);
|
||||
#elif MAX_LAG == 14
|
||||
sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm);
|
||||
#else
|
||||
#error "Unsupported lag";
|
||||
#endif
|
||||
}else{
|
||||
i = 0;
|
||||
}
|
||||
|
||||
/* Store result */
|
||||
vec_vsx_st(sum0, 0, autoc);
|
||||
vec_vsx_st(sum1, 16, autoc);
|
||||
vec_vsx_st(sum2, 32, autoc);
|
||||
vec_vsx_st(sum3, 48, autoc);
|
||||
#if MAX_LAG > 8
|
||||
vec_vsx_st(sum4, 64, autoc);
|
||||
#endif
|
||||
#if MAX_LAG > 10
|
||||
vec_vsx_st(sum5, 80, autoc);
|
||||
vec_vsx_st(sum6, 96, autoc);
|
||||
#endif
|
||||
|
||||
/* Process remainder of samples in a non-VSX way */
|
||||
for (; i < (long)data_len; i++) {
|
||||
uint32_t coeff;
|
||||
|
||||
FLAC__real d = data[i];
|
||||
for (coeff = 0; coeff < data_len - i; coeff++)
|
||||
autoc[coeff] += d * data[i+coeff];
|
||||
}
|
|
@ -80,12 +80,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[
|
|||
#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
|
||||
#ifdef FLAC__HAS_TARGET_POWER9
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
#endif
|
||||
#ifdef FLAC__HAS_TARGET_POWER8
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -54,166 +54,26 @@
|
|||
FLAC__SSE_TARGET("sse2")
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with SSE2
|
||||
// vector functions up to a lag of 10 (or max LPC order of 9)
|
||||
int i;
|
||||
__m128d sum0, sum1, sum2, sum3;
|
||||
__m128d d0, d1, d2, d3;
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 8);
|
||||
|
||||
// Initialize all sum vectors with zero
|
||||
sum0 = _mm_setzero_pd();
|
||||
sum1 = _mm_setzero_pd();
|
||||
sum2 = _mm_setzero_pd();
|
||||
sum3 = _mm_setzero_pd();
|
||||
d0 = _mm_setzero_pd();
|
||||
d1 = _mm_setzero_pd();
|
||||
d2 = _mm_setzero_pd();
|
||||
d3 = _mm_setzero_pd();
|
||||
|
||||
// Loop backwards through samples from data_len to limit
|
||||
for(i = data_len-1; i >= 0; i--) {
|
||||
__m128d d = _mm_set1_pd(data[i]); // both elements of d are set to data[i]
|
||||
|
||||
// The next lines of code work like a queue. The queue
|
||||
// is spread over vectors d0..d3. All items are shifted
|
||||
// one position, the last item (data[i+9]) is dequeued
|
||||
// and a new first item is added (data[i])
|
||||
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1)); // d3 is made of second element of d2 and first element of d3
|
||||
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1)); // d2 is made of second element of d1 and first element of d2
|
||||
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1)); // d1 is made of second element of d0 and first element of d1
|
||||
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1)); // d0 is made of second element of d and first element of d0
|
||||
|
||||
// sumn += d*dn
|
||||
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
|
||||
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
|
||||
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
|
||||
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
|
||||
}
|
||||
|
||||
// Store sum0..sum6 in autoc[0..14]
|
||||
_mm_storeu_pd(autoc, sum0);
|
||||
_mm_storeu_pd(autoc+2, sum1);
|
||||
_mm_storeu_pd(autoc+4, sum2);
|
||||
_mm_storeu_pd(autoc+6 ,sum3);
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 8
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
|
||||
}
|
||||
|
||||
FLAC__SSE_TARGET("sse2")
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with SSE2
|
||||
// vector functions up to a lag of 10 (or max LPC order of 9)
|
||||
int i;
|
||||
__m128d sum0, sum1, sum2, sum3, sum4;
|
||||
__m128d d0, d1, d2, d3, d4;
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 10);
|
||||
|
||||
// Initialize all sum vectors with zero
|
||||
sum0 = _mm_setzero_pd();
|
||||
sum1 = _mm_setzero_pd();
|
||||
sum2 = _mm_setzero_pd();
|
||||
sum3 = _mm_setzero_pd();
|
||||
sum4 = _mm_setzero_pd();
|
||||
d0 = _mm_setzero_pd();
|
||||
d1 = _mm_setzero_pd();
|
||||
d2 = _mm_setzero_pd();
|
||||
d3 = _mm_setzero_pd();
|
||||
d4 = _mm_setzero_pd();
|
||||
|
||||
// Loop backwards through samples from data_len to limit
|
||||
for(i = data_len-1; i >= 0; i--) {
|
||||
__m128d d = _mm_set1_pd(data[i]);
|
||||
|
||||
// The next lines of code work like a queue. For more
|
||||
// information see the lag8 version of this function
|
||||
d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
|
||||
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
|
||||
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
|
||||
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
|
||||
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1));
|
||||
|
||||
// sumn += d*dn
|
||||
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
|
||||
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
|
||||
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
|
||||
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
|
||||
sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
|
||||
}
|
||||
|
||||
// Store sum0..sum6 in autoc[0..14]
|
||||
_mm_storeu_pd(autoc, sum0);
|
||||
_mm_storeu_pd(autoc+2, sum1);
|
||||
_mm_storeu_pd(autoc+4, sum2);
|
||||
_mm_storeu_pd(autoc+6 ,sum3);
|
||||
_mm_storeu_pd(autoc+8, sum4);
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 10
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
|
||||
}
|
||||
|
||||
|
||||
FLAC__SSE_TARGET("sse2")
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with SSE2
|
||||
// vector functions up to a lag of 14 (or max LPC order of 13)
|
||||
int i;
|
||||
__m128d sum0, sum1, sum2, sum3, sum4, sum5, sum6;
|
||||
__m128d d0, d1, d2, d3, d4, d5, d6;
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 14);
|
||||
|
||||
// Initialize all sum vectors with zero
|
||||
sum0 = _mm_setzero_pd();
|
||||
sum1 = _mm_setzero_pd();
|
||||
sum2 = _mm_setzero_pd();
|
||||
sum3 = _mm_setzero_pd();
|
||||
sum4 = _mm_setzero_pd();
|
||||
sum5 = _mm_setzero_pd();
|
||||
sum6 = _mm_setzero_pd();
|
||||
d0 = _mm_setzero_pd();
|
||||
d1 = _mm_setzero_pd();
|
||||
d2 = _mm_setzero_pd();
|
||||
d3 = _mm_setzero_pd();
|
||||
d4 = _mm_setzero_pd();
|
||||
d5 = _mm_setzero_pd();
|
||||
d6 = _mm_setzero_pd();
|
||||
|
||||
// Loop backwards through samples from data_len to limit
|
||||
for(i = data_len-1; i >= 0; i--) {
|
||||
__m128d d = _mm_set1_pd(data[i]);
|
||||
|
||||
// The next lines of code work like a queue. For more
|
||||
// information see the lag8 version of this function
|
||||
d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1));
|
||||
d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1));
|
||||
d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
|
||||
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
|
||||
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
|
||||
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
|
||||
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1));
|
||||
|
||||
// sumn += d*dn
|
||||
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
|
||||
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
|
||||
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
|
||||
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
|
||||
sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
|
||||
sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5));
|
||||
sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6));
|
||||
|
||||
}
|
||||
|
||||
// Store sum0..sum6 in autoc[0..14]
|
||||
_mm_storeu_pd(autoc, sum0);
|
||||
_mm_storeu_pd(autoc+2, sum1);
|
||||
_mm_storeu_pd(autoc+4, sum2);
|
||||
_mm_storeu_pd(autoc+6 ,sum3);
|
||||
_mm_storeu_pd(autoc+8, sum4);
|
||||
_mm_storeu_pd(autoc+10,sum5);
|
||||
_mm_storeu_pd(autoc+12,sum6);
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 14
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
|
||||
}
|
||||
|
||||
FLAC__SSE_TARGET("sse2")
|
||||
|
|
|
@ -49,330 +49,25 @@
|
|||
__attribute__((target("cpu=power8")))
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with POWERPC-specific
|
||||
// vector functions up to a lag of 14 (or max LPC order of 13)
|
||||
long i;
|
||||
long limit = (long)data_len - 14;
|
||||
const FLAC__real *base;
|
||||
vector double sum0 = { 0.0f, 0.0f};
|
||||
vector double sum1 = { 0.0f, 0.0f};
|
||||
vector double sum2 = { 0.0f, 0.0f};
|
||||
vector double sum3 = { 0.0f, 0.0f};
|
||||
vector double sum4 = { 0.0f, 0.0f};
|
||||
vector double sum5 = { 0.0f, 0.0f};
|
||||
vector double sum6 = { 0.0f, 0.0f};
|
||||
vector double sum10 = { 0.0f, 0.0f};
|
||||
vector double sum11 = { 0.0f, 0.0f};
|
||||
vector double sum12 = { 0.0f, 0.0f};
|
||||
vector double sum13 = { 0.0f, 0.0f};
|
||||
vector double sum14 = { 0.0f, 0.0f};
|
||||
vector double sum15 = { 0.0f, 0.0f};
|
||||
vector double sum16 = { 0.0f, 0.0f};
|
||||
vector float dtemp;
|
||||
vector double d0, d1, d2, d3, d4, d5, d6;
|
||||
#if WORDS_BIGENDIAN
|
||||
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
|
||||
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
|
||||
#else
|
||||
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
|
||||
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 14);
|
||||
|
||||
base = data;
|
||||
|
||||
// First, check whether it is possible to load
|
||||
// 16 elements at once
|
||||
if(limit > 2){
|
||||
// Convert all floats to doubles
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d0 = vec_doubleh(dtemp);
|
||||
d1 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(16, base);
|
||||
d2 = vec_doubleh(dtemp);
|
||||
d3 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(32, base);
|
||||
d4 = vec_doubleh(dtemp);
|
||||
d5 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(48, base);
|
||||
d6 = vec_doubleh(dtemp);
|
||||
|
||||
base += 14;
|
||||
|
||||
// Loop until nearing data_len
|
||||
for (i = 0; i <= (limit-2); i += 2) {
|
||||
vector double d, d7;
|
||||
|
||||
// Load next 2 datapoints and convert to double
|
||||
// data[i+14] and data[i+15]
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d7 = vec_doubleh(dtemp);
|
||||
base += 2;
|
||||
|
||||
// Create vector d with both elements set to the first
|
||||
// element of d0, so both elements data[i]
|
||||
d = vec_splat(d0, 0);
|
||||
sum0 += d0 * d; // Multiply data[i] with data[i] and data[i+1]
|
||||
sum1 += d1 * d; // Multiply data[i] with data[i+2] and data[i+3]
|
||||
sum2 += d2 * d; // Multiply data[i] with data[i+4] and data[i+5]
|
||||
sum3 += d3 * d; // Multiply data[i] with data[i+6] and data[i+7]
|
||||
sum4 += d4 * d; // Multiply data[i] with data[i+8] and data[i+9]
|
||||
sum5 += d5 * d; // Multiply data[i] with data[i+10] and data[i+11]
|
||||
sum6 += d6 * d; // Multiply data[i] with data[i+12] and data[i+13]
|
||||
|
||||
// Set both elements of d to data[i+1]
|
||||
d = vec_splat(d0, 1);
|
||||
|
||||
// Set d0 to data[i+14] and data[i+1]
|
||||
d0 = vec_sel(d0, d7, vsel);
|
||||
sum10 += d0 * d; // Multiply data[i+1] with data[i+14] and data[i+1]
|
||||
sum11 += d1 * d; // Multiply data[i+1] with data[i+2] and data[i+3]
|
||||
sum12 += d2 * d;
|
||||
sum13 += d3 * d;
|
||||
sum14 += d4 * d;
|
||||
sum15 += d5 * d;
|
||||
sum16 += d6 * d; // Multiply data[i+1] with data[i+12] and data[i+13]
|
||||
|
||||
// Shift all loaded values one vector (2 elements) so the next
|
||||
// iterations aligns again
|
||||
d0 = d1;
|
||||
d1 = d2;
|
||||
d2 = d3;
|
||||
d3 = d4;
|
||||
d4 = d5;
|
||||
d5 = d6;
|
||||
d6 = d7;
|
||||
}
|
||||
|
||||
// Because the values in sum10..sum16 do not align with
|
||||
// the values in sum0..sum6, these need to be 'left-rotated'
|
||||
// before adding them to sum0..sum6
|
||||
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
|
||||
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
|
||||
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
|
||||
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
|
||||
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
|
||||
sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm);
|
||||
sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm);
|
||||
}else{
|
||||
i = 0;
|
||||
}
|
||||
|
||||
// Store result
|
||||
vec_vsx_st(sum0, 0, autoc);
|
||||
vec_vsx_st(sum1, 16, autoc);
|
||||
vec_vsx_st(sum2, 32, autoc);
|
||||
vec_vsx_st(sum3, 48, autoc);
|
||||
vec_vsx_st(sum4, 64, autoc);
|
||||
vec_vsx_st(sum5, 80, autoc);
|
||||
vec_vsx_st(sum6, 96, autoc);
|
||||
|
||||
// Process remainder of samples in a non-VSX way
|
||||
for (; i < (long)data_len; i++) {
|
||||
uint32_t coeff;
|
||||
|
||||
FLAC__real d = data[i];
|
||||
for (coeff = 0; coeff < data_len - i; coeff++)
|
||||
autoc[coeff] += d * data[i+coeff];
|
||||
}
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 14
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
|
||||
}
|
||||
|
||||
__attribute__((target("cpu=power8")))
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with POWERPC-specific
|
||||
// vector functions up to a lag of 12 (or max LPC order of 11)
|
||||
// For explanation, please see the lag_14 version of this function
|
||||
long i;
|
||||
long limit = (long)data_len - 12;
|
||||
const FLAC__real *base;
|
||||
vector double sum0 = { 0.0f, 0.0f};
|
||||
vector double sum1 = { 0.0f, 0.0f};
|
||||
vector double sum2 = { 0.0f, 0.0f};
|
||||
vector double sum3 = { 0.0f, 0.0f};
|
||||
vector double sum4 = { 0.0f, 0.0f};
|
||||
vector double sum5 = { 0.0f, 0.0f};
|
||||
vector double sum10 = { 0.0f, 0.0f};
|
||||
vector double sum11 = { 0.0f, 0.0f};
|
||||
vector double sum12 = { 0.0f, 0.0f};
|
||||
vector double sum13 = { 0.0f, 0.0f};
|
||||
vector double sum14 = { 0.0f, 0.0f};
|
||||
vector double sum15 = { 0.0f, 0.0f};
|
||||
vector float dtemp;
|
||||
vector double d0, d1, d2, d3, d4, d5;
|
||||
#if WORDS_BIGENDIAN
|
||||
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
|
||||
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
|
||||
#else
|
||||
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
|
||||
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 12);
|
||||
|
||||
base = data;
|
||||
if(limit > 0){
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d0 = vec_doubleh(dtemp);
|
||||
d1 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(16, base);
|
||||
d2 = vec_doubleh(dtemp);
|
||||
d3 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(32, base);
|
||||
d4 = vec_doubleh(dtemp);
|
||||
d5 = vec_doublel(dtemp);
|
||||
|
||||
base += 12;
|
||||
|
||||
for (i = 0; i <= (limit-2); i += 2) {
|
||||
vector double d, d6;
|
||||
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d6 = vec_doubleh(dtemp);
|
||||
base += 2;
|
||||
|
||||
d = vec_splat(d0, 0);
|
||||
sum0 += d0 * d;
|
||||
sum1 += d1 * d;
|
||||
sum2 += d2 * d;
|
||||
sum3 += d3 * d;
|
||||
sum4 += d4 * d;
|
||||
sum5 += d5 * d;
|
||||
|
||||
d = vec_splat(d0, 1);
|
||||
d0 = vec_sel(d0, d6, vsel);
|
||||
sum10 += d0 * d;
|
||||
sum11 += d1 * d;
|
||||
sum12 += d2 * d;
|
||||
sum13 += d3 * d;
|
||||
sum14 += d4 * d;
|
||||
sum15 += d5 * d;
|
||||
|
||||
d0 = d1;
|
||||
d1 = d2;
|
||||
d2 = d3;
|
||||
d3 = d4;
|
||||
d4 = d5;
|
||||
d5 = d6;
|
||||
}
|
||||
|
||||
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
|
||||
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
|
||||
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
|
||||
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
|
||||
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
|
||||
sum5 += vec_perm(sum15, sum10, (vector unsigned char)vperm);
|
||||
}else{
|
||||
i = 0;
|
||||
}
|
||||
|
||||
vec_vsx_st(sum0, 0, autoc);
|
||||
vec_vsx_st(sum1, 16, autoc);
|
||||
vec_vsx_st(sum2, 32, autoc);
|
||||
vec_vsx_st(sum3, 48, autoc);
|
||||
vec_vsx_st(sum4, 64, autoc);
|
||||
vec_vsx_st(sum5, 80, autoc);
|
||||
|
||||
for (; i < (long)data_len; i++) {
|
||||
uint32_t coeff;
|
||||
|
||||
FLAC__real d = data[i];
|
||||
for (coeff = 0; coeff < data_len - i; coeff++)
|
||||
autoc[coeff] += d * data[i+coeff];
|
||||
}
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 10
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
|
||||
}
|
||||
|
||||
__attribute__((target("cpu=power8")))
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with POWERPC-specific
|
||||
// vector functions up to a lag of 8 (or max LPC order of 7)
|
||||
// For explanation, please see the lag_14 version of this function
|
||||
long i;
|
||||
long limit = (long)data_len - 8;
|
||||
const FLAC__real *base;
|
||||
vector double sum0 = { 0.0f, 0.0f};
|
||||
vector double sum1 = { 0.0f, 0.0f};
|
||||
vector double sum2 = { 0.0f, 0.0f};
|
||||
vector double sum3 = { 0.0f, 0.0f};
|
||||
vector double sum10 = { 0.0f, 0.0f};
|
||||
vector double sum11 = { 0.0f, 0.0f};
|
||||
vector double sum12 = { 0.0f, 0.0f};
|
||||
vector double sum13 = { 0.0f, 0.0f};
|
||||
vector float dtemp;
|
||||
vector double d0, d1, d2, d3;
|
||||
#if WORDS_BIGENDIAN
|
||||
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
|
||||
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
|
||||
#else
|
||||
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
|
||||
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 8);
|
||||
|
||||
base = data;
|
||||
if(limit > 0){
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d0 = vec_doubleh(dtemp);
|
||||
d1 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(16, base);
|
||||
d2 = vec_doubleh(dtemp);
|
||||
d3 = vec_doublel(dtemp);
|
||||
|
||||
base += 8;
|
||||
|
||||
for (i = 0; i <= (limit-2); i += 2) {
|
||||
vector double d, d4;
|
||||
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d4 = vec_doubleh(dtemp);
|
||||
base += 2;
|
||||
|
||||
d = vec_splat(d0, 0);
|
||||
sum0 += d0 * d;
|
||||
sum1 += d1 * d;
|
||||
sum2 += d2 * d;
|
||||
sum3 += d3 * d;
|
||||
|
||||
d = vec_splat(d0, 1);
|
||||
d0 = vec_sel(d0, d4, vsel);
|
||||
sum10 += d0 * d;
|
||||
sum11 += d1 * d;
|
||||
sum12 += d2 * d;
|
||||
sum13 += d3 * d;
|
||||
|
||||
d0 = d1;
|
||||
d1 = d2;
|
||||
d2 = d3;
|
||||
d3 = d4;
|
||||
}
|
||||
|
||||
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
|
||||
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
|
||||
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
|
||||
sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm);
|
||||
|
||||
}else{
|
||||
i = 0;
|
||||
}
|
||||
|
||||
vec_vsx_st(sum0, 0, autoc);
|
||||
vec_vsx_st(sum1, 16, autoc);
|
||||
vec_vsx_st(sum2, 32, autoc);
|
||||
vec_vsx_st(sum3, 48, autoc);
|
||||
|
||||
for (; i < (long)data_len; i++) {
|
||||
uint32_t coeff;
|
||||
|
||||
FLAC__real d = data[i];
|
||||
for (coeff = 0; coeff < data_len - i; coeff++)
|
||||
autoc[coeff] += d * data[i+coeff];
|
||||
}
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 8
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
|
||||
}
|
||||
#endif /* FLAC__HAS_TARGET_POWER8 */
|
||||
|
||||
|
@ -380,312 +75,25 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real
|
|||
__attribute__((target("cpu=power9")))
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with POWERPC-specific
|
||||
// vector functions up to a lag of 14 (or max LPC order of 13)
|
||||
// For explanation, please see the power8 version of this function
|
||||
long i;
|
||||
long limit = (long)data_len - 14;
|
||||
const FLAC__real *base;
|
||||
vector double sum0 = { 0.0f, 0.0f};
|
||||
vector double sum1 = { 0.0f, 0.0f};
|
||||
vector double sum2 = { 0.0f, 0.0f};
|
||||
vector double sum3 = { 0.0f, 0.0f};
|
||||
vector double sum4 = { 0.0f, 0.0f};
|
||||
vector double sum5 = { 0.0f, 0.0f};
|
||||
vector double sum6 = { 0.0f, 0.0f};
|
||||
vector double sum10 = { 0.0f, 0.0f};
|
||||
vector double sum11 = { 0.0f, 0.0f};
|
||||
vector double sum12 = { 0.0f, 0.0f};
|
||||
vector double sum13 = { 0.0f, 0.0f};
|
||||
vector double sum14 = { 0.0f, 0.0f};
|
||||
vector double sum15 = { 0.0f, 0.0f};
|
||||
vector double sum16 = { 0.0f, 0.0f};
|
||||
vector float dtemp;
|
||||
vector double d0, d1, d2, d3, d4, d5, d6;
|
||||
#if WORDS_BIGENDIAN
|
||||
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
|
||||
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
|
||||
#else
|
||||
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
|
||||
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 14);
|
||||
|
||||
base = data;
|
||||
if(limit > 2){
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d0 = vec_doubleh(dtemp);
|
||||
d1 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(16, base);
|
||||
d2 = vec_doubleh(dtemp);
|
||||
d3 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(32, base);
|
||||
d4 = vec_doubleh(dtemp);
|
||||
d5 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(48, base);
|
||||
d6 = vec_doubleh(dtemp);
|
||||
|
||||
base += 14;
|
||||
|
||||
for (i = 0; i <= (limit-2); i += 2) {
|
||||
vector double d, d7;
|
||||
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d7 = vec_doubleh(dtemp);
|
||||
base += 2;
|
||||
|
||||
d = vec_splat(d0, 0);
|
||||
sum0 += d0 * d;
|
||||
sum1 += d1 * d;
|
||||
sum2 += d2 * d;
|
||||
sum3 += d3 * d;
|
||||
sum4 += d4 * d;
|
||||
sum5 += d5 * d;
|
||||
sum6 += d6 * d;
|
||||
|
||||
d = vec_splat(d0, 1);
|
||||
d0 = vec_sel(d0, d7, vsel);
|
||||
sum10 += d0 * d;
|
||||
sum11 += d1 * d;
|
||||
sum12 += d2 * d;
|
||||
sum13 += d3 * d;
|
||||
sum14 += d4 * d;
|
||||
sum15 += d5 * d;
|
||||
sum16 += d6 * d;
|
||||
|
||||
d0 = d1;
|
||||
d1 = d2;
|
||||
d2 = d3;
|
||||
d3 = d4;
|
||||
d4 = d5;
|
||||
d5 = d6;
|
||||
d6 = d7;
|
||||
}
|
||||
|
||||
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
|
||||
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
|
||||
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
|
||||
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
|
||||
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
|
||||
sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm);
|
||||
sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm);
|
||||
}else{
|
||||
i = 0;
|
||||
}
|
||||
|
||||
vec_vsx_st(sum0, 0, autoc);
|
||||
vec_vsx_st(sum1, 16, autoc);
|
||||
vec_vsx_st(sum2, 32, autoc);
|
||||
vec_vsx_st(sum3, 48, autoc);
|
||||
vec_vsx_st(sum4, 64, autoc);
|
||||
vec_vsx_st(sum5, 80, autoc);
|
||||
vec_vsx_st(sum6, 96, autoc);
|
||||
|
||||
for (; i < (long)data_len; i++) {
|
||||
uint32_t coeff;
|
||||
|
||||
FLAC__real d = data[i];
|
||||
for (coeff = 0; coeff < data_len - i; coeff++)
|
||||
autoc[coeff] += d * data[i+coeff];
|
||||
}
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 14
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
|
||||
}
|
||||
|
||||
__attribute__((target("cpu=power9")))
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with POWERPC-specific
|
||||
// vector functions up to a lag of 12 (or max LPC order of 11)
|
||||
// For explanation, please see the power9, lag_14 version of this function
|
||||
long i;
|
||||
long limit = (long)data_len - 12;
|
||||
const FLAC__real *base;
|
||||
vector double sum0 = { 0.0f, 0.0f};
|
||||
vector double sum1 = { 0.0f, 0.0f};
|
||||
vector double sum2 = { 0.0f, 0.0f};
|
||||
vector double sum3 = { 0.0f, 0.0f};
|
||||
vector double sum4 = { 0.0f, 0.0f};
|
||||
vector double sum5 = { 0.0f, 0.0f};
|
||||
vector double sum10 = { 0.0f, 0.0f};
|
||||
vector double sum11 = { 0.0f, 0.0f};
|
||||
vector double sum12 = { 0.0f, 0.0f};
|
||||
vector double sum13 = { 0.0f, 0.0f};
|
||||
vector double sum14 = { 0.0f, 0.0f};
|
||||
vector double sum15 = { 0.0f, 0.0f};
|
||||
vector float dtemp;
|
||||
vector double d0, d1, d2, d3, d4, d5;
|
||||
#if WORDS_BIGENDIAN
|
||||
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
|
||||
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
|
||||
#else
|
||||
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
|
||||
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 12);
|
||||
|
||||
base = data;
|
||||
if(limit > 0){
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d0 = vec_doubleh(dtemp);
|
||||
d1 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(16, base);
|
||||
d2 = vec_doubleh(dtemp);
|
||||
d3 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(32, base);
|
||||
d4 = vec_doubleh(dtemp);
|
||||
d5 = vec_doublel(dtemp);
|
||||
|
||||
base += 12;
|
||||
|
||||
for (i = 0; i <= (limit-2); i += 2) {
|
||||
vector double d, d6;
|
||||
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d6 = vec_doubleh(dtemp);
|
||||
base += 2;
|
||||
|
||||
d = vec_splat(d0, 0);
|
||||
sum0 += d0 * d;
|
||||
sum1 += d1 * d;
|
||||
sum2 += d2 * d;
|
||||
sum3 += d3 * d;
|
||||
sum4 += d4 * d;
|
||||
sum5 += d5 * d;
|
||||
|
||||
d = vec_splat(d0, 1);
|
||||
d0 = vec_sel(d0, d6, vsel);
|
||||
sum10 += d0 * d;
|
||||
sum11 += d1 * d;
|
||||
sum12 += d2 * d;
|
||||
sum13 += d3 * d;
|
||||
sum14 += d4 * d;
|
||||
sum15 += d5 * d;
|
||||
|
||||
d0 = d1;
|
||||
d1 = d2;
|
||||
d2 = d3;
|
||||
d3 = d4;
|
||||
d4 = d5;
|
||||
d5 = d6;
|
||||
}
|
||||
|
||||
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
|
||||
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
|
||||
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
|
||||
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
|
||||
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
|
||||
sum5 += vec_perm(sum15, sum10, (vector unsigned char)vperm);
|
||||
}else{
|
||||
i = 0;
|
||||
}
|
||||
|
||||
vec_vsx_st(sum0, 0, autoc);
|
||||
vec_vsx_st(sum1, 16, autoc);
|
||||
vec_vsx_st(sum2, 32, autoc);
|
||||
vec_vsx_st(sum3, 48, autoc);
|
||||
vec_vsx_st(sum4, 64, autoc);
|
||||
vec_vsx_st(sum5, 80, autoc);
|
||||
|
||||
for (; i < (long)data_len; i++) {
|
||||
uint32_t coeff;
|
||||
|
||||
FLAC__real d = data[i];
|
||||
for (coeff = 0; coeff < data_len - i; coeff++)
|
||||
autoc[coeff] += d * data[i+coeff];
|
||||
}
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 10
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
|
||||
}
|
||||
|
||||
__attribute__((target("cpu=power9")))
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
// This function calculates autocorrelation with POWERPC-specific
|
||||
// vector functions up to a lag of 8 (or max LPC order of 7)
|
||||
// For explanation, please see the power9, lag_14 version of this function
|
||||
long i;
|
||||
long limit = (long)data_len - 8;
|
||||
const FLAC__real *base;
|
||||
vector double sum0 = { 0.0f, 0.0f};
|
||||
vector double sum1 = { 0.0f, 0.0f};
|
||||
vector double sum2 = { 0.0f, 0.0f};
|
||||
vector double sum3 = { 0.0f, 0.0f};
|
||||
vector double sum10 = { 0.0f, 0.0f};
|
||||
vector double sum11 = { 0.0f, 0.0f};
|
||||
vector double sum12 = { 0.0f, 0.0f};
|
||||
vector double sum13 = { 0.0f, 0.0f};
|
||||
vector float dtemp;
|
||||
vector double d0, d1, d2, d3;
|
||||
#if WORDS_BIGENDIAN
|
||||
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
|
||||
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
|
||||
#else
|
||||
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
|
||||
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
|
||||
#endif
|
||||
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= 8);
|
||||
|
||||
base = data;
|
||||
if(limit > 0){
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d0 = vec_doubleh(dtemp);
|
||||
d1 = vec_doublel(dtemp);
|
||||
dtemp = vec_vsx_ld(16, base);
|
||||
d2 = vec_doubleh(dtemp);
|
||||
d3 = vec_doublel(dtemp);
|
||||
|
||||
base += 8;
|
||||
|
||||
for (i = 0; i <= (limit-2); i += 2) {
|
||||
vector double d, d4;
|
||||
|
||||
dtemp = vec_vsx_ld(0, base);
|
||||
d4 = vec_doubleh(dtemp);
|
||||
base += 2;
|
||||
|
||||
d = vec_splat(d0, 0);
|
||||
sum0 += d0 * d;
|
||||
sum1 += d1 * d;
|
||||
sum2 += d2 * d;
|
||||
sum3 += d3 * d;
|
||||
|
||||
d = vec_splat(d0, 1);
|
||||
d0 = vec_sel(d0, d4, vsel);
|
||||
sum10 += d0 * d;
|
||||
sum11 += d1 * d;
|
||||
sum12 += d2 * d;
|
||||
sum13 += d3 * d;
|
||||
|
||||
d0 = d1;
|
||||
d1 = d2;
|
||||
d2 = d3;
|
||||
d3 = d4;
|
||||
}
|
||||
|
||||
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
|
||||
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
|
||||
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
|
||||
sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm);
|
||||
|
||||
}else{
|
||||
i = 0;
|
||||
}
|
||||
|
||||
vec_vsx_st(sum0, 0, autoc);
|
||||
vec_vsx_st(sum1, 16, autoc);
|
||||
vec_vsx_st(sum2, 32, autoc);
|
||||
vec_vsx_st(sum3, 48, autoc);
|
||||
|
||||
for (; i < (long)data_len; i++) {
|
||||
uint32_t coeff;
|
||||
|
||||
FLAC__real d = data[i];
|
||||
for (coeff = 0; coeff < data_len - i; coeff++)
|
||||
autoc[coeff] += d * data[i+coeff];
|
||||
}
|
||||
#undef MAX_LAG
|
||||
#define MAX_LAG 8
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
|
||||
}
|
||||
#endif /* FLAC__HAS_TARGET_POWER9 */
|
||||
|
||||
|
|
|
@ -887,8 +887,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
|||
if (encoder->private_->cpuinfo.ppc.arch_3_00) {
|
||||
if(encoder->protected_->max_lpc_order < 8)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8;
|
||||
else if(encoder->protected_->max_lpc_order < 12)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12;
|
||||
else if(encoder->protected_->max_lpc_order < 10)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10;
|
||||
else if(encoder->protected_->max_lpc_order < 14)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14;
|
||||
else
|
||||
|
@ -898,8 +898,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
|||
if (encoder->private_->cpuinfo.ppc.arch_2_07) {
|
||||
if(encoder->protected_->max_lpc_order < 8)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8;
|
||||
else if(encoder->protected_->max_lpc_order < 12)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12;
|
||||
else if(encoder->protected_->max_lpc_order < 10)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10;
|
||||
else if(encoder->protected_->max_lpc_order < 14)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14;
|
||||
else
|
||||
|
|
Loading…
Reference in New Issue