Deduplicate VSX and SSE2 autocorrelation calculation code

This commit is contained in:
Martijn van Beurden 2022-04-18 21:24:54 +02:00
parent ee18d1b892
commit b48ed95fcb
7 changed files with 298 additions and 768 deletions

View File

@ -82,7 +82,9 @@ EXTRA_DIST = \
libFLAC_static.vcxproj \
libFLAC_static.vcxproj.filters \
libFLAC.m4 \
windows_unicode_filenames.c
windows_unicode_filenames.c \
deduplication/lpc_compute_autocorrelation_intrin_sse2.c \
deduplication/lpc_compute_autocorrelation_intrin_vsx.c
if OS_IS_WINDOWS
windows_unicode_compat = windows_unicode_filenames.c

View File

@ -0,0 +1,81 @@
/* Shared function body for the SSE2 autocorrelation routines. This file is
 * #included several times in lpc_intrin_sse2.c, each time inside a function
 * taking (data, data_len, lag, autoc) and with a different value of MAX_LAG
 * (8, 10 or 14). Every __m128d holds two doubles, so MAX_LAG/2 accumulators
 * are used and autoc[0..MAX_LAG-1] is always written, whatever the value of
 * lag. Comments are for MAX_LAG == 14 */
int i;
/* sumN accumulates the results for autoc[2N] and autoc[2N+1] */
__m128d sum0, sum1, sum2, sum3;
/* d0..d6 hold the most recently visited samples, two per vector */
__m128d d0, d1, d2, d3;
#if MAX_LAG > 8
__m128d d4;
__m128d sum4;
#endif
#if MAX_LAG > 10
__m128d d5, d6;
__m128d sum5, sum6;
#endif
/* lag is only referenced by the assertion below; the cast is presumably
 * there to avoid an unused-parameter warning when assertions compile out */
(void) lag;
FLAC__ASSERT(lag <= MAX_LAG);
/* Initialize all sum vectors and the sample queue with zero */
sum0 = _mm_setzero_pd();
sum1 = _mm_setzero_pd();
sum2 = _mm_setzero_pd();
sum3 = _mm_setzero_pd();
d0 = _mm_setzero_pd();
d1 = _mm_setzero_pd();
d2 = _mm_setzero_pd();
d3 = _mm_setzero_pd();
#if MAX_LAG > 8
sum4 = _mm_setzero_pd();
d4 = _mm_setzero_pd();
#endif
#if MAX_LAG > 10
sum5 = _mm_setzero_pd();
sum6 = _mm_setzero_pd();
d5 = _mm_setzero_pd();
d6 = _mm_setzero_pd();
#endif
/* Loop backwards through the samples, from data[data_len-1] down to data[0] */
for(i = data_len-1; i >= 0; i--) {
/* Both elements of d are set to data[i] */
__m128d d = _mm_set1_pd(data[i]);
/* The next lines of code work like a queue spread over d0..d6. Every
 * vector takes the second element of its predecessor as its new first
 * element and moves its own first element to the second position, so
 * all items shift one place, the oldest item is dropped and data[i]
 * enters at the front. Afterwards d0 = {data[i], data[i+1]},
 * d1 = {data[i+2], data[i+3]}, ...; positions past the end of the data
 * still hold the initial zeros. The shifts run from the last vector to
 * the first so each one reads its predecessor before it is overwritten */
#if MAX_LAG > 10
d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1));
d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1));
#endif
#if MAX_LAG > 8
d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
#endif
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1));
/* sumn += d*dn, i.e. data[i] times data[i+2n] and data[i+2n+1] */
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
#if MAX_LAG > 8
sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
#endif
#if MAX_LAG > 10
sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5));
sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6));
#endif
}
/* Store sum0..sum6 in autoc[0..13] (unaligned stores, two doubles each) */
_mm_storeu_pd(autoc, sum0);
_mm_storeu_pd(autoc+2, sum1);
_mm_storeu_pd(autoc+4, sum2);
_mm_storeu_pd(autoc+6 ,sum3);
#if MAX_LAG > 8
_mm_storeu_pd(autoc+8, sum4);
#endif
#if MAX_LAG > 10
_mm_storeu_pd(autoc+10,sum5);
_mm_storeu_pd(autoc+12,sum6);
#endif

View File

@ -0,0 +1,179 @@
/* Shared function body for the POWER8/POWER9 VSX autocorrelation routines.
 * This file is #included several times in lpc_intrin_vsx.c, each time inside
 * a function taking (data, data_len, lag, autoc) and with a different value
 * of MAX_LAG (8, 10 or 14; anything else hits the #error below).
 * The main loop handles two samples per iteration: products with data[i] go
 * to sum0..sum6, products with data[i+1] go to sum10..sum16, and the two
 * sets are merged after the loop. Samples the vector loop did not cover are
 * handled by a scalar loop at the end.
 * Comments are for MAX_LAG == 14 */
long i;
/* Number of samples in excess of MAX_LAG; negative for very short input */
long limit = (long)data_len - MAX_LAG;
const FLAC__real *base;
/* d0..d6 hold a sliding window of MAX_LAG samples converted to double */
vector double d0, d1, d2, d3;
vector double sum0 = { 0.0f, 0.0f};
vector double sum10 = { 0.0f, 0.0f};
vector double sum1 = { 0.0f, 0.0f};
vector double sum11 = { 0.0f, 0.0f};
vector double sum2 = { 0.0f, 0.0f};
vector double sum12 = { 0.0f, 0.0f};
vector double sum3 = { 0.0f, 0.0f};
vector double sum13 = { 0.0f, 0.0f};
#if MAX_LAG > 8
vector double d4;
vector double sum4 = { 0.0f, 0.0f};
vector double sum14 = { 0.0f, 0.0f};
#endif
#if MAX_LAG > 10
vector double d5, d6;
vector double sum5 = { 0.0f, 0.0f};
vector double sum15 = { 0.0f, 0.0f};
vector double sum6 = { 0.0f, 0.0f};
vector double sum16 = { 0.0f, 0.0f};
#endif
vector float dtemp;
/* vperm is the vec_perm pattern used after the loop to build a vector from
 * the second element of one accumulator and the first element of the next.
 * vsel is the vec_sel mask used inside the loop to replace one element of
 * d0 with an element of the newly loaded vector.
 * NOTE(review): the two vsel variants pick opposite elements; the comments
 * in the loop describe the little-endian mask - confirm that the big-endian
 * mask yields the same elements on real hardware */
#if WORDS_BIGENDIAN
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
#else
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
#endif
/* lag is only referenced by the assertion below; the cast is presumably
 * there to avoid an unused-parameter warning when assertions compile out */
(void) lag;
FLAC__ASSERT(lag <= MAX_LAG);
base = data;
/* Only take the vector path when there are more than MAX_LAG + 2 samples.
 * The initial loads below fetch whole 16-byte vectors, which for
 * MAX_LAG == 14 means 16 elements at once */
if(limit > 2){
/* Convert all floats to doubles */
dtemp = vec_vsx_ld(0, base);
d0 = vec_doubleh(dtemp);
d1 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(16, base);
d2 = vec_doubleh(dtemp);
d3 = vec_doublel(dtemp);
#if MAX_LAG > 8
dtemp = vec_vsx_ld(32, base);
d4 = vec_doubleh(dtemp);
#endif
#if MAX_LAG > 10
d5 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(48, base);
d6 = vec_doubleh(dtemp);
#endif
/* base now points at the first sample that is not yet in d0..d6 */
base += MAX_LAG;
/* Loop until nearing data_len */
for (i = 0; i <= (limit-2); i += 2) {
vector double d, dnext;
/* Load next 2 datapoints and convert to double
 * for lag 14 that is data[i+14] and data[i+15].
 * NOTE(review): the load fetches a full 16-byte vector of which only
 * the half converted by vec_doubleh is used; on the last iterations
 * the unused half appears to lie beyond data[data_len-1] - confirm
 * that the buffer passed in is padded accordingly */
dtemp = vec_vsx_ld(0, base);
dnext = vec_doubleh(dtemp);
base += 2;
/* Create vector d with both elements set to the first
 * element of d0, so both elements data[i] */
d = vec_splat(d0, 0);
sum0 += d0 * d; // Multiply data[i] with data[i] and data[i+1]
sum1 += d1 * d; // Multiply data[i] with data[i+2] and data[i+3]
sum2 += d2 * d; // Multiply data[i] with data[i+4] and data[i+5]
sum3 += d3 * d; // Multiply data[i] with data[i+6] and data[i+7]
#if MAX_LAG > 8
sum4 += d4 * d; // Multiply data[i] with data[i+8] and data[i+9]
#endif
#if MAX_LAG > 10
sum5 += d5 * d; // Multiply data[i] with data[i+10] and data[i+11]
sum6 += d6 * d; // Multiply data[i] with data[i+12] and data[i+13]
#endif
/* Set both elements of d to data[i+1]. This has to happen before d0
 * is modified by vec_sel below */
d = vec_splat(d0, 1);
/* Set d0 to data[i+14] and data[i+1] */
d0 = vec_sel(d0, dnext, vsel);
sum10 += d0 * d; /* Multiply data[i+1] with data[i+14] and data[i+1] */
sum11 += d1 * d; /* Multiply data[i+1] with data[i+2] and data[i+3] */
sum12 += d2 * d;
sum13 += d3 * d;
#if MAX_LAG > 8
sum14 += d4 * d;
#endif
#if MAX_LAG > 10
sum15 += d5 * d;
sum16 += d6 * d; /* Multiply data[i+1] with data[i+12] and data[i+13] */
#endif
/* Shift all loaded values one vector (2 elements) so the next
 * iterations aligns again */
d0 = d1;
d1 = d2;
d2 = d3;
#if MAX_LAG > 8
d3 = d4;
#endif
#if MAX_LAG > 10
d4 = d5;
d5 = d6;
#endif
/* The newly loaded pair becomes the last vector of the window; which
 * vector that is depends on MAX_LAG */
#if MAX_LAG == 8
d3 = dnext;
#elif MAX_LAG == 10
d4 = dnext;
#elif MAX_LAG == 14
d6 = dnext;
#else
#error "Unsupported lag";
#endif
}
/* Because the values in sum10..sum16 do not align with
 * the values in sum0..sum6, these need to be 'left-rotated'
 * before adding them to sum0..sum6: each vec_perm combines the second
 * element of one accumulator with the first element of the next */
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
#if MAX_LAG > 8
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
#endif
#if MAX_LAG > 10
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm);
#endif
/* The last accumulator wraps around: its partner is the first element of
 * sum10, which holds the products with the sample selected from dnext */
#if MAX_LAG == 8
sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm);
#elif MAX_LAG == 10
sum4 += vec_perm(sum14, sum10, (vector unsigned char)vperm);
#elif MAX_LAG == 14
sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm);
#else
#error "Unsupported lag";
#endif
}else{
/* Too few samples for the vector path: the sums stay zero and the
 * scalar loop below processes everything */
i = 0;
}
/* Store result: autoc[0..MAX_LAG-1] is written, two doubles per store
 * (offsets are in bytes) */
vec_vsx_st(sum0, 0, autoc);
vec_vsx_st(sum1, 16, autoc);
vec_vsx_st(sum2, 32, autoc);
vec_vsx_st(sum3, 48, autoc);
#if MAX_LAG > 8
vec_vsx_st(sum4, 64, autoc);
#endif
#if MAX_LAG > 10
vec_vsx_st(sum5, 80, autoc);
vec_vsx_st(sum6, 96, autoc);
#endif
/* Process remainder of samples in a non-VSX way, continuing from the first
 * sample the vector loop did not handle.
 * NOTE(review): data_len - i can exceed MAX_LAG here (by one after the
 * vector loop when limit is odd, by up to two on the scalar-only path), so
 * this loop can also add to autoc[MAX_LAG] and autoc[MAX_LAG+1], which the
 * stores above did not write - presumably autoc is large enough and callers
 * ignore those entries; verify against the caller */
for (; i < (long)data_len; i++) {
uint32_t coeff;
FLAC__real d = data[i];
for (coeff = 0; coeff < data_len - i; coeff++)
autoc[coeff] += d * data[i+coeff];
}

View File

@ -80,12 +80,12 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[
#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
#ifdef FLAC__HAS_TARGET_POWER9
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
#endif
#ifdef FLAC__HAS_TARGET_POWER8
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
#endif
#endif

View File

@ -54,166 +54,26 @@
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with SSE2
// vector functions up to a lag of 10 (or max LPC order of 9)
int i;
__m128d sum0, sum1, sum2, sum3;
__m128d d0, d1, d2, d3;
(void) lag;
FLAC__ASSERT(lag <= 8);
// Initialize all sum vectors with zero
sum0 = _mm_setzero_pd();
sum1 = _mm_setzero_pd();
sum2 = _mm_setzero_pd();
sum3 = _mm_setzero_pd();
d0 = _mm_setzero_pd();
d1 = _mm_setzero_pd();
d2 = _mm_setzero_pd();
d3 = _mm_setzero_pd();
// Loop backwards through samples from data_len to limit
for(i = data_len-1; i >= 0; i--) {
__m128d d = _mm_set1_pd(data[i]); // both elements of d are set to data[i]
// The next lines of code work like a queue. The queue
// is spread over vectors d0..d3. All items are shifted
// one position, the last item (data[i+9]) is dequeued
// and a new first item is added (data[i])
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1)); // d3 is made of second element of d2 and first element of d3
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1)); // d2 is made of second element of d1 and first element of d2
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1)); // d1 is made of second element of d0 and first element of d1
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1)); // d0 is made of second element of d and first element of d0
// sumn += d*dn
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
}
// Store sum0..sum6 in autoc[0..14]
_mm_storeu_pd(autoc, sum0);
_mm_storeu_pd(autoc+2, sum1);
_mm_storeu_pd(autoc+4, sum2);
_mm_storeu_pd(autoc+6 ,sum3);
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with SSE2
// vector functions up to a lag of 10 (or max LPC order of 9)
int i;
__m128d sum0, sum1, sum2, sum3, sum4;
__m128d d0, d1, d2, d3, d4;
(void) lag;
FLAC__ASSERT(lag <= 10);
// Initialize all sum vectors with zero
sum0 = _mm_setzero_pd();
sum1 = _mm_setzero_pd();
sum2 = _mm_setzero_pd();
sum3 = _mm_setzero_pd();
sum4 = _mm_setzero_pd();
d0 = _mm_setzero_pd();
d1 = _mm_setzero_pd();
d2 = _mm_setzero_pd();
d3 = _mm_setzero_pd();
d4 = _mm_setzero_pd();
// Loop backwards through samples from data_len to limit
for(i = data_len-1; i >= 0; i--) {
__m128d d = _mm_set1_pd(data[i]);
// The next lines of code work like a queue. For more
// information see the lag8 version of this function
d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1));
// sumn += d*dn
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
}
// Store sum0..sum6 in autoc[0..14]
_mm_storeu_pd(autoc, sum0);
_mm_storeu_pd(autoc+2, sum1);
_mm_storeu_pd(autoc+4, sum2);
_mm_storeu_pd(autoc+6 ,sum3);
_mm_storeu_pd(autoc+8, sum4);
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with SSE2
// vector functions up to a lag of 14 (or max LPC order of 13)
int i;
__m128d sum0, sum1, sum2, sum3, sum4, sum5, sum6;
__m128d d0, d1, d2, d3, d4, d5, d6;
(void) lag;
FLAC__ASSERT(lag <= 14);
// Initialize all sum vectors with zero
sum0 = _mm_setzero_pd();
sum1 = _mm_setzero_pd();
sum2 = _mm_setzero_pd();
sum3 = _mm_setzero_pd();
sum4 = _mm_setzero_pd();
sum5 = _mm_setzero_pd();
sum6 = _mm_setzero_pd();
d0 = _mm_setzero_pd();
d1 = _mm_setzero_pd();
d2 = _mm_setzero_pd();
d3 = _mm_setzero_pd();
d4 = _mm_setzero_pd();
d5 = _mm_setzero_pd();
d6 = _mm_setzero_pd();
// Loop backwards through samples from data_len to limit
for(i = data_len-1; i >= 0; i--) {
__m128d d = _mm_set1_pd(data[i]);
// The next lines of code work like a queue. For more
// information see the lag8 version of this function
d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1));
d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1));
d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
d0 = _mm_shuffle_pd(d, d0, _MM_SHUFFLE(0,0,0,1));
// sumn += d*dn
sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5));
sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6));
}
// Store sum0..sum6 in autoc[0..14]
_mm_storeu_pd(autoc, sum0);
_mm_storeu_pd(autoc+2, sum1);
_mm_storeu_pd(autoc+4, sum2);
_mm_storeu_pd(autoc+6 ,sum3);
_mm_storeu_pd(autoc+8, sum4);
_mm_storeu_pd(autoc+10,sum5);
_mm_storeu_pd(autoc+12,sum6);
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_sse2.c"
}
FLAC__SSE_TARGET("sse2")

View File

@ -49,330 +49,25 @@
__attribute__((target("cpu=power8")))
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with POWERPC-specific
// vector functions up to a lag of 14 (or max LPC order of 13)
long i;
long limit = (long)data_len - 14;
const FLAC__real *base;
vector double sum0 = { 0.0f, 0.0f};
vector double sum1 = { 0.0f, 0.0f};
vector double sum2 = { 0.0f, 0.0f};
vector double sum3 = { 0.0f, 0.0f};
vector double sum4 = { 0.0f, 0.0f};
vector double sum5 = { 0.0f, 0.0f};
vector double sum6 = { 0.0f, 0.0f};
vector double sum10 = { 0.0f, 0.0f};
vector double sum11 = { 0.0f, 0.0f};
vector double sum12 = { 0.0f, 0.0f};
vector double sum13 = { 0.0f, 0.0f};
vector double sum14 = { 0.0f, 0.0f};
vector double sum15 = { 0.0f, 0.0f};
vector double sum16 = { 0.0f, 0.0f};
vector float dtemp;
vector double d0, d1, d2, d3, d4, d5, d6;
#if WORDS_BIGENDIAN
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
#else
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
#endif
(void) lag;
FLAC__ASSERT(lag <= 14);
base = data;
// First, check whether it is possible to load
// 16 elements at once
if(limit > 2){
// Convert all floats to doubles
dtemp = vec_vsx_ld(0, base);
d0 = vec_doubleh(dtemp);
d1 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(16, base);
d2 = vec_doubleh(dtemp);
d3 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(32, base);
d4 = vec_doubleh(dtemp);
d5 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(48, base);
d6 = vec_doubleh(dtemp);
base += 14;
// Loop until nearing data_len
for (i = 0; i <= (limit-2); i += 2) {
vector double d, d7;
// Load next 2 datapoints and convert to double
// data[i+14] and data[i+15]
dtemp = vec_vsx_ld(0, base);
d7 = vec_doubleh(dtemp);
base += 2;
// Create vector d with both elements set to the first
// element of d0, so both elements data[i]
d = vec_splat(d0, 0);
sum0 += d0 * d; // Multiply data[i] with data[i] and data[i+1]
sum1 += d1 * d; // Multiply data[i] with data[i+2] and data[i+3]
sum2 += d2 * d; // Multiply data[i] with data[i+4] and data[i+5]
sum3 += d3 * d; // Multiply data[i] with data[i+6] and data[i+7]
sum4 += d4 * d; // Multiply data[i] with data[i+8] and data[i+9]
sum5 += d5 * d; // Multiply data[i] with data[i+10] and data[i+11]
sum6 += d6 * d; // Multiply data[i] with data[i+12] and data[i+13]
// Set both elements of d to data[i+1]
d = vec_splat(d0, 1);
// Set d0 to data[i+14] and data[i+1]
d0 = vec_sel(d0, d7, vsel);
sum10 += d0 * d; // Multiply data[i+1] with data[i+14] and data[i+1]
sum11 += d1 * d; // Multiply data[i+1] with data[i+2] and data[i+3]
sum12 += d2 * d;
sum13 += d3 * d;
sum14 += d4 * d;
sum15 += d5 * d;
sum16 += d6 * d; // Multiply data[i+1] with data[i+12] and data[i+13]
// Shift all loaded values one vector (2 elements) so the next
// iterations aligns again
d0 = d1;
d1 = d2;
d2 = d3;
d3 = d4;
d4 = d5;
d5 = d6;
d6 = d7;
}
// Because the values in sum10..sum16 do not align with
// the values in sum0..sum6, these need to be 'left-rotated'
// before adding them to sum0..sum6
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm);
sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm);
}else{
i = 0;
}
// Store result
vec_vsx_st(sum0, 0, autoc);
vec_vsx_st(sum1, 16, autoc);
vec_vsx_st(sum2, 32, autoc);
vec_vsx_st(sum3, 48, autoc);
vec_vsx_st(sum4, 64, autoc);
vec_vsx_st(sum5, 80, autoc);
vec_vsx_st(sum6, 96, autoc);
// Process remainder of samples in a non-VSX way
for (; i < (long)data_len; i++) {
uint32_t coeff;
FLAC__real d = data[i];
for (coeff = 0; coeff < data_len - i; coeff++)
autoc[coeff] += d * data[i+coeff];
}
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
}
__attribute__((target("cpu=power8")))
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with POWERPC-specific
// vector functions up to a lag of 12 (or max LPC order of 11)
// For explanation, please see the lag_14 version of this function
long i;
long limit = (long)data_len - 12;
const FLAC__real *base;
vector double sum0 = { 0.0f, 0.0f};
vector double sum1 = { 0.0f, 0.0f};
vector double sum2 = { 0.0f, 0.0f};
vector double sum3 = { 0.0f, 0.0f};
vector double sum4 = { 0.0f, 0.0f};
vector double sum5 = { 0.0f, 0.0f};
vector double sum10 = { 0.0f, 0.0f};
vector double sum11 = { 0.0f, 0.0f};
vector double sum12 = { 0.0f, 0.0f};
vector double sum13 = { 0.0f, 0.0f};
vector double sum14 = { 0.0f, 0.0f};
vector double sum15 = { 0.0f, 0.0f};
vector float dtemp;
vector double d0, d1, d2, d3, d4, d5;
#if WORDS_BIGENDIAN
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
#else
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
#endif
(void) lag;
FLAC__ASSERT(lag <= 12);
base = data;
if(limit > 0){
dtemp = vec_vsx_ld(0, base);
d0 = vec_doubleh(dtemp);
d1 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(16, base);
d2 = vec_doubleh(dtemp);
d3 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(32, base);
d4 = vec_doubleh(dtemp);
d5 = vec_doublel(dtemp);
base += 12;
for (i = 0; i <= (limit-2); i += 2) {
vector double d, d6;
dtemp = vec_vsx_ld(0, base);
d6 = vec_doubleh(dtemp);
base += 2;
d = vec_splat(d0, 0);
sum0 += d0 * d;
sum1 += d1 * d;
sum2 += d2 * d;
sum3 += d3 * d;
sum4 += d4 * d;
sum5 += d5 * d;
d = vec_splat(d0, 1);
d0 = vec_sel(d0, d6, vsel);
sum10 += d0 * d;
sum11 += d1 * d;
sum12 += d2 * d;
sum13 += d3 * d;
sum14 += d4 * d;
sum15 += d5 * d;
d0 = d1;
d1 = d2;
d2 = d3;
d3 = d4;
d4 = d5;
d5 = d6;
}
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
sum5 += vec_perm(sum15, sum10, (vector unsigned char)vperm);
}else{
i = 0;
}
vec_vsx_st(sum0, 0, autoc);
vec_vsx_st(sum1, 16, autoc);
vec_vsx_st(sum2, 32, autoc);
vec_vsx_st(sum3, 48, autoc);
vec_vsx_st(sum4, 64, autoc);
vec_vsx_st(sum5, 80, autoc);
for (; i < (long)data_len; i++) {
uint32_t coeff;
FLAC__real d = data[i];
for (coeff = 0; coeff < data_len - i; coeff++)
autoc[coeff] += d * data[i+coeff];
}
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
}
__attribute__((target("cpu=power8")))
void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with POWERPC-specific
// vector functions up to a lag of 8 (or max LPC order of 7)
// For explanation, please see the lag_14 version of this function
long i;
long limit = (long)data_len - 8;
const FLAC__real *base;
vector double sum0 = { 0.0f, 0.0f};
vector double sum1 = { 0.0f, 0.0f};
vector double sum2 = { 0.0f, 0.0f};
vector double sum3 = { 0.0f, 0.0f};
vector double sum10 = { 0.0f, 0.0f};
vector double sum11 = { 0.0f, 0.0f};
vector double sum12 = { 0.0f, 0.0f};
vector double sum13 = { 0.0f, 0.0f};
vector float dtemp;
vector double d0, d1, d2, d3;
#if WORDS_BIGENDIAN
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
#else
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
#endif
(void) lag;
FLAC__ASSERT(lag <= 8);
base = data;
if(limit > 0){
dtemp = vec_vsx_ld(0, base);
d0 = vec_doubleh(dtemp);
d1 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(16, base);
d2 = vec_doubleh(dtemp);
d3 = vec_doublel(dtemp);
base += 8;
for (i = 0; i <= (limit-2); i += 2) {
vector double d, d4;
dtemp = vec_vsx_ld(0, base);
d4 = vec_doubleh(dtemp);
base += 2;
d = vec_splat(d0, 0);
sum0 += d0 * d;
sum1 += d1 * d;
sum2 += d2 * d;
sum3 += d3 * d;
d = vec_splat(d0, 1);
d0 = vec_sel(d0, d4, vsel);
sum10 += d0 * d;
sum11 += d1 * d;
sum12 += d2 * d;
sum13 += d3 * d;
d0 = d1;
d1 = d2;
d2 = d3;
d3 = d4;
}
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm);
}else{
i = 0;
}
vec_vsx_st(sum0, 0, autoc);
vec_vsx_st(sum1, 16, autoc);
vec_vsx_st(sum2, 32, autoc);
vec_vsx_st(sum3, 48, autoc);
for (; i < (long)data_len; i++) {
uint32_t coeff;
FLAC__real d = data[i];
for (coeff = 0; coeff < data_len - i; coeff++)
autoc[coeff] += d * data[i+coeff];
}
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
}
#endif /* FLAC__HAS_TARGET_POWER8 */
@ -380,312 +75,25 @@ void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real
__attribute__((target("cpu=power9")))
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with POWERPC-specific
// vector functions up to a lag of 14 (or max LPC order of 13)
// For explanation, please see the power8 version of this function
long i;
long limit = (long)data_len - 14;
const FLAC__real *base;
vector double sum0 = { 0.0f, 0.0f};
vector double sum1 = { 0.0f, 0.0f};
vector double sum2 = { 0.0f, 0.0f};
vector double sum3 = { 0.0f, 0.0f};
vector double sum4 = { 0.0f, 0.0f};
vector double sum5 = { 0.0f, 0.0f};
vector double sum6 = { 0.0f, 0.0f};
vector double sum10 = { 0.0f, 0.0f};
vector double sum11 = { 0.0f, 0.0f};
vector double sum12 = { 0.0f, 0.0f};
vector double sum13 = { 0.0f, 0.0f};
vector double sum14 = { 0.0f, 0.0f};
vector double sum15 = { 0.0f, 0.0f};
vector double sum16 = { 0.0f, 0.0f};
vector float dtemp;
vector double d0, d1, d2, d3, d4, d5, d6;
#if WORDS_BIGENDIAN
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
#else
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
#endif
(void) lag;
FLAC__ASSERT(lag <= 14);
base = data;
if(limit > 2){
dtemp = vec_vsx_ld(0, base);
d0 = vec_doubleh(dtemp);
d1 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(16, base);
d2 = vec_doubleh(dtemp);
d3 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(32, base);
d4 = vec_doubleh(dtemp);
d5 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(48, base);
d6 = vec_doubleh(dtemp);
base += 14;
for (i = 0; i <= (limit-2); i += 2) {
vector double d, d7;
dtemp = vec_vsx_ld(0, base);
d7 = vec_doubleh(dtemp);
base += 2;
d = vec_splat(d0, 0);
sum0 += d0 * d;
sum1 += d1 * d;
sum2 += d2 * d;
sum3 += d3 * d;
sum4 += d4 * d;
sum5 += d5 * d;
sum6 += d6 * d;
d = vec_splat(d0, 1);
d0 = vec_sel(d0, d7, vsel);
sum10 += d0 * d;
sum11 += d1 * d;
sum12 += d2 * d;
sum13 += d3 * d;
sum14 += d4 * d;
sum15 += d5 * d;
sum16 += d6 * d;
d0 = d1;
d1 = d2;
d2 = d3;
d3 = d4;
d4 = d5;
d5 = d6;
d6 = d7;
}
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
sum5 += vec_perm(sum15, sum16, (vector unsigned char)vperm);
sum6 += vec_perm(sum16, sum10, (vector unsigned char)vperm);
}else{
i = 0;
}
vec_vsx_st(sum0, 0, autoc);
vec_vsx_st(sum1, 16, autoc);
vec_vsx_st(sum2, 32, autoc);
vec_vsx_st(sum3, 48, autoc);
vec_vsx_st(sum4, 64, autoc);
vec_vsx_st(sum5, 80, autoc);
vec_vsx_st(sum6, 96, autoc);
for (; i < (long)data_len; i++) {
uint32_t coeff;
FLAC__real d = data[i];
for (coeff = 0; coeff < data_len - i; coeff++)
autoc[coeff] += d * data[i+coeff];
}
#undef MAX_LAG
#define MAX_LAG 14
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
}
__attribute__((target("cpu=power9")))
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with POWERPC-specific
// vector functions up to a lag of 12 (or max LPC order of 11)
// For explanation, please see the power9, lag_14 version of this function
long i;
long limit = (long)data_len - 12;
const FLAC__real *base;
vector double sum0 = { 0.0f, 0.0f};
vector double sum1 = { 0.0f, 0.0f};
vector double sum2 = { 0.0f, 0.0f};
vector double sum3 = { 0.0f, 0.0f};
vector double sum4 = { 0.0f, 0.0f};
vector double sum5 = { 0.0f, 0.0f};
vector double sum10 = { 0.0f, 0.0f};
vector double sum11 = { 0.0f, 0.0f};
vector double sum12 = { 0.0f, 0.0f};
vector double sum13 = { 0.0f, 0.0f};
vector double sum14 = { 0.0f, 0.0f};
vector double sum15 = { 0.0f, 0.0f};
vector float dtemp;
vector double d0, d1, d2, d3, d4, d5;
#if WORDS_BIGENDIAN
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
#else
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
#endif
(void) lag;
FLAC__ASSERT(lag <= 12);
base = data;
if(limit > 0){
dtemp = vec_vsx_ld(0, base);
d0 = vec_doubleh(dtemp);
d1 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(16, base);
d2 = vec_doubleh(dtemp);
d3 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(32, base);
d4 = vec_doubleh(dtemp);
d5 = vec_doublel(dtemp);
base += 12;
for (i = 0; i <= (limit-2); i += 2) {
vector double d, d6;
dtemp = vec_vsx_ld(0, base);
d6 = vec_doubleh(dtemp);
base += 2;
d = vec_splat(d0, 0);
sum0 += d0 * d;
sum1 += d1 * d;
sum2 += d2 * d;
sum3 += d3 * d;
sum4 += d4 * d;
sum5 += d5 * d;
d = vec_splat(d0, 1);
d0 = vec_sel(d0, d6, vsel);
sum10 += d0 * d;
sum11 += d1 * d;
sum12 += d2 * d;
sum13 += d3 * d;
sum14 += d4 * d;
sum15 += d5 * d;
d0 = d1;
d1 = d2;
d2 = d3;
d3 = d4;
d4 = d5;
d5 = d6;
}
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
sum3 += vec_perm(sum13, sum14, (vector unsigned char)vperm);
sum4 += vec_perm(sum14, sum15, (vector unsigned char)vperm);
sum5 += vec_perm(sum15, sum10, (vector unsigned char)vperm);
}else{
i = 0;
}
vec_vsx_st(sum0, 0, autoc);
vec_vsx_st(sum1, 16, autoc);
vec_vsx_st(sum2, 32, autoc);
vec_vsx_st(sum3, 48, autoc);
vec_vsx_st(sum4, 64, autoc);
vec_vsx_st(sum5, 80, autoc);
for (; i < (long)data_len; i++) {
uint32_t coeff;
FLAC__real d = data[i];
for (coeff = 0; coeff < data_len - i; coeff++)
autoc[coeff] += d * data[i+coeff];
}
#undef MAX_LAG
#define MAX_LAG 10
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
}
__attribute__((target("cpu=power9")))
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
// This function calculates autocorrelation with POWERPC-specific
// vector functions up to a lag of 8 (or max LPC order of 7)
// For explanation, please see the power9, lag_14 version of this function
long i;
long limit = (long)data_len - 8;
const FLAC__real *base;
vector double sum0 = { 0.0f, 0.0f};
vector double sum1 = { 0.0f, 0.0f};
vector double sum2 = { 0.0f, 0.0f};
vector double sum3 = { 0.0f, 0.0f};
vector double sum10 = { 0.0f, 0.0f};
vector double sum11 = { 0.0f, 0.0f};
vector double sum12 = { 0.0f, 0.0f};
vector double sum13 = { 0.0f, 0.0f};
vector float dtemp;
vector double d0, d1, d2, d3;
#if WORDS_BIGENDIAN
vector unsigned long long vperm = { 0x08090A0B0C0D0E0F, 0x1011121314151617 };
vector unsigned long long vsel = { 0x0000000000000000, 0xFFFFFFFFFFFFFFFF };
#else
vector unsigned long long vperm = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
vector unsigned long long vsel = { 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 };
#endif
(void) lag;
FLAC__ASSERT(lag <= 8);
base = data;
if(limit > 0){
dtemp = vec_vsx_ld(0, base);
d0 = vec_doubleh(dtemp);
d1 = vec_doublel(dtemp);
dtemp = vec_vsx_ld(16, base);
d2 = vec_doubleh(dtemp);
d3 = vec_doublel(dtemp);
base += 8;
for (i = 0; i <= (limit-2); i += 2) {
vector double d, d4;
dtemp = vec_vsx_ld(0, base);
d4 = vec_doubleh(dtemp);
base += 2;
d = vec_splat(d0, 0);
sum0 += d0 * d;
sum1 += d1 * d;
sum2 += d2 * d;
sum3 += d3 * d;
d = vec_splat(d0, 1);
d0 = vec_sel(d0, d4, vsel);
sum10 += d0 * d;
sum11 += d1 * d;
sum12 += d2 * d;
sum13 += d3 * d;
d0 = d1;
d1 = d2;
d2 = d3;
d3 = d4;
}
sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm);
sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm);
sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm);
sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm);
}else{
i = 0;
}
vec_vsx_st(sum0, 0, autoc);
vec_vsx_st(sum1, 16, autoc);
vec_vsx_st(sum2, 32, autoc);
vec_vsx_st(sum3, 48, autoc);
for (; i < (long)data_len; i++) {
uint32_t coeff;
FLAC__real d = data[i];
for (coeff = 0; coeff < data_len - i; coeff++)
autoc[coeff] += d * data[i+coeff];
}
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin_vsx.c"
}
#endif /* FLAC__HAS_TARGET_POWER9 */

View File

@ -887,8 +887,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
if (encoder->private_->cpuinfo.ppc.arch_3_00) {
if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12;
else if(encoder->protected_->max_lpc_order < 10)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_10;
else if(encoder->protected_->max_lpc_order < 14)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_14;
else
@ -898,8 +898,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
if (encoder->private_->cpuinfo.ppc.arch_2_07) {
if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12;
else if(encoder->protected_->max_lpc_order < 10)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_10;
else if(encoder->protected_->max_lpc_order < 14)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_14;
else