Add FMA intrinsics for autocorrelation calculation

See https://github.com/xiph/flac/pull/387 for details
This commit is contained in:
Martijn van Beurden 2022-07-25 12:07:24 +02:00 committed by GitHub
parent 67131c04b8
commit c6a4d5c07b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 130 additions and 9 deletions

View File

@ -22,6 +22,7 @@ if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
option(WITH_AVX "Enable AVX, AVX2 optimizations (with runtime detection, resulting binary does not require AVX2)" ON) option(WITH_AVX "Enable AVX, AVX2 optimizations (with runtime detection, resulting binary does not require AVX2)" ON)
if(WITH_AVX AND MSVC) if(WITH_AVX AND MSVC)
set_source_files_properties(lpc_intrin_avx2.c stream_encoder_intrin_avx2.c PROPERTIES COMPILE_FLAGS /arch:AVX2) set_source_files_properties(lpc_intrin_avx2.c stream_encoder_intrin_avx2.c PROPERTIES COMPILE_FLAGS /arch:AVX2)
set_source_files_properties(lpc_intrin_fma.c PROPERTIES COMPILE_FLAGS "/arch:AVX2 /fp:fast")
endif() endif()
else() else()
check_cpu_arch_ppc64(FLAC__CPU_PPC64) check_cpu_arch_ppc64(FLAC__CPU_PPC64)
@ -82,6 +83,7 @@ add_library(FLAC
lpc_intrin_sse2.c lpc_intrin_sse2.c
lpc_intrin_sse41.c lpc_intrin_sse41.c
lpc_intrin_avx2.c lpc_intrin_avx2.c
lpc_intrin_fma.c
lpc_intrin_vsx.c lpc_intrin_vsx.c
md5.c md5.c
memory.c memory.c

View File

@ -75,6 +75,7 @@ EXTRA_DIST = \
CMakeLists.txt \ CMakeLists.txt \
flac.pc.in \ flac.pc.in \
libFLAC.m4 \ libFLAC.m4 \
deduplication/lpc_compute_autocorrelation_intrin.c \
deduplication/lpc_compute_autocorrelation_intrin_sse2.c \ deduplication/lpc_compute_autocorrelation_intrin_sse2.c \
deduplication/lpc_compute_autocorrelation_intrin_vsx.c \ deduplication/lpc_compute_autocorrelation_intrin_vsx.c \
deduplication/lpc_compute_autocorrelation_intrin_neon.c deduplication/lpc_compute_autocorrelation_intrin_neon.c
@ -109,6 +110,7 @@ libFLAC_sources = \
lpc_intrin_sse2.c \ lpc_intrin_sse2.c \
lpc_intrin_sse41.c \ lpc_intrin_sse41.c \
lpc_intrin_avx2.c \ lpc_intrin_avx2.c \
lpc_intrin_fma.c \
lpc_intrin_vsx.c \ lpc_intrin_vsx.c \
lpc_intrin_neon.c \ lpc_intrin_neon.c \
md5.c \ md5.c \

View File

@ -0,0 +1,13 @@
(void) lag;
FLAC__ASSERT(lag <= MAX_LAG);
for(int i = 0; i < MAX_LAG; i++)
autoc[i] = 0.0;
for(int i = 0; i < MAX_LAG; i++)
for(int j = 0; j <= i; j++)
autoc[j] += (double)data[i] * (double)data[i-j];
for(int i = MAX_LAG; i < (int)data_len; i++)
for(int j = 0; j < MAX_LAG; j++)
autoc[j] += (double)data[i] * (double)data[i-j];

View File

@ -63,6 +63,7 @@
/* SSE intrinsics support by ICC/MSVC/GCC */ /* SSE intrinsics support by ICC/MSVC/GCC */
#if defined __INTEL_COMPILER #if defined __INTEL_COMPILER
#define FLAC__SSE_TARGET(x) #define FLAC__SSE_TARGET(x)
#define FLAC__FAST_MATH_TARGET(x)
#define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE_SUPPORTED 1
#define FLAC__SSE2_SUPPORTED 1 #define FLAC__SSE2_SUPPORTED 1
#if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */ #if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */
@ -80,6 +81,7 @@
#endif #endif
#elif defined __clang__ && __has_attribute(__target__) /* clang */ #elif defined __clang__ && __has_attribute(__target__) /* clang */
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x))) #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
/* clang does not implement the GCC-specific `optimize` function attribute; it
 * would be ignored with an unknown-attribute warning on every use, so only
 * the target attribute is requested for clang. */
#define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x)))
#if __has_builtin(__builtin_ia32_maxps) #if __has_builtin(__builtin_ia32_maxps)
#define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE_SUPPORTED 1
#endif #endif
@ -105,6 +107,7 @@
#endif #endif
#elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */ #elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x))) #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
#define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x), optimize("-ffast-math")))
#define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE_SUPPORTED 1
#define FLAC__SSE2_SUPPORTED 1 #define FLAC__SSE2_SUPPORTED 1
#define FLAC__SSSE3_SUPPORTED 1 #define FLAC__SSSE3_SUPPORTED 1
@ -116,6 +119,7 @@
#endif #endif
#elif defined _MSC_VER #elif defined _MSC_VER
#define FLAC__SSE_TARGET(x) #define FLAC__SSE_TARGET(x)
#define FLAC__FAST_MATH_TARGET(x)
#define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE_SUPPORTED 1
#define FLAC__SSE2_SUPPORTED 1 #define FLAC__SSE2_SUPPORTED 1
#if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */ #if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */
@ -133,6 +137,7 @@
#endif #endif
#else #else
#define FLAC__SSE_TARGET(x) #define FLAC__SSE_TARGET(x)
#define FLAC__FAST_MATH_TARGET(x)
#ifdef __SSE__ #ifdef __SSE__
#define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE_SUPPORTED 1
#endif #endif

View File

@ -78,6 +78,13 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
# endif # endif
# endif # endif
/* x86-64 autocorrelation variants defined in lpc_intrin_fma.c and declared
 * only when FLAC__FMA_SUPPORTED is set. The _lag_N suffix is the MAX_LAG the
 * routine is built with: it asserts lag <= N and writes N entries of autoc[],
 * so autoc[] must hold at least N doubles. */
# if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN
# ifdef FLAC__FMA_SUPPORTED
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
# endif
# endif
#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX) #if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
#ifdef FLAC__HAS_TARGET_POWER9 #ifdef FLAC__HAS_TARGET_POWER9
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);

View File

@ -0,0 +1,73 @@
/* libFLAC - Free Lossless Audio Codec library
* Copyright (C) 2022 Xiph.Org Foundation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the Xiph.org Foundation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "private/cpu.h"
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__FMA_SUPPORTED
#include "FLAC/assert.h"
/* Autocorrelation of data[] for callers needing at most 8 lags.
 *
 * The loop body is shared: it is textually included below with MAX_LAG fixed
 * at 8, so it asserts lag <= 8 and writes autoc[0..7] regardless of `lag`.
 * FLAC__FAST_MATH_TARGET("fma") requests per-function "fma" code generation
 * on compilers where the macro expands to an attribute; on the others it
 * expands to nothing.
 */
FLAC__FAST_MATH_TARGET("fma")
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
/* MAX_LAG must be (re)defined before the include; the shared body reads it. */
#undef MAX_LAG
#define MAX_LAG 8
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
}
/* Autocorrelation of data[] for callers needing at most 12 lags.
 *
 * The loop body is shared: it is textually included below with MAX_LAG fixed
 * at 12, so it asserts lag <= 12 and writes autoc[0..11] regardless of `lag`.
 * FLAC__FAST_MATH_TARGET("fma") requests per-function "fma" code generation
 * on compilers where the macro expands to an attribute; on the others it
 * expands to nothing.
 */
FLAC__FAST_MATH_TARGET("fma")
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
/* MAX_LAG must be (re)defined before the include; the shared body reads it. */
#undef MAX_LAG
#define MAX_LAG 12
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
}
/* Autocorrelation of data[] for callers needing at most 16 lags.
 *
 * The loop body is shared: it is textually included below with MAX_LAG fixed
 * at 16, so it asserts lag <= 16 and writes autoc[0..15] regardless of `lag`.
 * FLAC__FAST_MATH_TARGET("fma") requests per-function "fma" code generation
 * on compilers where the macro expands to an attribute; on the others it
 * expands to nothing.
 */
FLAC__FAST_MATH_TARGET("fma")
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
{
/* MAX_LAG must be (re)defined before the include; the shared body reads it. */
#undef MAX_LAG
#define MAX_LAG 16
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
}
#endif /* FLAC__FMA_SUPPORTED */
#endif /* FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */

View File

@ -368,6 +368,7 @@ typedef struct FLAC__StreamEncoderPrivate {
FLAC__bool disable_ssse3; FLAC__bool disable_ssse3;
FLAC__bool disable_sse41; FLAC__bool disable_sse41;
FLAC__bool disable_avx2; FLAC__bool disable_avx2;
FLAC__bool disable_fma;
FLAC__bool disable_constant_subframes; FLAC__bool disable_constant_subframes;
FLAC__bool disable_fixed_subframes; FLAC__bool disable_fixed_subframes;
FLAC__bool disable_verbatim_subframes; FLAC__bool disable_verbatim_subframes;
@ -885,6 +886,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->cpuinfo.x86.sse41 = false; encoder->private_->cpuinfo.x86.sse41 = false;
if(encoder->private_->disable_avx2) if(encoder->private_->disable_avx2)
encoder->private_->cpuinfo.x86.avx2 = false; encoder->private_->cpuinfo.x86.avx2 = false;
if(encoder->private_->disable_fma)
encoder->private_->cpuinfo.x86.fma = false;
/* first default to the non-asm routines */ /* first default to the non-asm routines */
#ifndef FLAC__INTEGER_ONLY_LIBRARY #ifndef FLAC__INTEGER_ONLY_LIBRARY
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
@ -1005,14 +1008,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64); FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
# if FLAC__HAS_X86INTRIN # if FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE2_SUPPORTED # ifdef FLAC__SSE2_SUPPORTED
if(encoder->protected_->max_lpc_order < 8) if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8; if(encoder->protected_->max_lpc_order < 8)
else if(encoder->protected_->max_lpc_order < 10) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8;
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10; else if(encoder->protected_->max_lpc_order < 10)
else if(encoder->protected_->max_lpc_order < 14) encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10;
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14; else if(encoder->protected_->max_lpc_order < 14)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
}
# endif # endif
# ifdef FLAC__SSE4_1_SUPPORTED # ifdef FLAC__SSE4_1_SUPPORTED
if(encoder->private_->cpuinfo.x86.sse41) { if(encoder->private_->cpuinfo.x86.sse41) {
@ -1026,10 +1031,23 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2; encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
} }
# endif # endif
/* FMA autocorrelation selection. This runs after the SSE2 selection above, so
 * when cpuinfo reports FMA (and it was not cleared via disable_fma) these
 * assignments replace the earlier choice. Each _lag_N routine asserts
 * lag <= N; NOTE(review): the strict `<` tests presumably reflect
 * lag == max_lpc_order + 1 at the call site -- confirm. For max_lpc_order of
 * 16 or more nothing is assigned here and the earlier choice is kept. */
# ifdef FLAC__FMA_SUPPORTED
if(encoder->private_->cpuinfo.x86.fma) {
if(encoder->protected_->max_lpc_order < 8)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8;
else if(encoder->protected_->max_lpc_order < 12)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12;
else if(encoder->protected_->max_lpc_order < 16)
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16;
}
# endif
# ifdef FLAC__SSE2_SUPPORTED # ifdef FLAC__SSE2_SUPPORTED
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2; if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */
encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2; encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
}
# endif # endif
# ifdef FLAC__SSSE3_SUPPORTED # ifdef FLAC__SSSE3_SUPPORTED
if (encoder->private_->cpuinfo.x86.ssse3) { if (encoder->private_->cpuinfo.x86.ssse3) {
@ -1957,6 +1975,7 @@ FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEnc
encoder->private_->disable_ssse3 = value & 4; encoder->private_->disable_ssse3 = value & 4;
encoder->private_->disable_sse41 = value & 8; encoder->private_->disable_sse41 = value & 8;
encoder->private_->disable_avx2 = value & 16; encoder->private_->disable_avx2 = value & 16;
encoder->private_->disable_fma = value & 32;
return true; return true;
} }