From c6a4d5c07b8e84a634a39fe2d86153796e207351 Mon Sep 17 00:00:00 2001 From: Martijn van Beurden Date: Mon, 25 Jul 2022 12:07:24 +0200 Subject: [PATCH] Add FMA intrinsics for autocorrelation calculation See https://github.com/xiph/flac/pull/387 for details --- src/libFLAC/CMakeLists.txt | 2 + src/libFLAC/Makefile.am | 2 + .../lpc_compute_autocorrelation_intrin.c | 13 ++++ src/libFLAC/include/private/cpu.h | 5 ++ src/libFLAC/include/private/lpc.h | 7 ++ src/libFLAC/lpc_intrin_fma.c | 73 +++++++++++++++++++ src/libFLAC/stream_encoder.c | 37 +++++++--- 7 files changed, 130 insertions(+), 9 deletions(-) create mode 100644 src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c create mode 100644 src/libFLAC/lpc_intrin_fma.c diff --git a/src/libFLAC/CMakeLists.txt b/src/libFLAC/CMakeLists.txt index 3e3804e0..50cb6dbf 100644 --- a/src/libFLAC/CMakeLists.txt +++ b/src/libFLAC/CMakeLists.txt @@ -22,6 +22,7 @@ if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32) option(WITH_AVX "Enable AVX, AVX2 optimizations (with runtime detection, resulting binary does not require AVX2)" ON) if(WITH_AVX AND MSVC) set_source_files_properties(lpc_intrin_avx2.c stream_encoder_intrin_avx2.c PROPERTIES COMPILE_FLAGS /arch:AVX2) + set_source_files_properties(lpc_intrin_fma.c PROPERTIES COMPILE_FLAGS "/arch:AVX2 /fp:fast") endif() else() check_cpu_arch_ppc64(FLAC__CPU_PPC64) @@ -82,6 +83,7 @@ add_library(FLAC lpc_intrin_sse2.c lpc_intrin_sse41.c lpc_intrin_avx2.c + lpc_intrin_fma.c lpc_intrin_vsx.c md5.c memory.c diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am index ddd37bc7..12c7abdf 100644 --- a/src/libFLAC/Makefile.am +++ b/src/libFLAC/Makefile.am @@ -75,6 +75,7 @@ EXTRA_DIST = \ CMakeLists.txt \ flac.pc.in \ libFLAC.m4 \ + deduplication/lpc_compute_autocorrelation_intrin.c \ deduplication/lpc_compute_autocorrelation_intrin_sse2.c \ deduplication/lpc_compute_autocorrelation_intrin_vsx.c \ deduplication/lpc_compute_autocorrelation_intrin_neon.c @@ -109,6 +110,7 @@ libFLAC_sources = \ 
lpc_intrin_sse2.c \ lpc_intrin_sse41.c \ lpc_intrin_avx2.c \ + lpc_intrin_fma.c \ lpc_intrin_vsx.c \ lpc_intrin_neon.c \ md5.c \ diff --git a/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c new file mode 100644 index 00000000..5843b000 --- /dev/null +++ b/src/libFLAC/deduplication/lpc_compute_autocorrelation_intrin.c @@ -0,0 +1,13 @@ + (void) lag; + FLAC__ASSERT(lag <= MAX_LAG); + + for(int i = 0; i < MAX_LAG; i++) + autoc[i] = 0.0; + + for(int i = 0; i < MAX_LAG; i++) + for(int j = 0; j <= i; j++) + autoc[j] += (double)data[i] * (double)data[i-j]; + + for(int i = MAX_LAG; i < (int)data_len; i++) + for(int j = 0; j < MAX_LAG; j++) + autoc[j] += (double)data[i] * (double)data[i-j]; diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h index 638c1b20..0a115135 100644 --- a/src/libFLAC/include/private/cpu.h +++ b/src/libFLAC/include/private/cpu.h @@ -63,6 +63,7 @@ /* SSE intrinsics support by ICC/MSVC/GCC */ #if defined __INTEL_COMPILER #define FLAC__SSE_TARGET(x) + #define FLAC__FAST_MATH_TARGET(x) #define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE2_SUPPORTED 1 #if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */ @@ -80,6 +81,7 @@ #endif #elif defined __clang__ && __has_attribute(__target__) /* clang */ #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x))) + #define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x), optimize("-ffast-math"))) #if __has_builtin(__builtin_ia32_maxps) #define FLAC__SSE_SUPPORTED 1 #endif @@ -105,6 +107,7 @@ #endif #elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */ #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x))) + #define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x), optimize("-ffast-math"))) #define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE2_SUPPORTED 1 #define FLAC__SSSE3_SUPPORTED 1 @@ -116,6 +119,7 @@ #endif #elif 
defined _MSC_VER #define FLAC__SSE_TARGET(x) + #define FLAC__FAST_MATH_TARGET(x) #define FLAC__SSE_SUPPORTED 1 #define FLAC__SSE2_SUPPORTED 1 #if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */ @@ -133,6 +137,7 @@ #endif #else #define FLAC__SSE_TARGET(x) + #define FLAC__FAST_MATH_TARGET(x) #ifdef __SSE__ #define FLAC__SSE_SUPPORTED 1 #endif diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 99f62de8..40971356 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -78,6 +78,13 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); # endif # endif +# if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN +# ifdef FLAC__FMA_SUPPORTED +void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); +# endif +# endif #if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX) #ifdef FLAC__HAS_TARGET_POWER9 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]); diff --git a/src/libFLAC/lpc_intrin_fma.c b/src/libFLAC/lpc_intrin_fma.c new file mode 100644 index 00000000..ad125c8a --- /dev/null +++ b/src/libFLAC/lpc_intrin_fma.c @@ -0,0 +1,73 @@ +/* libFLAC - Free Lossless Audio Codec library + * Copyright (C) 2022 Xiph.Org Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain 
the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of the Xiph.org Foundation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif + +#include "private/cpu.h" + +#ifndef FLAC__INTEGER_ONLY_LIBRARY +#ifndef FLAC__NO_ASM +#if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN +#include "private/lpc.h" +#ifdef FLAC__FMA_SUPPORTED + +#include "FLAC/assert.h" + +FLAC__FAST_MATH_TARGET("fma") +void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +{ +#undef MAX_LAG +#define MAX_LAG 8 +#include "deduplication/lpc_compute_autocorrelation_intrin.c" +} + +FLAC__FAST_MATH_TARGET("fma") +void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +{ +#undef MAX_LAG +#define MAX_LAG 12 +#include "deduplication/lpc_compute_autocorrelation_intrin.c" +} +FLAC__FAST_MATH_TARGET("fma") +void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]) +{ +#undef MAX_LAG +#define MAX_LAG 16 +#include "deduplication/lpc_compute_autocorrelation_intrin.c" + +} + +#endif /* FLAC__FMA_SUPPORTED */ +#endif /* FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN */ +#endif /* FLAC__NO_ASM */ +#endif /* FLAC__INTEGER_ONLY_LIBRARY */ diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 34222174..1b4f9668 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -368,6 +368,7 @@ typedef struct FLAC__StreamEncoderPrivate { FLAC__bool disable_ssse3; FLAC__bool disable_sse41; FLAC__bool disable_avx2; + FLAC__bool disable_fma; FLAC__bool disable_constant_subframes; FLAC__bool disable_fixed_subframes; FLAC__bool disable_verbatim_subframes; @@ -885,6 +886,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->cpuinfo.x86.sse41 = false; if(encoder->private_->disable_avx2) encoder->private_->cpuinfo.x86.avx2 = false; + if(encoder->private_->disable_fma) + encoder->private_->cpuinfo.x86.fma = false; /* first default
to the non-asm routines */ #ifndef FLAC__INTEGER_ONLY_LIBRARY encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation; @@ -1005,14 +1008,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64); # if FLAC__HAS_X86INTRIN # ifdef FLAC__SSE2_SUPPORTED - if(encoder->protected_->max_lpc_order < 8) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8; - else if(encoder->protected_->max_lpc_order < 10) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10; - else if(encoder->protected_->max_lpc_order < 14) - encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14; + if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */ + if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8; + else if(encoder->protected_->max_lpc_order < 10) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10; + else if(encoder->protected_->max_lpc_order < 14) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14; - encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2; + encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2; + } # endif # ifdef FLAC__SSE4_1_SUPPORTED if(encoder->private_->cpuinfo.x86.sse41) { @@ -1026,10 +1031,23 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_( encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2; } # endif +# 
ifdef FLAC__FMA_SUPPORTED + if(encoder->private_->cpuinfo.x86.fma) { + if(encoder->protected_->max_lpc_order < 8) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8; + else if(encoder->protected_->max_lpc_order < 12) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12; + else if(encoder->protected_->max_lpc_order < 16) + encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16; + } +# endif + # ifdef FLAC__SSE2_SUPPORTED - encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2; - encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2; + if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */ + encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2; + encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2; + } # endif # ifdef FLAC__SSSE3_SUPPORTED if (encoder->private_->cpuinfo.x86.ssse3) { @@ -1957,6 +1975,7 @@ FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEnc encoder->private_->disable_ssse3 = value & 4; encoder->private_->disable_sse41 = value & 8; encoder->private_->disable_avx2 = value & 16; + encoder->private_->disable_fma = value & 32; return true; }