Add FMA intrinsics for autocorrelation calculation
See https://github.com/xiph/flac/pull/387 for details
This commit is contained in:
parent
67131c04b8
commit
c6a4d5c07b
@ -22,6 +22,7 @@ if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
|
||||
option(WITH_AVX "Enable AVX, AVX2 optimizations (with runtime detection, resulting binary does not require AVX2)" ON)
|
||||
if(WITH_AVX AND MSVC)
|
||||
set_source_files_properties(lpc_intrin_avx2.c stream_encoder_intrin_avx2.c PROPERTIES COMPILE_FLAGS /arch:AVX2)
|
||||
set_source_files_properties(lpc_intrin_fma.c PROPERTIES COMPILE_FLAGS "/arch:AVX2 /fp:fast")
|
||||
endif()
|
||||
else()
|
||||
check_cpu_arch_ppc64(FLAC__CPU_PPC64)
|
||||
@ -82,6 +83,7 @@ add_library(FLAC
|
||||
lpc_intrin_sse2.c
|
||||
lpc_intrin_sse41.c
|
||||
lpc_intrin_avx2.c
|
||||
lpc_intrin_fma.c
|
||||
lpc_intrin_vsx.c
|
||||
md5.c
|
||||
memory.c
|
||||
|
@ -75,6 +75,7 @@ EXTRA_DIST = \
|
||||
CMakeLists.txt \
|
||||
flac.pc.in \
|
||||
libFLAC.m4 \
|
||||
deduplication/lpc_compute_autocorrelation_intrin.c \
|
||||
deduplication/lpc_compute_autocorrelation_intrin_sse2.c \
|
||||
deduplication/lpc_compute_autocorrelation_intrin_vsx.c \
|
||||
deduplication/lpc_compute_autocorrelation_intrin_neon.c
|
||||
@ -109,6 +110,7 @@ libFLAC_sources = \
|
||||
lpc_intrin_sse2.c \
|
||||
lpc_intrin_sse41.c \
|
||||
lpc_intrin_avx2.c \
|
||||
lpc_intrin_fma.c \
|
||||
lpc_intrin_vsx.c \
|
||||
lpc_intrin_neon.c \
|
||||
md5.c \
|
||||
|
@ -0,0 +1,13 @@
|
||||
(void) lag;
|
||||
FLAC__ASSERT(lag <= MAX_LAG);
|
||||
|
||||
for(int i = 0; i < MAX_LAG; i++)
|
||||
autoc[i] = 0.0;
|
||||
|
||||
for(int i = 0; i < MAX_LAG; i++)
|
||||
for(int j = 0; j <= i; j++)
|
||||
autoc[j] += (double)data[i] * (double)data[i-j];
|
||||
|
||||
for(int i = MAX_LAG; i < (int)data_len; i++)
|
||||
for(int j = 0; j < MAX_LAG; j++)
|
||||
autoc[j] += (double)data[i] * (double)data[i-j];
|
@ -63,6 +63,7 @@
|
||||
/* SSE intrinsics support by ICC/MSVC/GCC */
|
||||
#if defined __INTEL_COMPILER
|
||||
#define FLAC__SSE_TARGET(x)
|
||||
#define FLAC__FAST_MATH_TARGET(x)
|
||||
#define FLAC__SSE_SUPPORTED 1
|
||||
#define FLAC__SSE2_SUPPORTED 1
|
||||
#if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */
|
||||
@ -80,6 +81,7 @@
|
||||
#endif
|
||||
#elif defined __clang__ && __has_attribute(__target__) /* clang */
|
||||
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
|
||||
/* clang does not implement GCC's optimize() function attribute (it is
 * ignored with an unknown-attribute warning), so only the target is set. */
#define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x)))
|
||||
#if __has_builtin(__builtin_ia32_maxps)
|
||||
#define FLAC__SSE_SUPPORTED 1
|
||||
#endif
|
||||
@ -105,6 +107,7 @@
|
||||
#endif
|
||||
#elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */
|
||||
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
|
||||
#define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x), optimize("-ffast-math")))
|
||||
#define FLAC__SSE_SUPPORTED 1
|
||||
#define FLAC__SSE2_SUPPORTED 1
|
||||
#define FLAC__SSSE3_SUPPORTED 1
|
||||
@ -116,6 +119,7 @@
|
||||
#endif
|
||||
#elif defined _MSC_VER
|
||||
#define FLAC__SSE_TARGET(x)
|
||||
#define FLAC__FAST_MATH_TARGET(x)
|
||||
#define FLAC__SSE_SUPPORTED 1
|
||||
#define FLAC__SSE2_SUPPORTED 1
|
||||
#if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */
|
||||
@ -133,6 +137,7 @@
|
||||
#endif
|
||||
#else
|
||||
#define FLAC__SSE_TARGET(x)
|
||||
#define FLAC__FAST_MATH_TARGET(x)
|
||||
#ifdef __SSE__
|
||||
#define FLAC__SSE_SUPPORTED 1
|
||||
#endif
|
||||
|
@ -78,6 +78,13 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
# endif
|
||||
# endif
|
||||
# if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN
|
||||
# ifdef FLAC__FMA_SUPPORTED
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
# endif
|
||||
# endif
|
||||
#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
|
||||
#ifdef FLAC__HAS_TARGET_POWER9
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||
|
73
src/libFLAC/lpc_intrin_fma.c
Normal file
73
src/libFLAC/lpc_intrin_fma.c
Normal file
@ -0,0 +1,73 @@
|
||||
/* libFLAC - Free Lossless Audio Codec library
|
||||
* Copyright (C) 2022 Xiph.Org Foundation
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* - Neither the name of the Xiph.org Foundation nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
|
||||
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
# include <config.h>
|
||||
#endif
|
||||
|
||||
#include "private/cpu.h"
|
||||
|
||||
#ifndef FLAC__INTEGER_ONLY_LIBRARY
|
||||
#ifndef FLAC__NO_ASM
|
||||
#if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN
|
||||
#include "private/lpc.h"
|
||||
#ifdef FLAC__FMA_SUPPORTED
|
||||
|
||||
#include "FLAC/assert.h"
|
||||
|
||||
/*
 * Autocorrelation routine specialised for a maximum lag of 8.
 *
 * The function has no hand-written body: MAX_LAG is (re)defined to 8 and
 * the shared fragment deduplication/lpc_compute_autocorrelation_intrin.c
 * is textually included as the body, so data, data_len, lag and autoc are
 * consumed by that fragment.
 *
 * FLAC__FAST_MATH_TARGET("fma") is a per-function attribute macro; it is
 * defined empty for some compilers.
 */
FLAC__FAST_MATH_TARGET("fma")
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
/* Drop any MAX_LAG left over from a previous inclusion in this file. */
#undef MAX_LAG
|
||||
#define MAX_LAG 8
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
|
||||
}
|
||||
|
||||
/*
 * Autocorrelation routine specialised for a maximum lag of 12.
 *
 * The function has no hand-written body: MAX_LAG is (re)defined to 12 and
 * the shared fragment deduplication/lpc_compute_autocorrelation_intrin.c
 * is textually included as the body, so data, data_len, lag and autoc are
 * consumed by that fragment.
 *
 * FLAC__FAST_MATH_TARGET("fma") is a per-function attribute macro; it is
 * defined empty for some compilers.
 */
FLAC__FAST_MATH_TARGET("fma")
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
/* Drop the MAX_LAG value used by the previous specialisation. */
#undef MAX_LAG
|
||||
#define MAX_LAG 12
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
|
||||
}
|
||||
/*
 * Autocorrelation routine specialised for a maximum lag of 16.
 *
 * The function has no hand-written body: MAX_LAG is (re)defined to 16 and
 * the shared fragment deduplication/lpc_compute_autocorrelation_intrin.c
 * is textually included as the body, so data, data_len, lag and autoc are
 * consumed by that fragment.
 *
 * FLAC__FAST_MATH_TARGET("fma") is a per-function attribute macro; it is
 * defined empty for some compilers.
 */
FLAC__FAST_MATH_TARGET("fma")
|
||||
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||
{
|
||||
/* Drop the MAX_LAG value used by the previous specialisation. */
#undef MAX_LAG
|
||||
#define MAX_LAG 16
|
||||
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
|
||||
|
||||
}
|
||||
|
||||
#endif /* FLAC__FMA_SUPPORTED */
|
||||
#endif /* FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN */
|
||||
#endif /* FLAC__NO_ASM */
|
||||
#endif /* FLAC__INTEGER_ONLY_LIBRARY */
|
@ -368,6 +368,7 @@ typedef struct FLAC__StreamEncoderPrivate {
|
||||
FLAC__bool disable_ssse3;
|
||||
FLAC__bool disable_sse41;
|
||||
FLAC__bool disable_avx2;
|
||||
FLAC__bool disable_fma;
|
||||
FLAC__bool disable_constant_subframes;
|
||||
FLAC__bool disable_fixed_subframes;
|
||||
FLAC__bool disable_verbatim_subframes;
|
||||
@ -885,6 +886,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
||||
encoder->private_->cpuinfo.x86.sse41 = false;
|
||||
if(encoder->private_->disable_avx2)
|
||||
encoder->private_->cpuinfo.x86.avx2 = false;
|
||||
if(encoder->private_->disable_fma)
|
||||
encoder->private_->cpuinfo.x86.fma = false;
|
||||
/* first default to the non-asm routines */
|
||||
#ifndef FLAC__INTEGER_ONLY_LIBRARY
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
|
||||
@ -1005,14 +1008,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
||||
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
|
||||
# if FLAC__HAS_X86INTRIN
|
||||
# ifdef FLAC__SSE2_SUPPORTED
|
||||
if(encoder->protected_->max_lpc_order < 8)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8;
|
||||
else if(encoder->protected_->max_lpc_order < 10)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10;
|
||||
else if(encoder->protected_->max_lpc_order < 14)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14;
|
||||
if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */
|
||||
if(encoder->protected_->max_lpc_order < 8)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8;
|
||||
else if(encoder->protected_->max_lpc_order < 10)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10;
|
||||
else if(encoder->protected_->max_lpc_order < 14)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14;
|
||||
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
|
||||
}
|
||||
# endif
|
||||
# ifdef FLAC__SSE4_1_SUPPORTED
|
||||
if(encoder->private_->cpuinfo.x86.sse41) {
|
||||
@ -1026,10 +1031,23 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
|
||||
}
|
||||
# endif
|
||||
# ifdef FLAC__FMA_SUPPORTED
|
||||
if(encoder->private_->cpuinfo.x86.fma) {
|
||||
if(encoder->protected_->max_lpc_order < 8)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8;
|
||||
else if(encoder->protected_->max_lpc_order < 12)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12;
|
||||
else if(encoder->protected_->max_lpc_order < 16)
|
||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16;
|
||||
}
|
||||
# endif
|
||||
|
||||
|
||||
# ifdef FLAC__SSE2_SUPPORTED
|
||||
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
|
||||
encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
|
||||
if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */
|
||||
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
|
||||
encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
|
||||
}
|
||||
# endif
|
||||
# ifdef FLAC__SSSE3_SUPPORTED
|
||||
if (encoder->private_->cpuinfo.x86.ssse3) {
|
||||
@ -1957,6 +1975,7 @@ FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEnc
|
||||
encoder->private_->disable_ssse3 = value & 4;
|
||||
encoder->private_->disable_sse41 = value & 8;
|
||||
encoder->private_->disable_avx2 = value & 16;
|
||||
encoder->private_->disable_fma = value & 32;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user