Add FMA intrinsics for autocorrelation calculation
See https://github.com/xiph/flac/pull/387 for details
This commit is contained in:
parent
67131c04b8
commit
c6a4d5c07b
@ -22,6 +22,7 @@ if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
|
|||||||
option(WITH_AVX "Enable AVX, AVX2 optimizations (with runtime detection, resulting binary does not require AVX2)" ON)
|
option(WITH_AVX "Enable AVX, AVX2 optimizations (with runtime detection, resulting binary does not require AVX2)" ON)
|
||||||
if(WITH_AVX AND MSVC)
|
if(WITH_AVX AND MSVC)
|
||||||
set_source_files_properties(lpc_intrin_avx2.c stream_encoder_intrin_avx2.c PROPERTIES COMPILE_FLAGS /arch:AVX2)
|
set_source_files_properties(lpc_intrin_avx2.c stream_encoder_intrin_avx2.c PROPERTIES COMPILE_FLAGS /arch:AVX2)
|
||||||
|
set_source_files_properties(lpc_intrin_fma.c PROPERTIES COMPILE_FLAGS "/arch:AVX2 /fp:fast")
|
||||||
endif()
|
endif()
|
||||||
else()
|
else()
|
||||||
check_cpu_arch_ppc64(FLAC__CPU_PPC64)
|
check_cpu_arch_ppc64(FLAC__CPU_PPC64)
|
||||||
@ -82,6 +83,7 @@ add_library(FLAC
|
|||||||
lpc_intrin_sse2.c
|
lpc_intrin_sse2.c
|
||||||
lpc_intrin_sse41.c
|
lpc_intrin_sse41.c
|
||||||
lpc_intrin_avx2.c
|
lpc_intrin_avx2.c
|
||||||
|
lpc_intrin_fma.c
|
||||||
lpc_intrin_vsx.c
|
lpc_intrin_vsx.c
|
||||||
md5.c
|
md5.c
|
||||||
memory.c
|
memory.c
|
||||||
|
@ -75,6 +75,7 @@ EXTRA_DIST = \
|
|||||||
CMakeLists.txt \
|
CMakeLists.txt \
|
||||||
flac.pc.in \
|
flac.pc.in \
|
||||||
libFLAC.m4 \
|
libFLAC.m4 \
|
||||||
|
deduplication/lpc_compute_autocorrelation_intrin.c \
|
||||||
deduplication/lpc_compute_autocorrelation_intrin_sse2.c \
|
deduplication/lpc_compute_autocorrelation_intrin_sse2.c \
|
||||||
deduplication/lpc_compute_autocorrelation_intrin_vsx.c \
|
deduplication/lpc_compute_autocorrelation_intrin_vsx.c \
|
||||||
deduplication/lpc_compute_autocorrelation_intrin_neon.c
|
deduplication/lpc_compute_autocorrelation_intrin_neon.c
|
||||||
@ -109,6 +110,7 @@ libFLAC_sources = \
|
|||||||
lpc_intrin_sse2.c \
|
lpc_intrin_sse2.c \
|
||||||
lpc_intrin_sse41.c \
|
lpc_intrin_sse41.c \
|
||||||
lpc_intrin_avx2.c \
|
lpc_intrin_avx2.c \
|
||||||
|
lpc_intrin_fma.c \
|
||||||
lpc_intrin_vsx.c \
|
lpc_intrin_vsx.c \
|
||||||
lpc_intrin_neon.c \
|
lpc_intrin_neon.c \
|
||||||
md5.c \
|
md5.c \
|
||||||
|
@ -0,0 +1,13 @@
|
|||||||
|
(void) lag;
|
||||||
|
FLAC__ASSERT(lag <= MAX_LAG);
|
||||||
|
|
||||||
|
for(int i = 0; i < MAX_LAG; i++)
|
||||||
|
autoc[i] = 0.0;
|
||||||
|
|
||||||
|
for(int i = 0; i < MAX_LAG; i++)
|
||||||
|
for(int j = 0; j <= i; j++)
|
||||||
|
autoc[j] += (double)data[i] * (double)data[i-j];
|
||||||
|
|
||||||
|
for(int i = MAX_LAG; i < (int)data_len; i++)
|
||||||
|
for(int j = 0; j < MAX_LAG; j++)
|
||||||
|
autoc[j] += (double)data[i] * (double)data[i-j];
|
@ -63,6 +63,7 @@
|
|||||||
/* SSE intrinsics support by ICC/MSVC/GCC */
|
/* SSE intrinsics support by ICC/MSVC/GCC */
|
||||||
#if defined __INTEL_COMPILER
|
#if defined __INTEL_COMPILER
|
||||||
#define FLAC__SSE_TARGET(x)
|
#define FLAC__SSE_TARGET(x)
|
||||||
|
#define FLAC__FAST_MATH_TARGET(x)
|
||||||
#define FLAC__SSE_SUPPORTED 1
|
#define FLAC__SSE_SUPPORTED 1
|
||||||
#define FLAC__SSE2_SUPPORTED 1
|
#define FLAC__SSE2_SUPPORTED 1
|
||||||
#if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */
|
#if (__INTEL_COMPILER >= 1000) /* Intel C++ Compiler 10.0 */
|
||||||
@ -80,6 +81,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#elif defined __clang__ && __has_attribute(__target__) /* clang */
|
#elif defined __clang__ && __has_attribute(__target__) /* clang */
|
||||||
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
|
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
|
||||||
|
/* clang does not implement the GCC `optimize` function attribute (it is
 * ignored with an unknown-attribute warning), so only select the target ISA. */
#define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x)))
|
||||||
#if __has_builtin(__builtin_ia32_maxps)
|
#if __has_builtin(__builtin_ia32_maxps)
|
||||||
#define FLAC__SSE_SUPPORTED 1
|
#define FLAC__SSE_SUPPORTED 1
|
||||||
#endif
|
#endif
|
||||||
@ -105,6 +107,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */
|
#elif defined __GNUC__ && !defined __clang__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* GCC 4.9+ */
|
||||||
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
|
#define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x)))
|
||||||
|
#define FLAC__FAST_MATH_TARGET(x) __attribute__ ((__target__ (x), optimize("-ffast-math")))
|
||||||
#define FLAC__SSE_SUPPORTED 1
|
#define FLAC__SSE_SUPPORTED 1
|
||||||
#define FLAC__SSE2_SUPPORTED 1
|
#define FLAC__SSE2_SUPPORTED 1
|
||||||
#define FLAC__SSSE3_SUPPORTED 1
|
#define FLAC__SSSE3_SUPPORTED 1
|
||||||
@ -116,6 +119,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#elif defined _MSC_VER
|
#elif defined _MSC_VER
|
||||||
#define FLAC__SSE_TARGET(x)
|
#define FLAC__SSE_TARGET(x)
|
||||||
|
#define FLAC__FAST_MATH_TARGET(x)
|
||||||
#define FLAC__SSE_SUPPORTED 1
|
#define FLAC__SSE_SUPPORTED 1
|
||||||
#define FLAC__SSE2_SUPPORTED 1
|
#define FLAC__SSE2_SUPPORTED 1
|
||||||
#if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */
|
#if (_MSC_VER >= 1500) /* MS Visual Studio 2008 */
|
||||||
@ -133,6 +137,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#define FLAC__SSE_TARGET(x)
|
#define FLAC__SSE_TARGET(x)
|
||||||
|
#define FLAC__FAST_MATH_TARGET(x)
|
||||||
#ifdef __SSE__
|
#ifdef __SSE__
|
||||||
#define FLAC__SSE_SUPPORTED 1
|
#define FLAC__SSE_SUPPORTED 1
|
||||||
#endif
|
#endif
|
||||||
|
@ -78,6 +78,13 @@ void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10(const FLAC__real data[
|
|||||||
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
void FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||||
# endif
|
# endif
|
||||||
# endif
|
# endif
|
||||||
|
/* Autocorrelation routines built for FMA-capable x86-64 CPUs.
 * Declared only when x86 intrinsics are available and FLAC__FMA_SUPPORTED
 * is defined. The _lag_N suffix is the fixed number of lags (MAX_LAG) each
 * variant computes; the `lag` argument must not exceed it. */
# if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN
|
||||||
|
# ifdef FLAC__FMA_SUPPORTED
|
||||||
|
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||||
|
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||||
|
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
|
#if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
|
||||||
#ifdef FLAC__HAS_TARGET_POWER9
|
#ifdef FLAC__HAS_TARGET_POWER9
|
||||||
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[]);
|
||||||
|
73
src/libFLAC/lpc_intrin_fma.c
Normal file
73
src/libFLAC/lpc_intrin_fma.c
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
/* libFLAC - Free Lossless Audio Codec library
|
||||||
|
* Copyright (C) 2022 Xiph.Org Foundation
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* - Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* - Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* - Neither the name of the Xiph.org Foundation nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
|
||||||
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef HAVE_CONFIG_H
|
||||||
|
# include <config.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "private/cpu.h"
|
||||||
|
|
||||||
|
#ifndef FLAC__INTEGER_ONLY_LIBRARY
|
||||||
|
#ifndef FLAC__NO_ASM
|
||||||
|
#if defined FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN
|
||||||
|
#include "private/lpc.h"
|
||||||
|
#ifdef FLAC__FMA_SUPPORTED
|
||||||
|
|
||||||
|
#include "FLAC/assert.h"
|
||||||
|
|
||||||
|
/* Autocorrelation of `data` for up to 8 lags, built with the "fma" target.
 *
 * The body is the shared plain-C implementation pulled in by the #include
 * below with MAX_LAG fixed to 8; it always writes autoc[0..7], and `lag`
 * is only checked by an assertion (lag <= 8).
 * FLAC__FAST_MATH_TARGET applies per-function target/fast-math attributes
 * where the compiler supports them (it expands to nothing for ICC/MSVC).
 * NOTE(review): presumably this lets the compiler auto-vectorize the loops
 * with FMA instructions -- confirm against the generated code.
 */
FLAC__FAST_MATH_TARGET("fma")
|
||||||
|
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||||
|
{
|
||||||
|
#undef MAX_LAG
|
||||||
|
#define MAX_LAG 8
|
||||||
|
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Autocorrelation of `data` for up to 12 lags, built with the "fma" target.
 *
 * Same shared plain-C body as the other FMA variants, included with
 * MAX_LAG fixed to 12; it always writes autoc[0..11], and `lag` is only
 * checked by an assertion (lag <= 12).
 */
FLAC__FAST_MATH_TARGET("fma")
|
||||||
|
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||||
|
{
|
||||||
|
#undef MAX_LAG
|
||||||
|
#define MAX_LAG 12
|
||||||
|
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
|
||||||
|
}
|
||||||
|
/* Autocorrelation of `data` for up to 16 lags, built with the "fma" target.
 *
 * Same shared plain-C body as the other FMA variants, included with
 * MAX_LAG fixed to 16; it always writes autoc[0..15], and `lag` is only
 * checked by an assertion (lag <= 16).
 */
FLAC__FAST_MATH_TARGET("fma")
|
||||||
|
void FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, double autoc[])
|
||||||
|
{
|
||||||
|
#undef MAX_LAG
|
||||||
|
#define MAX_LAG 16
|
||||||
|
#include "deduplication/lpc_compute_autocorrelation_intrin.c"
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* FLAC__FMA_SUPPORTED */
|
||||||
|
#endif /* FLAC__CPU_X86_64 && FLAC__HAS_X86INTRIN */
|
||||||
|
#endif /* FLAC__NO_ASM */
|
||||||
|
#endif /* FLAC__INTEGER_ONLY_LIBRARY */
|
@ -368,6 +368,7 @@ typedef struct FLAC__StreamEncoderPrivate {
|
|||||||
FLAC__bool disable_ssse3;
|
FLAC__bool disable_ssse3;
|
||||||
FLAC__bool disable_sse41;
|
FLAC__bool disable_sse41;
|
||||||
FLAC__bool disable_avx2;
|
FLAC__bool disable_avx2;
|
||||||
|
FLAC__bool disable_fma;
|
||||||
FLAC__bool disable_constant_subframes;
|
FLAC__bool disable_constant_subframes;
|
||||||
FLAC__bool disable_fixed_subframes;
|
FLAC__bool disable_fixed_subframes;
|
||||||
FLAC__bool disable_verbatim_subframes;
|
FLAC__bool disable_verbatim_subframes;
|
||||||
@ -885,6 +886,8 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
|||||||
encoder->private_->cpuinfo.x86.sse41 = false;
|
encoder->private_->cpuinfo.x86.sse41 = false;
|
||||||
if(encoder->private_->disable_avx2)
|
if(encoder->private_->disable_avx2)
|
||||||
encoder->private_->cpuinfo.x86.avx2 = false;
|
encoder->private_->cpuinfo.x86.avx2 = false;
|
||||||
|
if(encoder->private_->disable_fma)
|
||||||
|
encoder->private_->cpuinfo.x86.fma = false;
|
||||||
/* first default to the non-asm routines */
|
/* first default to the non-asm routines */
|
||||||
#ifndef FLAC__INTEGER_ONLY_LIBRARY
|
#ifndef FLAC__INTEGER_ONLY_LIBRARY
|
||||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
|
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation;
|
||||||
@ -1005,14 +1008,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
|||||||
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
|
FLAC__ASSERT(encoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
|
||||||
# if FLAC__HAS_X86INTRIN
|
# if FLAC__HAS_X86INTRIN
|
||||||
# ifdef FLAC__SSE2_SUPPORTED
|
# ifdef FLAC__SSE2_SUPPORTED
|
||||||
if(encoder->protected_->max_lpc_order < 8)
|
if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */
|
||||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8;
|
if(encoder->protected_->max_lpc_order < 8)
|
||||||
else if(encoder->protected_->max_lpc_order < 10)
|
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_8;
|
||||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10;
|
else if(encoder->protected_->max_lpc_order < 10)
|
||||||
else if(encoder->protected_->max_lpc_order < 14)
|
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_10;
|
||||||
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14;
|
else if(encoder->protected_->max_lpc_order < 14)
|
||||||
|
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_sse2_lag_14;
|
||||||
|
|
||||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
|
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2;
|
||||||
|
}
|
||||||
# endif
|
# endif
|
||||||
# ifdef FLAC__SSE4_1_SUPPORTED
|
# ifdef FLAC__SSE4_1_SUPPORTED
|
||||||
if(encoder->private_->cpuinfo.x86.sse41) {
|
if(encoder->private_->cpuinfo.x86.sse41) {
|
||||||
@ -1026,10 +1031,23 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
|||||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
|
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2;
|
||||||
}
|
}
|
||||||
# endif
|
# endif
|
||||||
|
# ifdef FLAC__FMA_SUPPORTED
|
||||||
|
if(encoder->private_->cpuinfo.x86.fma) {
|
||||||
|
if(encoder->protected_->max_lpc_order < 8)
|
||||||
|
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_8;
|
||||||
|
else if(encoder->protected_->max_lpc_order < 12)
|
||||||
|
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_12;
|
||||||
|
else if(encoder->protected_->max_lpc_order < 16)
|
||||||
|
encoder->private_->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_intrin_fma_lag_16;
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
|
||||||
|
|
||||||
# ifdef FLAC__SSE2_SUPPORTED
|
# ifdef FLAC__SSE2_SUPPORTED
|
||||||
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
|
if(encoder->private_->cpuinfo.x86.sse2) { /* For fuzzing */
|
||||||
encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
|
encoder->private_->local_fixed_compute_best_predictor = FLAC__fixed_compute_best_predictor_intrin_sse2;
|
||||||
|
encoder->private_->local_fixed_compute_best_predictor_wide = FLAC__fixed_compute_best_predictor_wide_intrin_sse2;
|
||||||
|
}
|
||||||
# endif
|
# endif
|
||||||
# ifdef FLAC__SSSE3_SUPPORTED
|
# ifdef FLAC__SSSE3_SUPPORTED
|
||||||
if (encoder->private_->cpuinfo.x86.ssse3) {
|
if (encoder->private_->cpuinfo.x86.ssse3) {
|
||||||
@ -1957,6 +1975,7 @@ FLAC_API FLAC__bool FLAC__stream_encoder_disable_instruction_set(FLAC__StreamEnc
|
|||||||
encoder->private_->disable_ssse3 = value & 4;
|
encoder->private_->disable_ssse3 = value & 4;
|
||||||
encoder->private_->disable_sse41 = value & 8;
|
encoder->private_->disable_sse41 = value & 8;
|
||||||
encoder->private_->disable_avx2 = value & 16;
|
encoder->private_->disable_avx2 = value & 16;
|
||||||
|
encoder->private_->disable_fma = value & 32;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user