Adding ARM64 support and optimized Neon implementation (#270)
Add NEON intrinsics routines for lpc_compute_residual_from_qlp_coefficients and lpc_compute_residual_from_qlp_coefficients_wide
This commit is contained in:
parent
5df56dbcf5
commit
95e2c52980
@ -104,6 +104,8 @@ enable_testing()
|
||||
check_include_file("byteswap.h" HAVE_BYTESWAP_H)
|
||||
check_include_file("inttypes.h" HAVE_INTTYPES_H)
|
||||
check_include_file("stdint.h" HAVE_STDINT_H)
|
||||
check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
|
||||
|
||||
if(MSVC)
|
||||
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
|
||||
else()
|
||||
|
6
cmake/CheckA64NEON.c.in
Normal file
6
cmake/CheckA64NEON.c.in
Normal file
@ -0,0 +1,6 @@
|
||||
#include <arm_neon.h>
|
||||
int main (void)
|
||||
{
|
||||
float64x2_t tmp;
|
||||
tmp = vdupq_n_f64(0.0f);
|
||||
}
|
14
cmake/CheckA64NEON.cmake
Normal file
14
cmake/CheckA64NEON.cmake
Normal file
@ -0,0 +1,14 @@
|
||||
macro(CHECK_A64NEON VARIABLE)
|
||||
if(NOT DEFINED HAVE_${VARIABLE})
|
||||
message(STATUS "Check whether A64 NEON can be used")
|
||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/CheckA64NEON.c.in ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckA64NEON.c @ONLY)
|
||||
try_compile(HAVE_${VARIABLE} "${PROJECT_BINARY_DIR}"
|
||||
"${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckA64NEON.c")
|
||||
if(HAVE_${VARIABLE})
|
||||
message(STATUS "Check whether A64 NEON can be used - yes")
|
||||
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_A64NEON" FORCE)
|
||||
else ()
|
||||
message(STATUS "Check whether A64 NEON can be used - no")
|
||||
endif()
|
||||
endif ()
|
||||
endmacro(CHECK_A64NEON)
|
@ -7,7 +7,7 @@ macro(_CHECK_CPU_ARCH ARCH ARCH_DEFINES VARIABLE)
|
||||
"${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckCPUArch.c")
|
||||
if(HAVE_${VARIABLE})
|
||||
message(STATUS "Check CPU architecture is ${ARCH} - yes")
|
||||
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_CPU_ARCH_X64" FORCE)
|
||||
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_CPU_ARCH" FORCE)
|
||||
else ()
|
||||
message(STATUS "Check CPU architecture is ${ARCH} - no")
|
||||
endif()
|
||||
@ -24,4 +24,8 @@ endmacro(CHECK_CPU_ARCH_X86)
|
||||
|
||||
macro(CHECK_CPU_ARCH_PPC64 VARIABLE)
|
||||
_CHECK_CPU_ARCH(ppc64 "defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||defined(_ARCH_PPC64)" ${VARIABLE})
|
||||
endmacro(CHECK_CPU_ARCH_PPC64)
|
||||
endmacro(CHECK_CPU_ARCH_PPC64)
|
||||
|
||||
macro(CHECK_CPU_ARCH_ARM64 VARIABLE)
|
||||
_CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE})
|
||||
endmacro(CHECK_CPU_ARCH_ARM64)
|
||||
|
@ -40,6 +40,9 @@
|
||||
/* Set to 1 if <x86intrin.h> is available. */
|
||||
#cmakedefine01 FLAC__HAS_X86INTRIN
|
||||
|
||||
/* Set to 1 if <arm_neon.h> is available. */
|
||||
#cmakedefine01 FLAC__HAS_NEONINTRIN
|
||||
|
||||
/* define if building for Darwin / MacOS X */
|
||||
#cmakedefine FLAC__SYS_DARWIN
|
||||
|
||||
|
30
configure.ac
30
configure.ac
@ -58,7 +58,7 @@ AM_PROG_CC_C_O
|
||||
AC_C_INLINE
|
||||
AC_C_TYPEOF
|
||||
|
||||
AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h])
|
||||
AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h])
|
||||
|
||||
XIPH_C_BSWAP32
|
||||
XIPH_C_BSWAP16
|
||||
@ -145,6 +145,12 @@ case "$host_cpu" in
|
||||
AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC])
|
||||
asm_optimisation=$asm_opt
|
||||
;;
|
||||
arm64|aarch64)
|
||||
cpu_arm64=true
|
||||
AC_DEFINE(FLAC__CPU_ARM64)
|
||||
AH_TEMPLATE(FLAC__CPU_ARM64, [define if building for ARM])
|
||||
asm_optimisation=$asm_opt
|
||||
;;
|
||||
sparc)
|
||||
cpu_sparc=true
|
||||
AC_DEFINE(FLAC__CPU_SPARC)
|
||||
@ -156,6 +162,7 @@ AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue)
|
||||
AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
|
||||
AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)
|
||||
AM_CONDITIONAL(FLaC__CPU_PPC64, test "x$cpu_ppc64" = xtrue)
|
||||
AM_CONDITIONAL(FLAC__CPU_ARM64, test "x$cpu_arm64" = xtrue)
|
||||
AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue)
|
||||
|
||||
if test "x$ac_cv_header_x86intrin_h" = xyes; then
|
||||
@ -164,6 +171,26 @@ else
|
||||
AC_DEFINE([FLAC__HAS_X86INTRIN], 0)
|
||||
fi
|
||||
|
||||
neon=no
|
||||
if test "x$ac_cv_header_arm_neon_h" = xyes; then
|
||||
AC_DEFINE([FLAC__HAS_NEONINTRIN], 1, [Set to 1 if <arm_neon.h> is available.])
|
||||
neon=yes
|
||||
AC_MSG_CHECKING([whether arm_neon.h has A64 functions])
|
||||
AC_COMPILE_IFELSE(
|
||||
[AC_LANG_PROGRAM([[#include <arm_neon.h>]],
|
||||
[[float64x2_t sum5; sum5 = vdupq_n_f64(0.0f);]])],
|
||||
[AC_MSG_RESULT([yes])
|
||||
has_a64neon=yes],
|
||||
[AC_MSG_RESULT([no])])
|
||||
if test "x$has_a64neon" = xyes; then
|
||||
AC_DEFINE([FLAC__HAS_A64NEONINTRIN], 1, [Set to 1 if <arm_neon.h> has A64 instructions.])
|
||||
else
|
||||
AC_DEFINE([FLAC__HAS_A64NEONINTRIN], 0)
|
||||
fi
|
||||
else
|
||||
AC_DEFINE([FLAC__HAS_NEONINTRIN], 0)
|
||||
fi
|
||||
|
||||
if test x"$cpu_ppc64" = xtrue ; then
|
||||
|
||||
AC_C_ATTRIBUTE([target("cpu=power8")],
|
||||
@ -664,6 +691,7 @@ if test x$ac_cv_c_compiler_gnu = xyes ; then
|
||||
fi
|
||||
echo " Compiler is Clang : ....................... ${xiph_cv_c_compiler_clang}"
|
||||
echo " SSE optimizations : ....................... ${sse_os}"
|
||||
echo " Neon optimizations : ...................... ${neon}"
|
||||
echo " Asm optimizations : ....................... ${asm_optimisation}"
|
||||
echo " Ogg/FLAC support : ........................ ${have_ogg}"
|
||||
echo " Stack protector : ........................ ${enable_stack_smash_protection}"
|
||||
|
@ -10,6 +10,7 @@ include(CheckCSourceCompiles)
|
||||
include(CheckCPUArch)
|
||||
include(CheckAttribute)
|
||||
include(CheckVSX)
|
||||
include(CheckA64NEON)
|
||||
|
||||
check_cpu_arch_x64(FLAC__CPU_X86_64)
|
||||
if(NOT FLAC__CPU_X86_64)
|
||||
@ -26,9 +27,15 @@ else()
|
||||
check_attribute_power8(FLAC__HAS_TARGET_POWER8)
|
||||
check_attribute_power9(FLAC__HAS_TARGET_POWER9)
|
||||
check_vsx(FLAC__USE_VSX)
|
||||
else()
|
||||
check_cpu_arch_arm64(FLAC__CPU_ARM64)
|
||||
if(FLAC__CPU_ARM64)
|
||||
check_a64neon(FLAC__HAS_A64NEONINTRIN)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
include(CheckLanguage)
|
||||
check_language(ASM_NASM)
|
||||
if(CMAKE_ASM_NASM_COMPILER)
|
||||
@ -68,6 +75,7 @@ add_library(FLAC
|
||||
float.c
|
||||
format.c
|
||||
lpc.c
|
||||
lpc_intrin_neon.c
|
||||
lpc_intrin_sse2.c
|
||||
lpc_intrin_sse41.c
|
||||
lpc_intrin_avx2.c
|
||||
|
@ -117,6 +117,7 @@ libFLAC_sources = \
|
||||
lpc_intrin_sse41.c \
|
||||
lpc_intrin_avx2.c \
|
||||
lpc_intrin_vsx.c \
|
||||
lpc_intrin_neon.c \
|
||||
md5.c \
|
||||
memory.c \
|
||||
metadata_iterators.c \
|
||||
|
@ -90,6 +90,7 @@ SRCS_C = \
|
||||
lpc_intrin_sse2.c \
|
||||
lpc_intrin_sse41.c \
|
||||
lpc_intrin_avx2.c \
|
||||
lpc_intrin_neon.c \
|
||||
md5.c \
|
||||
memory.c \
|
||||
metadata_iterators.c \
|
||||
|
@ -151,7 +151,13 @@ int FLAC__lpc_quantize_coefficients(const FLAC__real lp_coeff[], uint32_t order,
|
||||
*/
|
||||
void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||
|
||||
#ifndef FLAC__NO_ASM
|
||||
# ifdef FLAC__CPU_ARM64
|
||||
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||
# endif
|
||||
|
||||
# ifdef FLAC__CPU_IA32
|
||||
# ifdef FLAC__HAS_NASM
|
||||
void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||
|
1249
src/libFLAC/lpc_intrin_neon.c
Normal file
1249
src/libFLAC/lpc_intrin_neon.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -1003,8 +1003,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
||||
# endif
|
||||
# endif /* FLAC__HAS_X86INTRIN */
|
||||
# endif /* FLAC__CPU_... */
|
||||
|
||||
#if defined FLAC__CPU_ARM64
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
|
||||
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
|
||||
# endif
|
||||
|
||||
}
|
||||
# endif /* !FLAC__NO_ASM */
|
||||
|
||||
#endif /* !FLAC__INTEGER_ONLY_LIBRARY */
|
||||
#if !defined FLAC__NO_ASM && FLAC__HAS_X86INTRIN
|
||||
if(encoder->private_->cpuinfo.use_asm) {
|
||||
|
Loading…
Reference in New Issue
Block a user