Adding ARM64 support and optimized Neon implementation (#270)
Add NEON intrinsics routines for lpc_compute_residual_from_qlp_coefficients and lpc_compute_residual_from_qlp_coefficients_wide
This commit is contained in:
parent
5df56dbcf5
commit
95e2c52980
@ -104,6 +104,8 @@ enable_testing()
|
|||||||
check_include_file("byteswap.h" HAVE_BYTESWAP_H)
|
check_include_file("byteswap.h" HAVE_BYTESWAP_H)
|
||||||
check_include_file("inttypes.h" HAVE_INTTYPES_H)
|
check_include_file("inttypes.h" HAVE_INTTYPES_H)
|
||||||
check_include_file("stdint.h" HAVE_STDINT_H)
|
check_include_file("stdint.h" HAVE_STDINT_H)
|
||||||
|
check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
|
||||||
|
|
||||||
if(MSVC)
|
if(MSVC)
|
||||||
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
|
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
|
||||||
else()
|
else()
|
||||||
|
6
cmake/CheckA64NEON.c.in
Normal file
6
cmake/CheckA64NEON.c.in
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#include <arm_neon.h>
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
float64x2_t tmp;
|
||||||
|
tmp = vdupq_n_f64(0.0f);
|
||||||
|
}
|
14
cmake/CheckA64NEON.cmake
Normal file
14
cmake/CheckA64NEON.cmake
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
macro(CHECK_A64NEON VARIABLE)
|
||||||
|
if(NOT DEFINED HAVE_${VARIABLE})
|
||||||
|
message(STATUS "Check whether A64 NEON can be used")
|
||||||
|
configure_file(${PROJECT_SOURCE_DIR}/cmake/CheckA64NEON.c.in ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckA64NEON.c @ONLY)
|
||||||
|
try_compile(HAVE_${VARIABLE} "${PROJECT_BINARY_DIR}"
|
||||||
|
"${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckA64NEON.c")
|
||||||
|
if(HAVE_${VARIABLE})
|
||||||
|
message(STATUS "Check whether A64 NEON can be used - yes")
|
||||||
|
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_A64NEON" FORCE)
|
||||||
|
else ()
|
||||||
|
message(STATUS "Check whether A64 NEON can be used - no")
|
||||||
|
endif()
|
||||||
|
endif ()
|
||||||
|
endmacro(CHECK_A64NEON)
|
@ -7,7 +7,7 @@ macro(_CHECK_CPU_ARCH ARCH ARCH_DEFINES VARIABLE)
|
|||||||
"${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckCPUArch.c")
|
"${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckCPUArch.c")
|
||||||
if(HAVE_${VARIABLE})
|
if(HAVE_${VARIABLE})
|
||||||
message(STATUS "Check CPU architecture is ${ARCH} - yes")
|
message(STATUS "Check CPU architecture is ${ARCH} - yes")
|
||||||
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_CPU_ARCH_X64" FORCE)
|
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_CPU_ARCH" FORCE)
|
||||||
else ()
|
else ()
|
||||||
message(STATUS "Check CPU architecture is ${ARCH} - no")
|
message(STATUS "Check CPU architecture is ${ARCH} - no")
|
||||||
endif()
|
endif()
|
||||||
@ -24,4 +24,8 @@ endmacro(CHECK_CPU_ARCH_X86)
|
|||||||
|
|
||||||
macro(CHECK_CPU_ARCH_PPC64 VARIABLE)
|
macro(CHECK_CPU_ARCH_PPC64 VARIABLE)
|
||||||
_CHECK_CPU_ARCH(ppc64 "defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||defined(_ARCH_PPC64)" ${VARIABLE})
|
_CHECK_CPU_ARCH(ppc64 "defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||defined(_ARCH_PPC64)" ${VARIABLE})
|
||||||
endmacro(CHECK_CPU_ARCH_PPC64)
|
endmacro(CHECK_CPU_ARCH_PPC64)
|
||||||
|
|
||||||
|
macro(CHECK_CPU_ARCH_ARM64 VARIABLE)
|
||||||
|
_CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE})
|
||||||
|
endmacro(CHECK_CPU_ARCH_ARM64)
|
||||||
|
@ -40,6 +40,9 @@
|
|||||||
/* Set to 1 if <x86intrin.h> is available. */
|
/* Set to 1 if <x86intrin.h> is available. */
|
||||||
#cmakedefine01 FLAC__HAS_X86INTRIN
|
#cmakedefine01 FLAC__HAS_X86INTRIN
|
||||||
|
|
||||||
|
/* Set to 1 if <arm_neon.h> is available. */
|
||||||
|
#cmakedefine01 FLAC__HAS_NEONINTRIN
|
||||||
|
|
||||||
/* define if building for Darwin / MacOS X */
|
/* define if building for Darwin / MacOS X */
|
||||||
#cmakedefine FLAC__SYS_DARWIN
|
#cmakedefine FLAC__SYS_DARWIN
|
||||||
|
|
||||||
|
30
configure.ac
30
configure.ac
@ -58,7 +58,7 @@ AM_PROG_CC_C_O
|
|||||||
AC_C_INLINE
|
AC_C_INLINE
|
||||||
AC_C_TYPEOF
|
AC_C_TYPEOF
|
||||||
|
|
||||||
AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h])
|
AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h])
|
||||||
|
|
||||||
XIPH_C_BSWAP32
|
XIPH_C_BSWAP32
|
||||||
XIPH_C_BSWAP16
|
XIPH_C_BSWAP16
|
||||||
@ -145,6 +145,12 @@ case "$host_cpu" in
|
|||||||
AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC])
|
AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC])
|
||||||
asm_optimisation=$asm_opt
|
asm_optimisation=$asm_opt
|
||||||
;;
|
;;
|
||||||
|
arm64|aarch64)
|
||||||
|
cpu_arm64=true
|
||||||
|
AC_DEFINE(FLAC__CPU_ARM64)
|
||||||
|
AH_TEMPLATE(FLAC__CPU_ARM64, [define if building for ARM])
|
||||||
|
asm_optimisation=$asm_opt
|
||||||
|
;;
|
||||||
sparc)
|
sparc)
|
||||||
cpu_sparc=true
|
cpu_sparc=true
|
||||||
AC_DEFINE(FLAC__CPU_SPARC)
|
AC_DEFINE(FLAC__CPU_SPARC)
|
||||||
@ -156,6 +162,7 @@ AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue)
|
|||||||
AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
|
AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
|
||||||
AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)
|
AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)
|
||||||
AM_CONDITIONAL(FLaC__CPU_PPC64, test "x$cpu_ppc64" = xtrue)
|
AM_CONDITIONAL(FLaC__CPU_PPC64, test "x$cpu_ppc64" = xtrue)
|
||||||
|
AM_CONDITIONAL(FLAC__CPU_ARM64, test "x$cpu_arm64" = xtrue)
|
||||||
AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue)
|
AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue)
|
||||||
|
|
||||||
if test "x$ac_cv_header_x86intrin_h" = xyes; then
|
if test "x$ac_cv_header_x86intrin_h" = xyes; then
|
||||||
@ -164,6 +171,26 @@ else
|
|||||||
AC_DEFINE([FLAC__HAS_X86INTRIN], 0)
|
AC_DEFINE([FLAC__HAS_X86INTRIN], 0)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
neon=no
|
||||||
|
if test "x$ac_cv_header_arm_neon_h" = xyes; then
|
||||||
|
AC_DEFINE([FLAC__HAS_NEONINTRIN], 1, [Set to 1 if <arm_neon.h> is available.])
|
||||||
|
neon=yes
|
||||||
|
AC_MSG_CHECKING([whether arm_neon.h has A64 functions])
|
||||||
|
AC_COMPILE_IFELSE(
|
||||||
|
[AC_LANG_PROGRAM([[#include <arm_neon.h>]],
|
||||||
|
[[float64x2_t sum5; sum5 = vdupq_n_f64(0.0f);]])],
|
||||||
|
[AC_MSG_RESULT([yes])
|
||||||
|
has_a64neon=yes],
|
||||||
|
[AC_MSG_RESULT([no])])
|
||||||
|
if test "x$has_a64neon" = xyes; then
|
||||||
|
AC_DEFINE([FLAC__HAS_A64NEONINTRIN], 1, [Set to 1 if <arm_neon.h> has A64 instructions.])
|
||||||
|
else
|
||||||
|
AC_DEFINE([FLAC__HAS_A64NEONINTRIN], 0)
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
AC_DEFINE([FLAC__HAS_NEONINTRIN], 0)
|
||||||
|
fi
|
||||||
|
|
||||||
if test x"$cpu_ppc64" = xtrue ; then
|
if test x"$cpu_ppc64" = xtrue ; then
|
||||||
|
|
||||||
AC_C_ATTRIBUTE([target("cpu=power8")],
|
AC_C_ATTRIBUTE([target("cpu=power8")],
|
||||||
@ -664,6 +691,7 @@ if test x$ac_cv_c_compiler_gnu = xyes ; then
|
|||||||
fi
|
fi
|
||||||
echo " Compiler is Clang : ....................... ${xiph_cv_c_compiler_clang}"
|
echo " Compiler is Clang : ....................... ${xiph_cv_c_compiler_clang}"
|
||||||
echo " SSE optimizations : ....................... ${sse_os}"
|
echo " SSE optimizations : ....................... ${sse_os}"
|
||||||
|
echo " Neon optimizations : ...................... ${neon}"
|
||||||
echo " Asm optimizations : ....................... ${asm_optimisation}"
|
echo " Asm optimizations : ....................... ${asm_optimisation}"
|
||||||
echo " Ogg/FLAC support : ........................ ${have_ogg}"
|
echo " Ogg/FLAC support : ........................ ${have_ogg}"
|
||||||
echo " Stack protector : ........................ ${enable_stack_smash_protection}"
|
echo " Stack protector : ........................ ${enable_stack_smash_protection}"
|
||||||
|
@ -10,6 +10,7 @@ include(CheckCSourceCompiles)
|
|||||||
include(CheckCPUArch)
|
include(CheckCPUArch)
|
||||||
include(CheckAttribute)
|
include(CheckAttribute)
|
||||||
include(CheckVSX)
|
include(CheckVSX)
|
||||||
|
include(CheckA64NEON)
|
||||||
|
|
||||||
check_cpu_arch_x64(FLAC__CPU_X86_64)
|
check_cpu_arch_x64(FLAC__CPU_X86_64)
|
||||||
if(NOT FLAC__CPU_X86_64)
|
if(NOT FLAC__CPU_X86_64)
|
||||||
@ -26,9 +27,15 @@ else()
|
|||||||
check_attribute_power8(FLAC__HAS_TARGET_POWER8)
|
check_attribute_power8(FLAC__HAS_TARGET_POWER8)
|
||||||
check_attribute_power9(FLAC__HAS_TARGET_POWER9)
|
check_attribute_power9(FLAC__HAS_TARGET_POWER9)
|
||||||
check_vsx(FLAC__USE_VSX)
|
check_vsx(FLAC__USE_VSX)
|
||||||
|
else()
|
||||||
|
check_cpu_arch_arm64(FLAC__CPU_ARM64)
|
||||||
|
if(FLAC__CPU_ARM64)
|
||||||
|
check_a64neon(FLAC__HAS_A64NEONINTRIN)
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
include(CheckLanguage)
|
include(CheckLanguage)
|
||||||
check_language(ASM_NASM)
|
check_language(ASM_NASM)
|
||||||
if(CMAKE_ASM_NASM_COMPILER)
|
if(CMAKE_ASM_NASM_COMPILER)
|
||||||
@ -68,6 +75,7 @@ add_library(FLAC
|
|||||||
float.c
|
float.c
|
||||||
format.c
|
format.c
|
||||||
lpc.c
|
lpc.c
|
||||||
|
lpc_intrin_neon.c
|
||||||
lpc_intrin_sse2.c
|
lpc_intrin_sse2.c
|
||||||
lpc_intrin_sse41.c
|
lpc_intrin_sse41.c
|
||||||
lpc_intrin_avx2.c
|
lpc_intrin_avx2.c
|
||||||
|
@ -117,6 +117,7 @@ libFLAC_sources = \
|
|||||||
lpc_intrin_sse41.c \
|
lpc_intrin_sse41.c \
|
||||||
lpc_intrin_avx2.c \
|
lpc_intrin_avx2.c \
|
||||||
lpc_intrin_vsx.c \
|
lpc_intrin_vsx.c \
|
||||||
|
lpc_intrin_neon.c \
|
||||||
md5.c \
|
md5.c \
|
||||||
memory.c \
|
memory.c \
|
||||||
metadata_iterators.c \
|
metadata_iterators.c \
|
||||||
|
@ -90,6 +90,7 @@ SRCS_C = \
|
|||||||
lpc_intrin_sse2.c \
|
lpc_intrin_sse2.c \
|
||||||
lpc_intrin_sse41.c \
|
lpc_intrin_sse41.c \
|
||||||
lpc_intrin_avx2.c \
|
lpc_intrin_avx2.c \
|
||||||
|
lpc_intrin_neon.c \
|
||||||
md5.c \
|
md5.c \
|
||||||
memory.c \
|
memory.c \
|
||||||
metadata_iterators.c \
|
metadata_iterators.c \
|
||||||
|
@ -151,7 +151,13 @@ int FLAC__lpc_quantize_coefficients(const FLAC__real lp_coeff[], uint32_t order,
|
|||||||
*/
|
*/
|
||||||
void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||||
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||||
|
|
||||||
#ifndef FLAC__NO_ASM
|
#ifndef FLAC__NO_ASM
|
||||||
|
# ifdef FLAC__CPU_ARM64
|
||||||
|
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||||
|
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||||
|
# endif
|
||||||
|
|
||||||
# ifdef FLAC__CPU_IA32
|
# ifdef FLAC__CPU_IA32
|
||||||
# ifdef FLAC__HAS_NASM
|
# ifdef FLAC__HAS_NASM
|
||||||
void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
|
||||||
|
1249
src/libFLAC/lpc_intrin_neon.c
Normal file
1249
src/libFLAC/lpc_intrin_neon.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -1003,8 +1003,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
|
|||||||
# endif
|
# endif
|
||||||
# endif /* FLAC__HAS_X86INTRIN */
|
# endif /* FLAC__HAS_X86INTRIN */
|
||||||
# endif /* FLAC__CPU_... */
|
# endif /* FLAC__CPU_... */
|
||||||
|
|
||||||
|
#if defined FLAC__CPU_ARM64
|
||||||
|
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
|
||||||
|
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
|
||||||
|
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
|
||||||
|
# endif
|
||||||
|
|
||||||
}
|
}
|
||||||
# endif /* !FLAC__NO_ASM */
|
# endif /* !FLAC__NO_ASM */
|
||||||
|
|
||||||
#endif /* !FLAC__INTEGER_ONLY_LIBRARY */
|
#endif /* !FLAC__INTEGER_ONLY_LIBRARY */
|
||||||
#if !defined FLAC__NO_ASM && FLAC__HAS_X86INTRIN
|
#if !defined FLAC__NO_ASM && FLAC__HAS_X86INTRIN
|
||||||
if(encoder->private_->cpuinfo.use_asm) {
|
if(encoder->private_->cpuinfo.use_asm) {
|
||||||
|
Loading…
Reference in New Issue
Block a user