Adding ARM64 support and optimized Neon implementation (#270)

Add NEON intrinsics routines for lpc_compute_residual_from_qlp_coefficients
and lpc_compute_residual_from_qlp_coefficients_wide
This commit is contained in:
RonenGvili 2022-04-29 15:46:07 +03:00 committed by GitHub
parent 5df56dbcf5
commit 95e2c52980
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 1333 additions and 3 deletions

View File

@ -104,6 +104,8 @@ enable_testing()
check_include_file("byteswap.h" HAVE_BYTESWAP_H)
check_include_file("inttypes.h" HAVE_INTTYPES_H)
check_include_file("stdint.h" HAVE_STDINT_H)
check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
if(MSVC)
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
else()

6
cmake/CheckA64NEON.c.in Normal file
View File

@ -0,0 +1,6 @@
#include <arm_neon.h>
int main (void)
{
float64x2_t tmp;
tmp = vdupq_n_f64(0.0f);
}

14
cmake/CheckA64NEON.cmake Normal file
View File

@ -0,0 +1,14 @@
macro(CHECK_A64NEON VARIABLE)
if(NOT DEFINED HAVE_${VARIABLE})
message(STATUS "Check whether A64 NEON can be used")
configure_file(${PROJECT_SOURCE_DIR}/cmake/CheckA64NEON.c.in ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckA64NEON.c @ONLY)
try_compile(HAVE_${VARIABLE} "${PROJECT_BINARY_DIR}"
"${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckA64NEON.c")
if(HAVE_${VARIABLE})
message(STATUS "Check whether A64 NEON can be used - yes")
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_A64NEON" FORCE)
else ()
message(STATUS "Check whether A64 NEON can be used - no")
endif()
endif ()
endmacro(CHECK_A64NEON)

View File

@ -7,7 +7,7 @@ macro(_CHECK_CPU_ARCH ARCH ARCH_DEFINES VARIABLE)
"${PROJECT_BINARY_DIR}/CMakeFiles/CMakeTmp/CheckCPUArch.c")
if(HAVE_${VARIABLE})
message(STATUS "Check CPU architecture is ${ARCH} - yes")
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_CPU_ARCH_X64" FORCE)
set(${VARIABLE} 1 CACHE INTERNAL "Result of CHECK_CPU_ARCH" FORCE)
else ()
message(STATUS "Check CPU architecture is ${ARCH} - no")
endif()
@ -25,3 +25,7 @@ endmacro(CHECK_CPU_ARCH_X86)
macro(CHECK_CPU_ARCH_PPC64 VARIABLE)
_CHECK_CPU_ARCH(ppc64 "defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||defined(_ARCH_PPC64)" ${VARIABLE})
endmacro(CHECK_CPU_ARCH_PPC64)
macro(CHECK_CPU_ARCH_ARM64 VARIABLE)
_CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE})
endmacro(CHECK_CPU_ARCH_ARM64)

View File

@ -40,6 +40,9 @@
/* Set to 1 if <x86intrin.h> is available. */
#cmakedefine01 FLAC__HAS_X86INTRIN
/* Set to 1 if <arm_neon.h> is available. */
#cmakedefine01 FLAC__HAS_NEONINTRIN
/* define if building for Darwin / MacOS X */
#cmakedefine FLAC__SYS_DARWIN

View File

@ -58,7 +58,7 @@ AM_PROG_CC_C_O
AC_C_INLINE
AC_C_TYPEOF
AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h])
AC_CHECK_HEADERS([stdint.h inttypes.h byteswap.h sys/param.h sys/ioctl.h termios.h x86intrin.h cpuid.h arm_neon.h])
XIPH_C_BSWAP32
XIPH_C_BSWAP16
@ -145,6 +145,12 @@ case "$host_cpu" in
AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC])
asm_optimisation=$asm_opt
;;
arm64|aarch64)
cpu_arm64=true
AC_DEFINE(FLAC__CPU_ARM64)
AH_TEMPLATE(FLAC__CPU_ARM64, [define if building for ARM])
asm_optimisation=$asm_opt
;;
sparc)
cpu_sparc=true
AC_DEFINE(FLAC__CPU_SPARC)
@ -156,6 +162,7 @@ AM_CONDITIONAL(FLAC__CPU_X86_64, test "x$cpu_x86_64" = xtrue)
AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)
AM_CONDITIONAL(FLaC__CPU_PPC64, test "x$cpu_ppc64" = xtrue)
AM_CONDITIONAL(FLAC__CPU_ARM64, test "x$cpu_arm64" = xtrue)
AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue)
if test "x$ac_cv_header_x86intrin_h" = xyes; then
@ -164,6 +171,26 @@ else
AC_DEFINE([FLAC__HAS_X86INTRIN], 0)
fi
neon=no
if test "x$ac_cv_header_arm_neon_h" = xyes; then
AC_DEFINE([FLAC__HAS_NEONINTRIN], 1, [Set to 1 if <arm_neon.h> is available.])
neon=yes
AC_MSG_CHECKING([whether arm_neon.h has A64 functions])
AC_COMPILE_IFELSE(
[AC_LANG_PROGRAM([[#include <arm_neon.h>]],
[[float64x2_t sum5; sum5 = vdupq_n_f64(0.0f);]])],
[AC_MSG_RESULT([yes])
has_a64neon=yes],
[AC_MSG_RESULT([no])])
if test "x$has_a64neon" = xyes; then
AC_DEFINE([FLAC__HAS_A64NEONINTRIN], 1, [Set to 1 if <arm_neon.h> has A64 instructions.])
else
AC_DEFINE([FLAC__HAS_A64NEONINTRIN], 0)
fi
else
AC_DEFINE([FLAC__HAS_NEONINTRIN], 0)
fi
if test x"$cpu_ppc64" = xtrue ; then
AC_C_ATTRIBUTE([target("cpu=power8")],
@ -664,6 +691,7 @@ if test x$ac_cv_c_compiler_gnu = xyes ; then
fi
echo " Compiler is Clang : ....................... ${xiph_cv_c_compiler_clang}"
echo " SSE optimizations : ....................... ${sse_os}"
echo " Neon optimizations : ...................... ${neon}"
echo " Asm optimizations : ....................... ${asm_optimisation}"
echo " Ogg/FLAC support : ........................ ${have_ogg}"
echo " Stack protector : ........................ ${enable_stack_smash_protection}"

View File

@ -10,6 +10,7 @@ include(CheckCSourceCompiles)
include(CheckCPUArch)
include(CheckAttribute)
include(CheckVSX)
include(CheckA64NEON)
check_cpu_arch_x64(FLAC__CPU_X86_64)
if(NOT FLAC__CPU_X86_64)
@ -26,9 +27,15 @@ else()
check_attribute_power8(FLAC__HAS_TARGET_POWER8)
check_attribute_power9(FLAC__HAS_TARGET_POWER9)
check_vsx(FLAC__USE_VSX)
else()
check_cpu_arch_arm64(FLAC__CPU_ARM64)
if(FLAC__CPU_ARM64)
check_a64neon(FLAC__HAS_A64NEONINTRIN)
endif()
endif()
endif()
include(CheckLanguage)
check_language(ASM_NASM)
if(CMAKE_ASM_NASM_COMPILER)
@ -68,6 +75,7 @@ add_library(FLAC
float.c
format.c
lpc.c
lpc_intrin_neon.c
lpc_intrin_sse2.c
lpc_intrin_sse41.c
lpc_intrin_avx2.c

View File

@ -117,6 +117,7 @@ libFLAC_sources = \
lpc_intrin_sse41.c \
lpc_intrin_avx2.c \
lpc_intrin_vsx.c \
lpc_intrin_neon.c \
md5.c \
memory.c \
metadata_iterators.c \

View File

@ -90,6 +90,7 @@ SRCS_C = \
lpc_intrin_sse2.c \
lpc_intrin_sse41.c \
lpc_intrin_avx2.c \
lpc_intrin_neon.c \
md5.c \
memory.c \
metadata_iterators.c \

View File

@ -151,7 +151,13 @@ int FLAC__lpc_quantize_coefficients(const FLAC__real lp_coeff[], uint32_t order,
*/
void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
#ifndef FLAC__NO_ASM
# ifdef FLAC__CPU_ARM64
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);
# endif
# ifdef FLAC__CPU_IA32
# ifdef FLAC__HAS_NASM
void FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32(const FLAC__int32 *data, uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 residual[]);

File diff suppressed because it is too large Load Diff

View File

@ -1003,8 +1003,16 @@ static FLAC__StreamEncoderInitStatus init_stream_internal_(
# endif
# endif /* FLAC__HAS_X86INTRIN */
# endif /* FLAC__CPU_... */
#if defined FLAC__CPU_ARM64
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_16bit = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients = FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon;
encoder->private_->local_lpc_compute_residual_from_qlp_coefficients_64bit = FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon;
# endif
}
# endif /* !FLAC__NO_ASM */
#endif /* !FLAC__INTEGER_ONLY_LIBRARY */
#if !defined FLAC__NO_ASM && FLAC__HAS_X86INTRIN
if(encoder->private_->cpuinfo.use_asm) {