From a75b87059213a130e5dcdaae0969d9e89a822262 Mon Sep 17 00:00:00 2001 From: Erik de Castro Lopo Date: Sat, 4 Oct 2014 09:14:18 +1000 Subject: [PATCH] Add AVX/AVX2/FMA support to CPU detection code. Patch-from: lvqcl --- src/libFLAC/cpu.c | 143 +++++++++++++++++++++++++----- src/libFLAC/include/private/cpu.h | 35 +++++++- 2 files changed, 155 insertions(+), 23 deletions(-) diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c index 02aa9877..0f6c304e 100644 --- a/src/libFLAC/cpu.c +++ b/src/libFLAC/cpu.c @@ -54,6 +54,21 @@ static void disable_sse(FLAC__CPUInfo *info) info->ia32.sse42 = false; } +static void disable_avx(FLAC__CPUInfo *info) /* variant for the info->ia32 layout: clears the avx, avx2 and fma flags together; FLAC__cpu_info calls it when the OS-support check for AVX does not pass */ +{ + info->ia32.avx = false; + info->ia32.avx2 = false; + info->ia32.fma = false; +} + +#elif defined FLAC__CPU_X86_64 + +static void disable_avx(FLAC__CPUInfo *info) /* variant for the info->x86 layout: clears the avx, avx2 and fma flags together; same role as the info->ia32 variant */ +{ + info->x86.avx = false; + info->x86.avx2 = false; + info->x86.fma = false; +} #endif #if defined (__NetBSD__) || defined(__OpenBSD__) @@ -86,6 +101,15 @@ static const unsigned FLAC__CPUINFO_IA32_CPUID_SSSE3 = 0x00000200; static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE41 = 0x00080000; static const unsigned FLAC__CPUINFO_IA32_CPUID_SSE42 = 0x00100000; +#if defined FLAC__AVX_SUPPORTED +/* these are flags in ECX of CPUID AX=00000001 */ +static const unsigned FLAC__CPUINFO_IA32_CPUID_OSXSAVE = 0x08000000; +static const unsigned FLAC__CPUINFO_IA32_CPUID_AVX = 0x10000000; +static const unsigned FLAC__CPUINFO_IA32_CPUID_FMA = 0x00001000; +/* these are flags in EBX of CPUID AX=00000007 */ +static const unsigned FLAC__CPUINFO_IA32_CPUID_AVX2 = 0x00000020; +#endif + /* * Extra stuff needed for detection of OS support for SSE on IA-32 */ @@ -123,7 +147,8 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) */ #ifdef FLAC__CPU_IA32 FLAC__bool ia32_fxsr = false; - (void) ia32_fxsr; /* to avoid warnings about unused variables */ + FLAC__bool ia32_osxsave = false; + (void) ia32_fxsr; (void) ia32_osxsave; /* to avoid warnings about unused variables */ memset(info, 0, sizeof(*info)); 
info->type = FLAC__CPUINFO_TYPE_IA32; #if !defined FLAC__NO_ASM && (defined FLAC__HAS_NASM || defined FLAC__HAS_X86INTRIN) @@ -137,10 +162,11 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) #endif { /* http://www.sandpile.org/x86/cpuid.htm */ - FLAC__uint32 flags_edx, flags_ecx; #ifdef FLAC__HAS_X86INTRIN - FLAC__cpu_info_x86(&flags_edx, &flags_ecx); + FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx; + FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); #else + FLAC__uint32 flags_ecx, flags_edx; FLAC__cpu_info_asm_ia32(&flags_edx, &flags_ecx); #endif info->ia32.cmov = (flags_edx & FLAC__CPUINFO_IA32_CPUID_CMOV )? true : false; @@ -152,6 +178,13 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) info->ia32.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false; info->ia32.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41)? true : false; info->ia32.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42)? true : false; +#if defined FLAC__HAS_X86INTRIN && defined FLAC__AVX_SUPPORTED + ia32_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE)? true : false; + info->ia32.avx = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX )? true : false; + info->ia32.fma = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA )? true : false; + FLAC__cpu_info_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); + info->ia32.avx2 = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2 )? true : false; +#endif } #ifdef DEBUG @@ -164,6 +197,11 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) fprintf(stderr, " SSSE3 ...... %c\n", info->ia32.ssse3 ? 'Y' : 'n'); fprintf(stderr, " SSE41 ...... %c\n", info->ia32.sse41 ? 'Y' : 'n'); fprintf(stderr, " SSE42 ...... %c\n", info->ia32.sse42 ? 'Y' : 'n'); +# if defined FLAC__HAS_X86INTRIN && defined FLAC__AVX_SUPPORTED + fprintf(stderr, " AVX ........ %c\n", info->ia32.avx ? 'Y' : 'n'); + fprintf(stderr, " FMA ........ %c\n", info->ia32.fma ? 'Y' : 'n'); + fprintf(stderr, " AVX2 ....... %c\n", info->ia32.avx2 ? 
'Y' : 'n'); +# endif #endif /* @@ -279,6 +317,19 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) else /* info->ia32.sse == false */ disable_sse(info); + /* + * now have to check for OS support of AVX instructions + */ + if(info->ia32.avx && ia32_osxsave) { + FLAC__uint32 ecr = FLAC__cpu_xgetbv_x86(); + if ((ecr & 0x6) != 0x6) + disable_avx(info); +#ifdef DEBUG + fprintf(stderr, " AVX OS sup . %c\n", info->ia32.avx ? 'Y' : 'n'); +#endif + } + else /* no OS AVX support*/ + disable_avx(info); #else info->use_asm = false; #endif @@ -287,27 +338,54 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) * x86-64-specific */ #elif defined FLAC__CPU_X86_64 + FLAC__bool x86_osxsave = false; + (void) x86_osxsave; /* to avoid warnings about unused variables */ memset(info, 0, sizeof(*info)); info->type = FLAC__CPUINFO_TYPE_X86_64; #if !defined FLAC__NO_ASM && defined FLAC__HAS_X86INTRIN info->use_asm = true; { /* http://www.sandpile.org/x86/cpuid.htm */ - FLAC__uint32 flags_edx, flags_ecx; - FLAC__cpu_info_x86(&flags_edx, &flags_ecx); + FLAC__uint32 flags_eax, flags_ebx, flags_ecx, flags_edx; + FLAC__cpu_info_x86(1, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); info->x86.sse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE3 )? true : false; info->x86.ssse3 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSSE3)? true : false; info->x86.sse41 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE41)? true : false; info->x86.sse42 = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_SSE42)? true : false; +#if defined FLAC__AVX_SUPPORTED + x86_osxsave = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_OSXSAVE)? true : false; + info->x86.avx = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_AVX )? true : false; + info->x86.fma = (flags_ecx & FLAC__CPUINFO_IA32_CPUID_FMA )? true : false; + FLAC__cpu_info_x86(7, &flags_eax, &flags_ebx, &flags_ecx, &flags_edx); + info->x86.avx2 = (flags_ebx & FLAC__CPUINFO_IA32_CPUID_AVX2 )? true : false; +#endif } #ifdef DEBUG fprintf(stderr, "CPU info (x86-64):\n"); - fprintf(stderr, " SSE3 ....... 
%c\n", info->x86.sse3 ? 'Y' : 'n'); - fprintf(stderr, " SSSE3 ...... %c\n", info->x86.ssse3 ? 'Y' : 'n'); - fprintf(stderr, " SSE41 ...... %c\n", info->x86.sse41 ? 'Y' : 'n'); - fprintf(stderr, " SSE42 ...... %c\n", info->x86.sse42 ? 'Y' : 'n'); + fprintf(stderr, " SSE3 ....... %c\n", info->x86.sse3 ? 'Y' : 'n'); + fprintf(stderr, " SSSE3 ...... %c\n", info->x86.ssse3 ? 'Y' : 'n'); + fprintf(stderr, " SSE41 ...... %c\n", info->x86.sse41 ? 'Y' : 'n'); + fprintf(stderr, " SSE42 ...... %c\n", info->x86.sse42 ? 'Y' : 'n'); +# if defined FLAC__AVX_SUPPORTED + fprintf(stderr, " AVX ........ %c\n", info->x86.avx ? 'Y' : 'n'); + fprintf(stderr, " FMA ........ %c\n", info->x86.fma ? 'Y' : 'n'); + fprintf(stderr, " AVX2 ....... %c\n", info->x86.avx2 ? 'Y' : 'n'); +# endif #endif + /* + * now have to check for OS support of AVX instructions + */ + if(info->x86.avx && x86_osxsave) { + FLAC__uint32 ecr = FLAC__cpu_xgetbv_x86(); + if ((ecr & 0x6) != 0x6) + disable_avx(info); +#ifdef DEBUG + fprintf(stderr, " AVX OS sup . %c\n", info->x86.avx ? 
'Y' : 'n'); +#endif + } + else /* no OS AVX support*/ + disable_avx(info); #else info->use_asm = false; #endif @@ -324,7 +402,7 @@ void FLAC__cpu_info(FLAC__CPUInfo *info) #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN #if defined _MSC_VER -#include /* for __cpuid() */ +#include /* for __cpuid() and _xgetbv() */ /* NOTE(review): the header names appear to be missing from the #include directives of this hunk in this copy (presumably intrin.h here and cpuid.h in the GCC branch) - confirm against the upstream commit */ #elif defined __GNUC__ && defined HAVE_CPUID_H #include /* for __get_cpuid() and __get_cpuid_max() */ #endif @@ -364,24 +442,45 @@ FLAC__uint32 FLAC__cpu_have_cpuid_x86(void) #endif } -void FLAC__cpu_info_x86(FLAC__uint32 *flags_edx, FLAC__uint32 *flags_ecx) +void FLAC__cpu_info_x86(FLAC__uint32 level, FLAC__uint32 *eax, FLAC__uint32 *ebx, FLAC__uint32 *ecx, FLAC__uint32 *edx) /* Executes CPUID for leaf level and stores the resulting EAX, EBX, ECX and EDX through the four pointers. All four outputs are zeroed when level is above the maximum leaf that CPUID reports, or when neither compiler-specific branch below is built. */ { #if defined _MSC_VER || defined __INTEL_COMPILER int cpuinfo[4]; - __cpuid(cpuinfo, 0); - if(cpuinfo[0] < 1) { - *flags_ecx = *flags_edx = 0; + int ext = level & 0x80000000; /* keeps only the top bit of level: leaf 0 or leaf 0x80000000, whose EAX is the limit compared against level below */ + __cpuid(cpuinfo, ext); + if((unsigned)cpuinfo[0] < level) { + *eax = *ebx = *ecx = *edx = 0; return; } - __cpuid(cpuinfo, 1); - *flags_ecx = cpuinfo[2]; - *flags_edx = cpuinfo[3]; -#elif defined __GNUC__ && defined HAVE_CPUID_H - FLAC__uint32 flags_eax, flags_ebx; - if (0 == __get_cpuid(1, &flags_eax, &flags_ebx, flags_ecx, flags_edx)) - *flags_ecx = *flags_edx = 0; +#if defined FLAC__AVX_SUPPORTED + __cpuidex(cpuinfo, level, 0); /* for AVX2 detection */ #else - *flags_ecx = *flags_edx = 0; + __cpuid(cpuinfo, level); /* some old compilers don't support __cpuidex */ +#endif + *eax = cpuinfo[0]; *ebx = cpuinfo[1]; *ecx = cpuinfo[2]; *edx = cpuinfo[3]; +#elif defined __GNUC__ && defined HAVE_CPUID_H + FLAC__uint32 ext = level & 0x80000000; /* same limit-leaf selection as in the MSVC/ICC branch */ + __cpuid(ext, *eax, *ebx, *ecx, *edx); + if (*eax < level) { + *eax = *ebx = *ecx = *edx = 0; + return; + } + __cpuid_count(level, 0, *eax, *ebx, *ecx, *edx); +#else + *eax = *ebx = *ecx = *edx = 0; +#endif +} + +FLAC__uint32 FLAC__cpu_xgetbv_x86(void) /* Returns the low 32 bits of the value read with index 0: _xgetbv(0) in the MSVC/ICC branch, the EAX output of the inline asm in the GCC branch; returns 0 when neither branch is compiled in. FLAC__cpu_info tests bits 1 and 2 of this value (mask 0x6) before keeping the AVX flags. */ +{ +#if (defined _MSC_VER || defined __INTEL_COMPILER) && defined FLAC__AVX_SUPPORTED 
+ return (FLAC__uint32)_xgetbv(0); +#elif defined __GNUC__ + FLAC__uint32 lo, hi; + asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c" (0)); /* instruction given as raw bytes rather than a mnemonic - presumably the XGETBV encoding, confirm; ECX = 0 on input, EAX goes to lo, EDX goes to hi and is not used */ + return lo; +#else + return 0; #endif } diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h index dd337705..e199f203 100644 --- a/src/libFLAC/include/private/cpu.h +++ b/src/libFLAC/include/private/cpu.h @@ -49,6 +49,13 @@ #define FLAC__SSSE3_SUPPORTED 1 #define FLAC__SSE4_1_SUPPORTED 1 #endif + #if (__INTEL_COMPILER >= 1110) /* Intel C++ Compiler 11.1 */ + #define FLAC__AVX_SUPPORTED 1 + #endif + #if (__INTEL_COMPILER >= 1300) /* Intel C++ Compiler 13.0 */ + #define FLAC__AVX2_SUPPORTED 1 + #define FLAC__FMA_SUPPORTED 1 + #endif #elif defined _MSC_VER #define FLAC__SSE_TARGET(x) #define FLAC__SSE_SUPPORTED 1 @@ -57,6 +64,13 @@ #define FLAC__SSSE3_SUPPORTED 1 #define FLAC__SSE4_1_SUPPORTED 1 #endif + #if (_MSC_FULL_VER >= 160040219) /* MS Visual Studio 2010 SP1 */ + #define FLAC__AVX_SUPPORTED 1 + #endif + #if (_MSC_VER >= 1700) /* MS Visual Studio 2012 */ + #define FLAC__AVX2_SUPPORTED 1 + #define FLAC__FMA_SUPPORTED 1 + #endif #elif defined __GNUC__ #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) /* since GCC 4.9 -msse.. 
compiler options aren't necessary */ #define FLAC__SSE_TARGET(x) __attribute__ ((__target__ (x))) @@ -64,6 +78,9 @@ #define FLAC__SSE2_SUPPORTED 1 #define FLAC__SSSE3_SUPPORTED 1 #define FLAC__SSE4_1_SUPPORTED 1 + #define FLAC__AVX_SUPPORTED 1 + #define FLAC__AVX2_SUPPORTED 1 + #define FLAC__FMA_SUPPORTED 1 #else /* for GCC older than 4.9 */ #define FLAC__SSE_TARGET(x) #ifdef __SSE__ @@ -78,6 +95,15 @@ #ifdef __SSE4_1__ #define FLAC__SSE4_1_SUPPORTED 1 #endif + #ifdef __AVX__ + #define FLAC__AVX_SUPPORTED 1 + #endif + #ifdef __AVX2__ + #define FLAC__AVX2_SUPPORTED 1 + #endif + #ifdef __FMA__ + #define FLAC__FMA_SUPPORTED 1 + #endif #endif /* GCC version */ #endif /* compiler version */ #endif /* intrinsics support */ @@ -99,6 +125,9 @@ typedef struct { FLAC__bool ssse3; FLAC__bool sse41; FLAC__bool sse42; + FLAC__bool avx; + FLAC__bool avx2; + FLAC__bool fma; } FLAC__CPUInfo_IA32; #elif defined FLAC__CPU_X86_64 typedef struct { @@ -106,6 +135,9 @@ typedef struct { FLAC__bool ssse3; FLAC__bool sse41; FLAC__bool sse42; + FLAC__bool avx; + FLAC__bool avx2; + FLAC__bool fma; } FLAC__CPUInfo_x86; #endif @@ -128,7 +160,8 @@ void FLAC__cpu_info_asm_ia32(FLAC__uint32 *flags_edx, FLAC__uint32 *flag # endif # if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN FLAC__uint32 FLAC__cpu_have_cpuid_x86(void); -void FLAC__cpu_info_x86(FLAC__uint32 *flags_edx, FLAC__uint32 *flags_ecx); +void FLAC__cpu_info_x86(FLAC__uint32 level, FLAC__uint32 *eax, FLAC__uint32 *ebx, FLAC__uint32 *ecx, FLAC__uint32 *edx); +FLAC__uint32 FLAC__cpu_xgetbv_x86(void); # endif #endif