diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index f1ea22533..25995431c 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -58,8 +58,20 @@ #ifdef USE_INTEL_CHACHA_SPEEDUP #include #include + + #if defined(__GNUC__) && ((__GNUC__ < 4) || \ + (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) + #define NO_AVX2_SUPPORT + #endif + #if defined(__clang__) && ((__clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 5)) + #define NO_AVX2_SUPPORT + #endif + #define HAVE_INTEL_AVX1 - #define HAVE_INTEL_AVX2 + #ifndef NO_AVX2_SUPPORT + #define HAVE_INTEL_AVX2 + #endif #endif #ifdef BIG_ENDIAN_ORDER @@ -408,12 +420,10 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c, byte* output; word32 i; word32 cnt = 0; - static const word64 add[2] = { 0x0000000100000000UL,0x0000000300000002UL }; - static const word64 four[2] = { 0x0000000400000004UL,0x0000000400000004UL }; - static const word64 rotl8[2] = - { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; - static const word64 rotl16[2] = - { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; + static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL }; + static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL }; + static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; + static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; if (bytes == 0) return; @@ -638,8 +648,8 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c, : [bytes] "+r" (bytes), [cnt] "+r" (cnt), [in] "+r" (m), [out] "+r" (c) : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), - [add] "m" (add), [four] "m" (four), - [rotl8] "m" (rotl8), [rotl16] "m" (rotl16) + [add] "xrm" (add), [four] "xrm" (four), + [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", @@ -675,17 +685,14 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, byte* output; word32 i; word32 cnt = 0; - static const word64 add[4] = { 0x0000000100000000UL, 0x0000000300000002UL, - 0x0000000500000004UL, 0x0000000700000006UL }; - static const word64 eight[4] = - { 0x0000000800000008UL, 0x0000000800000008UL, - 0x0000000800000008UL, 0x0000000800000008UL }; - static const word64 rotl8[4] = - { 0x0605040702010003UL, 0x0e0d0c0f0a09080bUL, - 0x0605040702010003UL, 0x0e0d0c0f0a09080bUL }; - static const word64 rotl16[4] = - { 0x0504070601000302UL, 0x0d0c0f0e09080b0aUL, - 0x0504070601000302UL, 0x0d0c0f0e09080b0aUL }; + static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL, + 0x0000000500000004UL,0x0000000700000006UL }; + static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL, + 0x0000000800000008UL,0x0000000800000008UL }; + static const __m256i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL, + 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; + static const __m256i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL, + 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; if (bytes == 0) return; @@ -926,8 +933,8 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, : [bytes] "+r" (bytes), [cnt] "+r" (cnt), [in] "+r" (m), [out] "+r" (c) : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), - [add] "m" (add), [eight] "m" (eight), - [rotl8] "m" (rotl8), [rotl16] "m" (rotl16) + [add] "xrm" (add), [eight] "xrm" (eight), + [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",