diff --git a/IDE/ECLIPSE/MICRIUM/README.md b/IDE/ECLIPSE/MICRIUM/README.md index 8675f0fed..8932c813b 100644 --- a/IDE/ECLIPSE/MICRIUM/README.md +++ b/IDE/ECLIPSE/MICRIUM/README.md @@ -40,7 +40,7 @@ The folder hierarchy is the same as the wolfSSL folders with an exception of the 4. Right click on each folders, add or link all the source code in the corresponding folder in wolfSSL. -5. Remove non-C platform dependent files from your build. At the moment, only aes_asm.asm and aes_asm.s must be removed from your wolfssl/wolfcrypt/src folder. +5. Remove non-C platform dependent files from your build. At the moment, only aes_asm.asm, aes_gcm_asm.asm and aes_asm.s must be removed from your wolfssl/wolfcrypt/src folder. 6. In your C/C++ compiler preprocessor settings, add the wolfSSL directories to your include paths. Here's an example of the paths that must be added. diff --git a/IDE/WIN/user_settings.h b/IDE/WIN/user_settings.h index 7475aa87f..2722854a7 100644 --- a/IDE/WIN/user_settings.h +++ b/IDE/WIN/user_settings.h @@ -46,6 +46,7 @@ #define HAVE_SECURE_RENEGOTIATION #define HAVE_AESGCM + #define WOLFSSL_AESGCM_STREAM #define WOLFSSL_SHA384 #define WOLFSSL_SHA512 @@ -56,12 +57,19 @@ #define ECC_SHAMIR #define ECC_TIMING_RESISTANT + #define WOLFSSL_SP_X86_64 + #define SP_INT_BITS 4096 + /* Optional Performance Speedups */ #if 0 /* AESNI on x64 */ #ifdef _WIN64 #define HAVE_INTEL_RDSEED #define WOLFSSL_AESNI + #define HAVE_INTEL_AVX1 + #if 0 + #define HAVE_INTEL_AVX2 + #endif #endif /* Single Precision Support for RSA/DH 1024/2048/3072 and @@ -82,7 +90,6 @@ #define WOLFSSL_SP_X86_64_ASM #endif #endif - #else /* The servers and clients */ #define OPENSSL_EXTRA diff --git a/IDE/WIN/user_settings_dtls.h b/IDE/WIN/user_settings_dtls.h index 6059137f9..06880e413 100644 --- a/IDE/WIN/user_settings_dtls.h +++ b/IDE/WIN/user_settings_dtls.h @@ -51,6 +51,7 @@ #define HAVE_SECURE_RENEGOTIATION #define HAVE_AESGCM + #define WOLFSSL_AESGCM_STREAM #define WOLFSSL_SHA384 #define WOLFSSL_SHA512 @@ -61,12 +62,19 @@ #define ECC_SHAMIR #define ECC_TIMING_RESISTANT + #define WOLFSSL_SP_X86_64 + #define SP_INT_BITS 4096 + /* Optional Performance Speedups */ #if 0 /* AESNI on x64 */ #ifdef _WIN64 #define HAVE_INTEL_RDSEED #define WOLFSSL_AESNI + #define HAVE_INTEL_AVX1 + #if 0 + #define HAVE_INTEL_AVX2 + #endif #endif /* Single Precision Support for RSA/DH 1024/2048/3072 and diff --git a/IDE/WIN/wolfssl-fips.vcxproj b/IDE/WIN/wolfssl-fips.vcxproj index 614b1d1bb..3bfb4cd3b 100644 --- a/IDE/WIN/wolfssl-fips.vcxproj +++ b/IDE/WIN/wolfssl-fips.vcxproj @@ -331,6 +331,22 @@ $(IntDir)%(Filename).obj + + + false + false + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + $(IntDir)%(Filename).obj + + diff --git a/IDE/WIN10/wolfssl-fips.vcxproj b/IDE/WIN10/wolfssl-fips.vcxproj index f1640a1e6..92d1dda83 100644 --- a/IDE/WIN10/wolfssl-fips.vcxproj +++ b/IDE/WIN10/wolfssl-fips.vcxproj @@ -307,6 +307,22 @@ $(IntDir)%(Filename).obj + + + false + false + ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + $(IntDir)%(Filename).obj 
+ false + false + ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(IntDir)%(Filename).obj + $(IntDir)%(Filename).obj + + diff --git a/IDE/WORKBENCH/README.md b/IDE/WORKBENCH/README.md index 0052df85f..6020d6ac2 100644 --- a/IDE/WORKBENCH/README.md +++ b/IDE/WORKBENCH/README.md @@ -9,6 +9,7 @@ src and wolfcrypt directories. Uncheck the following: ``` wolfcrypt/src/aes_asm.asm + wolfcrypt/src/aes_gcm_asm.asm wolfcrypt/src/aes_asm.s examples/echoclient/ examples/echoserver/ diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 63928bafe..305cb1d84 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -4833,8 +4833,6 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len) #define HAVE_INTEL_AVX2 #endif /* USE_INTEL_SPEEDUP */ -#ifndef _MSC_VER - void AES_GCM_encrypt(const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, unsigned char *tag, word32 nbytes, @@ -4885,1146 +4883,6 @@ void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, #endif /* HAVE_INTEL_AVX1 */ #endif /* HAVE_AES_DECRYPT */ -#else /* _MSC_VER */ - -/* AESNI with Microsoft */ -#ifdef __clang__ - /* With Clang the __m128i in emmintrin.h is union using: - * "unsigned __int64 m128i_u64[2];" - * Notes: Must add "-maes -msse4.1 -mpclmul" to compiler flags. - * Must mark "aes_asm.asm" as included/compiled C file. - */ - #define M128_INIT(x,y) { (long long)x, (long long)y } -#else - /* Typically this is array of 16 int8's */ - #define S(w,z) ((char)((unsigned long long)(w) >> (8*(7-(z))) & 0xFF)) - #define M128_INIT(x,y) { S((x),7), S((x),6), S((x),5), S((x),4), \ - S((x),3), S((x),2), S((x),1), S((x),0), \ - S((y),7), S((y),6), S((y),5), S((y),4), \ - S((y),3), S((y),2), S((y),1), S((y),0) } -#endif - -static const __m128i MOD2_128 = - M128_INIT(0x1, (long long int)0xc200000000000000UL); - - -/* See Intel Carry-Less Multiplication Instruction - * and its Usage for Computing the GCM Mode White Paper - * by Shay Gueron, Intel Mobility Group, Israel Development Center; - * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research */ - - -/* Figure 9. AES-GCM - Encrypt With Single Block Ghash at a Time */ - -static const __m128i ONE = M128_INIT(0x0, 0x1); -#ifndef AES_GCM_AESNI_NO_UNROLL -static const __m128i TWO = M128_INIT(0x0, 0x2); -static const __m128i THREE = M128_INIT(0x0, 0x3); -static const __m128i FOUR = M128_INIT(0x0, 0x4); -static const __m128i FIVE = M128_INIT(0x0, 0x5); -static const __m128i SIX = M128_INIT(0x0, 0x6); -static const __m128i SEVEN = M128_INIT(0x0, 0x7); -static const __m128i EIGHT = M128_INIT(0x0, 0x8); -#endif -static const __m128i BSWAP_EPI64 = - M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f); -static const __m128i BSWAP_MASK = - M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607); - - -/* The following are for MSC based builds which do not allow - * inline assembly. Intrinsic functions are used instead. 
*/ - -#define aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T) \ -do \ -{ \ - word32 iv12[4]; \ - iv12[0] = *(word32*)&ivec[0]; \ - iv12[1] = *(word32*)&ivec[4]; \ - iv12[2] = *(word32*)&ivec[8]; \ - iv12[3] = 0x01000000; \ - Y = _mm_loadu_si128((__m128i*)iv12); \ - \ - /* (Compute E[ZERO, KS] and E[Y0, KS] together */ \ - tmp1 = _mm_load_si128(&KEY[0]); \ - tmp2 = _mm_xor_si128(Y, KEY[0]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); \ - lastKey = KEY[10]; \ - if (nr > 10) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); \ - lastKey = KEY[12]; \ - if (nr > 12) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); \ - lastKey = KEY[14]; \ - } \ - } \ - H = _mm_aesenclast_si128(tmp1, lastKey); \ - T = _mm_aesenclast_si128(tmp2, lastKey); \ - H = _mm_shuffle_epi8(H, BSWAP_MASK); \ -} \ -while (0) - - -#ifdef _M_X64 - /* 64-bit */ - #define AES_GCM_INSERT_EPI(tmp1, a, b) \ - tmp1 = _mm_insert_epi64(tmp1, ((word64)(a))*8, 0); \ - tmp1 = _mm_insert_epi64(tmp1, ((word64)(b))*8, 1); -#else - /* 32-bit */ - #define AES_GCM_INSERT_EPI(tmp1, a, b) \ - tmp1 = _mm_insert_epi32(tmp1, ((int)(a))*8, 0); \ - tmp1 = _mm_insert_epi32(tmp1, 0, 1); \ - tmp1 = _mm_insert_epi32(tmp1, ((int)(b))*8, 2); \ - tmp1 = _mm_insert_epi32(tmp1, 0, 3); -#endif - -#define aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T) \ -do \ -{ \ - if (ibytes % 16) { \ - i = ibytes / 16; \ - for (j=0; j < (int)(ibytes%16); j++) \ - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; \ - } \ - tmp1 = _mm_load_si128(&KEY[0]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ - lastKey = KEY[10]; \ - if (nr > 10) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ - lastKey = KEY[12]; \ - if (nr > 12) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ - lastKey = KEY[14]; \ - } \ - } \ - H = _mm_aesenclast_si128(tmp1, lastKey); \ - H = _mm_shuffle_epi8(H, BSWAP_MASK); \ - Y = _mm_setzero_si128(); \ - for (i=0; i < (int)(ibytes/16); i++) { \ - tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); \ - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); \ - Y = _mm_xor_si128(Y, tmp1); \ - Y = gfmul_sw(Y, H); \ - } \ - if (ibytes % 16) { \ - tmp1 = last_block; \ - tmp1 = _mm_shuffle_epi8(tmp1, 
BSWAP_MASK); \ - Y = _mm_xor_si128(Y, tmp1); \ - Y = gfmul_sw(Y, H); \ - } \ - AES_GCM_INSERT_EPI(tmp1, ibytes, 0); \ - Y = _mm_xor_si128(Y, tmp1); \ - Y = gfmul_sw(Y, H); \ - Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ \ - tmp1 = _mm_xor_si128(Y, KEY[0]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ - lastKey = KEY[10]; \ - if (nr > 10) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ - lastKey = KEY[12]; \ - if (nr > 12) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ - lastKey = KEY[14]; \ - } \ - } \ - T = _mm_aesenclast_si128(tmp1, lastKey); \ -} \ -while (0) - -#define AES_ENC_8(j) \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); \ - tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); \ - tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); \ - tmp5 = _mm_aesenc_si128(tmp5, KEY[j]); \ - tmp6 = _mm_aesenc_si128(tmp6, KEY[j]); \ - tmp7 = _mm_aesenc_si128(tmp7, KEY[j]); \ - tmp8 = _mm_aesenc_si128(tmp8, KEY[j]); - -#define AES_ENC_LAST_8() \ - tmp1 =_mm_aesenclast_si128(tmp1, lastKey); \ - tmp2 =_mm_aesenclast_si128(tmp2, lastKey); \ - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); \ - tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); \ - tmp3 =_mm_aesenclast_si128(tmp3, lastKey); \ - tmp4 =_mm_aesenclast_si128(tmp4, lastKey); \ - tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); \ - tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); \ - tmp5 =_mm_aesenclast_si128(tmp5, lastKey); \ - tmp6 =_mm_aesenclast_si128(tmp6, lastKey); \ - tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); \ - tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); \ - tmp7 =_mm_aesenclast_si128(tmp7, lastKey); \ - tmp8 =_mm_aesenclast_si128(tmp8, lastKey); \ - tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); \ - tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); - - -static WARN_UNUSED_RESULT __m128i gfmul_sw(__m128i a, __m128i b) -{ - __m128i r, t1, t2, t3, t4, t5, t6, t7; - t2 = _mm_shuffle_epi32(b, 78); - t3 = _mm_shuffle_epi32(a, 78); - t2 = _mm_xor_si128(t2, b); - t3 = _mm_xor_si128(t3, a); - t4 = _mm_clmulepi64_si128(b, a, 0x11); - t1 = _mm_clmulepi64_si128(b, a, 0x00); - t2 = _mm_clmulepi64_si128(t2, t3, 0x00); - t2 = _mm_xor_si128(t2, t1); - t2 = _mm_xor_si128(t2, t4); - t3 = _mm_slli_si128(t2, 8); - t2 = _mm_srli_si128(t2, 8); - t1 = _mm_xor_si128(t1, t3); - t4 = _mm_xor_si128(t4, t2); - - t5 = _mm_srli_epi32(t1, 31); - t6 = _mm_srli_epi32(t4, 31); - t1 = _mm_slli_epi32(t1, 1); - t4 = _mm_slli_epi32(t4, 1); - t7 = _mm_srli_si128(t5, 12); - t5 = _mm_slli_si128(t5, 4); - t6 = _mm_slli_si128(t6, 
4); - t4 = _mm_or_si128(t4, t7); - t1 = _mm_or_si128(t1, t5); - t4 = _mm_or_si128(t4, t6); - - t5 = _mm_slli_epi32(t1, 31); - t6 = _mm_slli_epi32(t1, 30); - t7 = _mm_slli_epi32(t1, 25); - t5 = _mm_xor_si128(t5, t6); - t5 = _mm_xor_si128(t5, t7); - - t6 = _mm_srli_si128(t5, 4); - t5 = _mm_slli_si128(t5, 12); - t1 = _mm_xor_si128(t1, t5); - t7 = _mm_srli_epi32(t1, 1); - t3 = _mm_srli_epi32(t1, 2); - t2 = _mm_srli_epi32(t1, 7); - - t7 = _mm_xor_si128(t7, t3); - t7 = _mm_xor_si128(t7, t2); - t7 = _mm_xor_si128(t7, t6); - t7 = _mm_xor_si128(t7, t1); - r = _mm_xor_si128(t4, t7); - - return r; -} - -static void gfmul_only(__m128i a, __m128i b, __m128i* r0, __m128i* r1) -{ - __m128i t1, t2, t3, t4; - - /* 128 x 128 Carryless Multiply */ - t2 = _mm_shuffle_epi32(b, 78); - t3 = _mm_shuffle_epi32(a, 78); - t2 = _mm_xor_si128(t2, b); - t3 = _mm_xor_si128(t3, a); - t4 = _mm_clmulepi64_si128(b, a, 0x11); - t1 = _mm_clmulepi64_si128(b, a, 0x00); - t2 = _mm_clmulepi64_si128(t2, t3, 0x00); - t2 = _mm_xor_si128(t2, t1); - t2 = _mm_xor_si128(t2, t4); - t3 = _mm_slli_si128(t2, 8); - t2 = _mm_srli_si128(t2, 8); - t1 = _mm_xor_si128(t1, t3); - t4 = _mm_xor_si128(t4, t2); - *r0 = _mm_xor_si128(t1, *r0); - *r1 = _mm_xor_si128(t4, *r1); -} - -static WARN_UNUSED_RESULT __m128i gfmul_shl1(__m128i a) -{ - __m128i t1 = a, t2; - t2 = _mm_srli_epi64(t1, 63); - t1 = _mm_slli_epi64(t1, 1); - t2 = _mm_slli_si128(t2, 8); - t1 = _mm_or_si128(t1, t2); - /* if (a[1] >> 63) t1 = _mm_xor_si128(t1, MOD2_128); */ - a = _mm_shuffle_epi32(a, 0xff); - a = _mm_srai_epi32(a, 31); - a = _mm_and_si128(a, MOD2_128); - t1 = _mm_xor_si128(t1, a); - return t1; -} - -static WARN_UNUSED_RESULT __m128i ghash_red(__m128i r0, __m128i r1) -{ - __m128i t2, t3; - __m128i t5, t6, t7; - - t5 = _mm_slli_epi32(r0, 31); - t6 = _mm_slli_epi32(r0, 30); - t7 = _mm_slli_epi32(r0, 25); - t5 = _mm_xor_si128(t5, t6); - t5 = _mm_xor_si128(t5, t7); - - t6 = _mm_srli_si128(t5, 4); - t5 = _mm_slli_si128(t5, 12); - r0 = _mm_xor_si128(r0, t5); - t7 = _mm_srli_epi32(r0, 1); - t3 = _mm_srli_epi32(r0, 2); - t2 = _mm_srli_epi32(r0, 7); - - t7 = _mm_xor_si128(t7, t3); - t7 = _mm_xor_si128(t7, t2); - t7 = _mm_xor_si128(t7, t6); - t7 = _mm_xor_si128(t7, r0); - return _mm_xor_si128(r1, t7); -} - -static WARN_UNUSED_RESULT __m128i gfmul_shifted(__m128i a, __m128i b) -{ - __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); - gfmul_only(a, b, &t0, &t1); - return ghash_red(t0, t1); -} - -#ifndef AES_GCM_AESNI_NO_UNROLL -static WARN_UNUSED_RESULT __m128i gfmul8( - __m128i a1, __m128i a2, __m128i a3, __m128i a4, - __m128i a5, __m128i a6, __m128i a7, __m128i a8, - __m128i b1, __m128i b2, __m128i b3, __m128i b4, - __m128i b5, __m128i b6, __m128i b7, __m128i b8) -{ - __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); - gfmul_only(a1, b8, &t0, &t1); - gfmul_only(a2, b7, &t0, &t1); - gfmul_only(a3, b6, &t0, &t1); - gfmul_only(a4, b5, &t0, &t1); - gfmul_only(a5, b4, &t0, &t1); - gfmul_only(a6, b3, &t0, &t1); - gfmul_only(a7, b2, &t0, &t1); - gfmul_only(a8, b1, &t0, &t1); - return ghash_red(t0, t1); -} -#endif - - -static WARN_UNUSED_RESULT int AES_GCM_encrypt( - const unsigned char *in, unsigned char *out, - const unsigned char* addt, - const unsigned char* ivec, unsigned char *tag, - word32 nbytes, word32 abytes, word32 ibytes, - word32 tbytes, const unsigned char* key, int nr) -{ - int i, j ,k; - __m128i ctr1; - __m128i H, Y, T; - __m128i X = _mm_setzero_si128(); - __m128i *KEY = (__m128i*)key, lastKey; - __m128i last_block = _mm_setzero_si128(); - __m128i tmp1, tmp2; 
-#ifndef AES_GCM_AESNI_NO_UNROLL - __m128i HT[8]; - __m128i r0, r1; - __m128i XV; - __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; -#endif - - if (ibytes == GCM_NONCE_MID_SZ) - aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T); - else - aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T); - - for (i=0; i < (int)(abytes/16); i++) { - tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X = _mm_xor_si128(X, tmp1); - X = gfmul_sw(X, H); - } - if (abytes%16) { - last_block = _mm_setzero_si128(); - for (j=0; j < (int)(abytes%16); j++) - ((unsigned char*)&last_block)[j] = addt[i*16+j]; - tmp1 = last_block; - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X = _mm_xor_si128(X, tmp1); - X = gfmul_sw(X, H); - } - tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); - ctr1 = _mm_add_epi32(tmp1, ONE); - H = gfmul_shl1(H); - -#ifndef AES_GCM_AESNI_NO_UNROLL - i = 0; - if (nbytes >= 16*8) { - HT[0] = H; - HT[1] = gfmul_shifted(H, H); - HT[2] = gfmul_shifted(H, HT[1]); - HT[3] = gfmul_shifted(HT[1], HT[1]); - HT[4] = gfmul_shifted(HT[1], HT[2]); - HT[5] = gfmul_shifted(HT[2], HT[2]); - HT[6] = gfmul_shifted(HT[2], HT[3]); - HT[7] = gfmul_shifted(HT[3], HT[3]); - - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - tmp2 = _mm_add_epi32(ctr1, ONE); - tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); - tmp3 = _mm_add_epi32(ctr1, TWO); - tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); - tmp4 = _mm_add_epi32(ctr1, THREE); - tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); - tmp5 = _mm_add_epi32(ctr1, FOUR); - tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); - tmp6 = _mm_add_epi32(ctr1, FIVE); - tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); - tmp7 = _mm_add_epi32(ctr1, SIX); - tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); - tmp8 = _mm_add_epi32(ctr1, SEVEN); - tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, EIGHT); - tmp1 =_mm_xor_si128(tmp1, KEY[0]); - tmp2 =_mm_xor_si128(tmp2, KEY[0]); - tmp3 =_mm_xor_si128(tmp3, KEY[0]); - tmp4 =_mm_xor_si128(tmp4, KEY[0]); - tmp5 =_mm_xor_si128(tmp5, KEY[0]); - tmp6 =_mm_xor_si128(tmp6, KEY[0]); - tmp7 =_mm_xor_si128(tmp7, KEY[0]); - tmp8 =_mm_xor_si128(tmp8, KEY[0]); - AES_ENC_8(1); - AES_ENC_8(2); - AES_ENC_8(3); - AES_ENC_8(4); - AES_ENC_8(5); - AES_ENC_8(6); - AES_ENC_8(7); - AES_ENC_8(8); - AES_ENC_8(9); - lastKey = KEY[10]; - if (nr > 10) { - AES_ENC_8(10); - AES_ENC_8(11); - lastKey = KEY[12]; - if (nr > 12) { - AES_ENC_8(12); - AES_ENC_8(13); - lastKey = KEY[14]; - } - } - AES_ENC_LAST_8(); - - for (i=1; i < (int)(nbytes/16/8); i++) { - r0 = _mm_setzero_si128(); - r1 = _mm_setzero_si128(); - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - tmp2 = _mm_add_epi32(ctr1, ONE); - tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); - tmp3 = _mm_add_epi32(ctr1, TWO); - tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); - tmp4 = _mm_add_epi32(ctr1, THREE); - tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); - tmp5 = _mm_add_epi32(ctr1, FOUR); - tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); - tmp6 = _mm_add_epi32(ctr1, FIVE); - tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); - tmp7 = _mm_add_epi32(ctr1, SIX); - tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); - tmp8 = _mm_add_epi32(ctr1, SEVEN); - tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, EIGHT); - tmp1 =_mm_xor_si128(tmp1, KEY[0]); - tmp2 =_mm_xor_si128(tmp2, KEY[0]); - tmp3 =_mm_xor_si128(tmp3, KEY[0]); - tmp4 =_mm_xor_si128(tmp4, KEY[0]); - tmp5 =_mm_xor_si128(tmp5, KEY[0]); - tmp6 =_mm_xor_si128(tmp6, KEY[0]); - tmp7 =_mm_xor_si128(tmp7, KEY[0]); - tmp8 =_mm_xor_si128(tmp8, KEY[0]); - /* 128 x 128 Carryless Multiply 
*/ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+0]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - XV = _mm_xor_si128(XV, X); - gfmul_only(XV, HT[7], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+1]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[6], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+2]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[5], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+3]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[4], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+4]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[3], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+5]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[2], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+6]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[1], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+7]); - XV = 
_mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[0], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); - /* Reduction */ - X = ghash_red(r0, r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); - lastKey = KEY[14]; - } - } - AES_ENC_LAST_8(); - } - - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); - tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); - tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); - tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); - tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); - tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); - tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); - tmp1 = _mm_xor_si128(X, tmp1); - X = gfmul8(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, - HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); - } - for (k = i*8; k < (int)(nbytes/16); k++) { - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, ONE); - tmp1 = _mm_xor_si128(tmp1, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = 
KEY[14]; - } - } - tmp1 = _mm_aesenclast_si128(tmp1, lastKey); - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); - _mm_storeu_si128(&((__m128i*)out)[k], tmp1); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X =_mm_xor_si128(X, tmp1); - X = gfmul_shifted(X, H); - } -#else /* AES_GCM_AESNI_NO_UNROLL */ - for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, ONE); - tmp1 = _mm_xor_si128(tmp1, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - tmp1 = _mm_aesenclast_si128(tmp1, lastKey); - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); - _mm_storeu_si128(&((__m128i*)out)[k], tmp1); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X =_mm_xor_si128(X, tmp1); - } - for (; k < (int)(nbytes/16); k++) { - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, ONE); - tmp1 = _mm_xor_si128(tmp1, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - X = gfmul_shifted(X, H); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - tmp1 = _mm_aesenclast_si128(tmp1, lastKey); - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); - _mm_storeu_si128(&((__m128i*)out)[k], tmp1); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X =_mm_xor_si128(X, tmp1); - } - if (k > 0) { - X = gfmul_shifted(X, H); - } -#endif /* AES_GCM_AESNI_NO_UNROLL */ - - /* If one partial block remains */ - if (nbytes % 16) { - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - tmp1 = _mm_xor_si128(tmp1, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - tmp1 = _mm_aesenclast_si128(tmp1, lastKey); - last_block = tmp1; - for (j=0; j < (int)(nbytes%16); j++) - ((unsigned char*)&last_block)[j] = in[k*16+j]; - tmp1 = _mm_xor_si128(tmp1, last_block); - last_block = tmp1; - for (j=0; j < 
(int)(nbytes%16); j++) - out[k*16+j] = ((unsigned char*)&last_block)[j]; - tmp1 = last_block; - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X =_mm_xor_si128(X, tmp1); - X = gfmul_shifted(X, H); - } - AES_GCM_INSERT_EPI(tmp1, nbytes, abytes); - X = _mm_xor_si128(X, tmp1); - X = gfmul_shifted(X, H); - X = _mm_shuffle_epi8(X, BSWAP_MASK); - T = _mm_xor_si128(X, T); - /*_mm_storeu_si128((__m128i*)tag, T);*/ - XMEMCPY(tag, &T, tbytes); - ForceZero(&lastKey, sizeof(lastKey)); - - return 0; -} - -#ifdef HAVE_AES_DECRYPT - -static WARN_UNUSED_RESULT int AES_GCM_decrypt( - const unsigned char *in, unsigned char *out, - const unsigned char* addt, - const unsigned char* ivec, const unsigned char *tag, - word32 nbytes, word32 abytes, word32 ibytes, - word32 tbytes, const unsigned char* key, int nr, - int* res) -{ - int i, j ,k; - __m128i H, Y, T; - __m128i *KEY = (__m128i*)key, lastKey; - __m128i ctr1; - __m128i last_block = _mm_setzero_si128(); - __m128i X = _mm_setzero_si128(); - __m128i tmp1, tmp2, XV; -#ifndef AES_GCM_AESNI_NO_UNROLL - __m128i HT[8]; - __m128i r0, r1; - __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; -#endif /* AES_GCM_AESNI_NO_UNROLL */ - - if (ibytes == GCM_NONCE_MID_SZ) - aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T); - else - aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T); - - for (i=0; i<(int)(abytes/16); i++) { - tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X = _mm_xor_si128(X, tmp1); - X = gfmul_sw(X, H); - } - if (abytes%16) { - last_block = _mm_setzero_si128(); - for (j=0; j<(int)(abytes%16); j++) - ((unsigned char*)&last_block)[j] = addt[i*16+j]; - tmp1 = last_block; - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); - X = _mm_xor_si128(X, tmp1); - X = gfmul_sw(X, H); - } - - tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); - ctr1 = _mm_add_epi32(tmp1, ONE); - H = gfmul_shl1(H); - i = 0; - -#ifndef AES_GCM_AESNI_NO_UNROLL - - if (0 < nbytes/16/8) { - HT[0] = H; - HT[1] = gfmul_shifted(H, H); - HT[2] = gfmul_shifted(H, HT[1]); - HT[3] = gfmul_shifted(HT[1], HT[1]); - HT[4] = gfmul_shifted(HT[1], HT[2]); - HT[5] = gfmul_shifted(HT[2], HT[2]); - HT[6] = gfmul_shifted(HT[2], HT[3]); - HT[7] = gfmul_shifted(HT[3], HT[3]); - - for (; i < (int)(nbytes/16/8); i++) { - r0 = _mm_setzero_si128(); - r1 = _mm_setzero_si128(); - - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - tmp2 = _mm_add_epi32(ctr1, ONE); - tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); - tmp3 = _mm_add_epi32(ctr1, TWO); - tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); - tmp4 = _mm_add_epi32(ctr1, THREE); - tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); - tmp5 = _mm_add_epi32(ctr1, FOUR); - tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); - tmp6 = _mm_add_epi32(ctr1, FIVE); - tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); - tmp7 = _mm_add_epi32(ctr1, SIX); - tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); - tmp8 = _mm_add_epi32(ctr1, SEVEN); - tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, EIGHT); - tmp1 =_mm_xor_si128(tmp1, KEY[0]); - tmp2 =_mm_xor_si128(tmp2, KEY[0]); - tmp3 =_mm_xor_si128(tmp3, KEY[0]); - tmp4 =_mm_xor_si128(tmp4, KEY[0]); - tmp5 =_mm_xor_si128(tmp5, KEY[0]); - tmp6 =_mm_xor_si128(tmp6, KEY[0]); - tmp7 =_mm_xor_si128(tmp7, KEY[0]); - tmp8 =_mm_xor_si128(tmp8, KEY[0]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+0]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - XV = _mm_xor_si128(XV, X); - gfmul_only(XV, HT[7], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); - tmp3 = 
_mm_aesenc_si128(tmp3, KEY[1]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+1]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[6], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+2]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[5], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+3]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[4], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+4]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[3], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+5]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[2], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+6]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[1], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[i*8+7]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - gfmul_only(XV, HT[0], &r0, &r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); - tmp6 = 
_mm_aesenc_si128(tmp6, KEY[8]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); - /* Reduction */ - X = ghash_red(r0, r1); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); - tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); - tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); - tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); - tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); - tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); - tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); - lastKey = KEY[14]; - } - } - AES_ENC_LAST_8(); - } - } - -#endif /* AES_GCM_AESNI_NO_UNROLL */ - - for (k = i*8; k < (int)(nbytes/16); k++) { - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - ctr1 = _mm_add_epi32(ctr1, ONE); - tmp1 = _mm_xor_si128(tmp1, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - /* 128 x 128 Carryless Multiply */ - XV = _mm_loadu_si128(&((__m128i*)in)[k]); - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - XV = _mm_xor_si128(XV, X); - X = gfmul_shifted(XV, H); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - tmp1 = _mm_aesenclast_si128(tmp1, lastKey); - tmp2 = _mm_loadu_si128(&((__m128i*)in)[k]); - tmp1 = _mm_xor_si128(tmp1, tmp2); - _mm_storeu_si128(&((__m128i*)out)[k], tmp1); - } - - /* If one partial block remains */ - if (nbytes % 16) { - tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); - tmp1 = _mm_xor_si128(tmp1, KEY[0]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); - tmp1 = _mm_aesenc_si128(tmp1, 
KEY[6]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); - lastKey = KEY[10]; - if (nr > 10) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); - lastKey = KEY[12]; - if (nr > 12) { - tmp1 = _mm_aesenc_si128(tmp1, lastKey); - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); - lastKey = KEY[14]; - } - } - tmp1 = _mm_aesenclast_si128(tmp1, lastKey); - last_block = _mm_setzero_si128(); - for (j=0; j < (int)(nbytes%16); j++) - ((unsigned char*)&last_block)[j] = in[k*16+j]; - XV = last_block; - tmp1 = _mm_xor_si128(tmp1, last_block); - last_block = tmp1; - for (j=0; j < (int)(nbytes%16); j++) - out[k*16+j] = ((unsigned char*)&last_block)[j]; - XV = _mm_shuffle_epi8(XV, BSWAP_MASK); - XV = _mm_xor_si128(XV, X); - X = gfmul_shifted(XV, H); - } - - AES_GCM_INSERT_EPI(tmp1, nbytes, abytes); - - /* 128 x 128 Carryless Multiply */ - X = _mm_xor_si128(X, tmp1); - X = gfmul_shifted(X, H); - X = _mm_shuffle_epi8(X, BSWAP_MASK); - T = _mm_xor_si128(X, T); - -/* if (0xffff != - _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) */ - if (XMEMCMP(tag, &T, tbytes) != 0) - *res = 0; /* in case the authentication failed */ - else - *res = 1; /* when successful returns 1 */ - ForceZero(&lastKey, sizeof(lastKey)); - - return 0; -} - -#endif /* HAVE_AES_DECRYPT */ -#endif /* _MSC_VER */ #endif /* WOLFSSL_AESNI */ #if defined(GCM_SMALL) @@ -7255,7 +6113,7 @@ static void GHASH_UPDATE(Aes* aes, const byte* a, word32 aSz, const byte* c, /* Calculate amount we can use - fill up the block. */ byte sz = AES_BLOCK_SIZE - aes->aOver; if (sz > aSz) { - sz = aSz; + sz = (byte)aSz; } /* Copy extra into last GHASH block array and update count. */ XMEMCPY(AES_LASTGBLOCK(aes) + aes->aOver, a, sz); @@ -7304,7 +6162,7 @@ static void GHASH_UPDATE(Aes* aes, const byte* a, word32 aSz, const byte* c, /* Calculate amount we can use - fill up the block. */ byte sz = AES_BLOCK_SIZE - aes->cOver; if (sz > cSz) { - sz = cSz; + sz = (byte)cSz; } XMEMCPY(AES_LASTGBLOCK(aes) + aes->cOver, c, sz); /* Update count of unsed encrypted counter. */ @@ -8505,7 +7363,7 @@ static WARN_UNUSED_RESULT int AesGcmCryptUpdate_C( /* Check if previous encrypted block was not used up. */ if (aes->over > 0) { byte pSz = AES_BLOCK_SIZE - aes->over; - if (pSz > sz) pSz = sz; + if (pSz > sz) pSz = (byte)sz; /* Use some/all of last encrypted block. */ xorbufout(out, AES_LASTBLOCK(aes) + aes->over, in, pSz); @@ -8572,7 +7430,7 @@ static WARN_UNUSED_RESULT int AesGcmCryptUpdate_C( /* XOR plain text into encrypted counter into cipher text buffer. */ xorbufout(out, AES_LASTBLOCK(aes), in, partial); /* Keep amount of encrypted block used. 
*/ - aes->over = partial; + aes->over = (byte)partial; } return 0; diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index 7624bd9a7..82db934af 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -266,7 +266,7 @@ L_AES_GCM_encrypt_calc_iv_12_last: aesenclast %xmm7, %xmm5 aesenclast %xmm7, %xmm1 pshufb L_aes_gcm_bswap_mask(%rip), %xmm5 - movdqa %xmm1, 144(%rsp) + movdqu %xmm1, 144(%rsp) jmp L_AES_GCM_encrypt_iv_done L_AES_GCM_encrypt_iv_not_12: # Calculate values when IV is not 12 bytes @@ -371,7 +371,7 @@ L_AES_GCM_encrypt_calc_iv_lt16: subq $16, %rsp pxor %xmm8, %xmm8 xorl %ebx, %ebx - movdqa %xmm8, (%rsp) + movdqu %xmm8, (%rsp) L_AES_GCM_encrypt_calc_iv_loop: movzbl (%rax,%rcx,1), %r13d movb %r13b, (%rsp,%rbx,1) @@ -379,7 +379,7 @@ L_AES_GCM_encrypt_calc_iv_loop: incl %ebx cmpl %edx, %ecx jl L_AES_GCM_encrypt_calc_iv_loop - movdqa (%rsp), %xmm8 + movdqu (%rsp), %xmm8 addq $16, %rsp pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 pxor %xmm8, %xmm4 @@ -523,7 +523,7 @@ L_AES_GCM_encrypt_calc_iv_done: movdqa 224(%r15), %xmm9 L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last: aesenclast %xmm9, %xmm8 - movdqa %xmm8, 144(%rsp) + movdqu %xmm8, 144(%rsp) L_AES_GCM_encrypt_iv_done: # Additional authentication data movl %r11d, %edx @@ -601,7 +601,7 @@ L_AES_GCM_encrypt_calc_aad_lt16: subq $16, %rsp pxor %xmm8, %xmm8 xorl %ebx, %ebx - movdqa %xmm8, (%rsp) + movdqu %xmm8, (%rsp) L_AES_GCM_encrypt_calc_aad_loop: movzbl (%r12,%rcx,1), %r13d movb %r13b, (%rsp,%rbx,1) @@ -609,7 +609,7 @@ L_AES_GCM_encrypt_calc_aad_loop: incl %ebx cmpl %edx, %ecx jl L_AES_GCM_encrypt_calc_aad_loop - movdqa (%rsp), %xmm8 + movdqu (%rsp), %xmm8 addq $16, %rsp pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 pxor %xmm8, %xmm6 @@ -673,7 +673,7 @@ L_AES_GCM_encrypt_calc_aad_done: movdqa %xmm5, %xmm9 paddd L_aes_gcm_one(%rip), %xmm4 movdqa %xmm5, %xmm8 - movdqa %xmm4, 128(%rsp) + movdqu %xmm4, 128(%rsp) psrlq $63, %xmm9 psllq $0x01, %xmm8 pslldq $8, %xmm9 @@ -689,7 +689,7 @@ L_AES_GCM_encrypt_calc_aad_done: andl $0xffffff80, %r13d movdqa %xmm6, %xmm2 # H ^ 1 - movdqa %xmm5, (%rsp) + movdqu %xmm5, (%rsp) # H ^ 2 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm5, %xmm10 @@ -731,7 +731,7 @@ L_AES_GCM_encrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm0 - movdqa %xmm0, 16(%rsp) + movdqu %xmm0, 16(%rsp) # H ^ 3 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -773,7 +773,7 @@ L_AES_GCM_encrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm1 - movdqa %xmm1, 32(%rsp) + movdqu %xmm1, 32(%rsp) # H ^ 4 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -815,7 +815,7 @@ L_AES_GCM_encrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm3 - movdqa %xmm3, 48(%rsp) + movdqu %xmm3, 48(%rsp) # H ^ 5 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -857,7 +857,7 @@ L_AES_GCM_encrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 64(%rsp) + movdqu %xmm7, 64(%rsp) # H ^ 6 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -899,7 +899,7 @@ L_AES_GCM_encrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 80(%rsp) + movdqu %xmm7, 80(%rsp) # H ^ 7 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -941,7 +941,7 @@ L_AES_GCM_encrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 96(%rsp) + movdqu %xmm7, 96(%rsp) # H ^ 8 pshufd $0x4e, %xmm3, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -983,9 +983,9 @@ 
L_AES_GCM_encrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 112(%rsp) + movdqu %xmm7, 112(%rsp) # First 128 bytes of input - movdqa 128(%rsp), %xmm8 + movdqu 128(%rsp), %xmm8 movdqa L_aes_gcm_bswap_epi64(%rip), %xmm1 movdqa %xmm8, %xmm0 pshufb %xmm1, %xmm8 @@ -1012,7 +1012,7 @@ L_AES_GCM_encrypt_calc_aad_done: pshufb %xmm1, %xmm15 paddd L_aes_gcm_eight(%rip), %xmm0 movdqa (%r15), %xmm7 - movdqa %xmm0, 128(%rsp) + movdqu %xmm0, 128(%rsp) pxor %xmm7, %xmm8 pxor %xmm7, %xmm9 pxor %xmm7, %xmm10 @@ -1183,7 +1183,7 @@ L_AES_GCM_encrypt_enc_done: L_AES_GCM_encrypt_ghash_128: leaq (%rdi,%rbx,1), %rcx leaq (%rsi,%rbx,1), %rdx - movdqa 128(%rsp), %xmm8 + movdqu 128(%rsp), %xmm8 movdqa L_aes_gcm_bswap_epi64(%rip), %xmm1 movdqa %xmm8, %xmm0 pshufb %xmm1, %xmm8 @@ -1210,7 +1210,7 @@ L_AES_GCM_encrypt_ghash_128: pshufb %xmm1, %xmm15 paddd L_aes_gcm_eight(%rip), %xmm0 movdqa (%r15), %xmm7 - movdqa %xmm0, 128(%rsp) + movdqu %xmm0, 128(%rsp) pxor %xmm7, %xmm8 pxor %xmm7, %xmm9 pxor %xmm7, %xmm10 @@ -1219,7 +1219,7 @@ L_AES_GCM_encrypt_ghash_128: pxor %xmm7, %xmm13 pxor %xmm7, %xmm14 pxor %xmm7, %xmm15 - movdqa 112(%rsp), %xmm7 + movdqu 112(%rsp), %xmm7 movdqu -128(%rdx), %xmm0 aesenc 16(%r15), %xmm8 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1242,7 +1242,7 @@ L_AES_GCM_encrypt_ghash_128: aesenc 16(%r15), %xmm15 pxor %xmm2, %xmm1 pxor %xmm3, %xmm1 - movdqa 96(%rsp), %xmm7 + movdqu 96(%rsp), %xmm7 movdqu -112(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1266,7 +1266,7 @@ L_AES_GCM_encrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 80(%rsp), %xmm7 + movdqu 80(%rsp), %xmm7 movdqu -96(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1290,7 +1290,7 @@ L_AES_GCM_encrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 64(%rsp), %xmm7 + movdqu 64(%rsp), %xmm7 movdqu -80(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1314,7 +1314,7 @@ L_AES_GCM_encrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 48(%rsp), %xmm7 + movdqu 48(%rsp), %xmm7 movdqu -64(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1338,7 +1338,7 @@ L_AES_GCM_encrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 32(%rsp), %xmm7 + movdqu 32(%rsp), %xmm7 movdqu -48(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1362,7 +1362,7 @@ L_AES_GCM_encrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 16(%rsp), %xmm7 + movdqu 16(%rsp), %xmm7 movdqu -32(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1386,7 +1386,7 @@ L_AES_GCM_encrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa (%rsp), %xmm7 + movdqu (%rsp), %xmm7 movdqu -16(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -1534,7 +1534,7 @@ L_AES_GCM_encrypt_end_128: pshufb %xmm4, %xmm13 pshufb %xmm4, %xmm14 pshufb %xmm4, %xmm15 - movdqa 112(%rsp), %xmm7 + movdqu 112(%rsp), %xmm7 pshufd $0x4e, %xmm8, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1553,7 +1553,7 @@ L_AES_GCM_encrypt_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 96(%rsp), %xmm7 + movdqu 96(%rsp), %xmm7 pshufd $0x4e, %xmm9, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1572,7 +1572,7 @@ L_AES_GCM_encrypt_end_128: psrldq $8, %xmm1 
pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 80(%rsp), %xmm7 + movdqu 80(%rsp), %xmm7 pshufd $0x4e, %xmm10, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1591,7 +1591,7 @@ L_AES_GCM_encrypt_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 64(%rsp), %xmm7 + movdqu 64(%rsp), %xmm7 pshufd $0x4e, %xmm11, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1610,7 +1610,7 @@ L_AES_GCM_encrypt_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 48(%rsp), %xmm7 + movdqu 48(%rsp), %xmm7 pshufd $0x4e, %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1629,7 +1629,7 @@ L_AES_GCM_encrypt_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 32(%rsp), %xmm7 + movdqu 32(%rsp), %xmm7 pshufd $0x4e, %xmm13, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1648,7 +1648,7 @@ L_AES_GCM_encrypt_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 16(%rsp), %xmm7 + movdqu 16(%rsp), %xmm7 pshufd $0x4e, %xmm14, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1667,7 +1667,7 @@ L_AES_GCM_encrypt_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa (%rsp), %xmm7 + movdqu (%rsp), %xmm7 pshufd $0x4e, %xmm15, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -1709,7 +1709,7 @@ L_AES_GCM_encrypt_end_128: pxor %xmm1, %xmm2 pxor %xmm4, %xmm2 pxor %xmm2, %xmm6 - movdqa (%rsp), %xmm5 + movdqu (%rsp), %xmm5 L_AES_GCM_encrypt_done_128: movl %r9d, %edx cmpl %edx, %ebx @@ -1720,12 +1720,12 @@ L_AES_GCM_encrypt_done_128: jge L_AES_GCM_encrypt_last_block_done leaq (%rdi,%rbx,1), %rcx leaq (%rsi,%rbx,1), %rdx - movdqa 128(%rsp), %xmm8 + movdqu 128(%rsp), %xmm8 movdqa %xmm8, %xmm9 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 paddd L_aes_gcm_one(%rip), %xmm9 pxor (%r15), %xmm8 - movdqa %xmm9, 128(%rsp) + movdqu %xmm9, 128(%rsp) aesenc 16(%r15), %xmm8 aesenc 32(%r15), %xmm8 aesenc 48(%r15), %xmm8 @@ -1759,12 +1759,12 @@ L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last: L_AES_GCM_encrypt_last_block_start: leaq (%rdi,%rbx,1), %rcx leaq (%rsi,%rbx,1), %rdx - movdqa 128(%rsp), %xmm8 + movdqu 128(%rsp), %xmm8 movdqa %xmm8, %xmm9 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 paddd L_aes_gcm_one(%rip), %xmm9 pxor (%r15), %xmm8 - movdqa %xmm9, 128(%rsp) + movdqu %xmm9, 128(%rsp) movdqa %xmm6, %xmm10 pclmulqdq $16, %xmm5, %xmm10 aesenc 16(%r15), %xmm8 @@ -1866,7 +1866,7 @@ L_AES_GCM_encrypt_last_block_done: movl %ecx, %edx andl $15, %ecx jz L_AES_GCM_encrypt_aesenc_last15_enc_avx_done - movdqa 128(%rsp), %xmm4 + movdqu 128(%rsp), %xmm4 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm4 pxor (%r15), %xmm4 aesenc 16(%r15), %xmm4 @@ -1893,7 +1893,7 @@ L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last: aesenclast %xmm9, %xmm4 subq $16, %rsp xorl %ecx, %ecx - movdqa %xmm4, (%rsp) + movdqu %xmm4, (%rsp) L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop: movzbl (%rdi,%rbx,1), %r13d xorb (%rsp,%rcx,1), %r13b @@ -1912,7 +1912,7 @@ L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop: cmpl $16, %ecx jl L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc: - movdqa (%rsp), %xmm4 + movdqu (%rsp), %xmm4 addq $16, %rsp pshufb L_aes_gcm_bswap_mask(%rip), %xmm4 pxor %xmm4, %xmm6 @@ -2006,12 +2006,12 @@ L_AES_GCM_encrypt_done_enc: pxor %xmm8, %xmm14 pxor %xmm14, %xmm6 pshufb L_aes_gcm_bswap_mask(%rip), %xmm6 - movdqa 144(%rsp), %xmm0 + movdqu 144(%rsp), %xmm0 pxor %xmm6, %xmm0 cmpl $16, %r14d je L_AES_GCM_encrypt_store_tag_16 xorq %rcx, %rcx - movdqa %xmm0, (%rsp) + movdqu %xmm0, (%rsp) 
L_AES_GCM_encrypt_store_tag_loop: movzbl (%rsp,%rcx,1), %r13d movb %r13b, (%r8,%rcx,1) @@ -2122,7 +2122,7 @@ L_AES_GCM_decrypt_calc_iv_12_last: aesenclast %xmm7, %xmm5 aesenclast %xmm7, %xmm1 pshufb L_aes_gcm_bswap_mask(%rip), %xmm5 - movdqa %xmm1, 144(%rsp) + movdqu %xmm1, 144(%rsp) jmp L_AES_GCM_decrypt_iv_done L_AES_GCM_decrypt_iv_not_12: # Calculate values when IV is not 12 bytes @@ -2227,7 +2227,7 @@ L_AES_GCM_decrypt_calc_iv_lt16: subq $16, %rsp pxor %xmm8, %xmm8 xorl %ebx, %ebx - movdqa %xmm8, (%rsp) + movdqu %xmm8, (%rsp) L_AES_GCM_decrypt_calc_iv_loop: movzbl (%rax,%rcx,1), %r13d movb %r13b, (%rsp,%rbx,1) @@ -2235,7 +2235,7 @@ L_AES_GCM_decrypt_calc_iv_loop: incl %ebx cmpl %edx, %ecx jl L_AES_GCM_decrypt_calc_iv_loop - movdqa (%rsp), %xmm8 + movdqu (%rsp), %xmm8 addq $16, %rsp pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 pxor %xmm8, %xmm4 @@ -2379,7 +2379,7 @@ L_AES_GCM_decrypt_calc_iv_done: movdqa 224(%r15), %xmm9 L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last: aesenclast %xmm9, %xmm8 - movdqa %xmm8, 144(%rsp) + movdqu %xmm8, 144(%rsp) L_AES_GCM_decrypt_iv_done: # Additional authentication data movl %r11d, %edx @@ -2457,7 +2457,7 @@ L_AES_GCM_decrypt_calc_aad_lt16: subq $16, %rsp pxor %xmm8, %xmm8 xorl %ebx, %ebx - movdqa %xmm8, (%rsp) + movdqu %xmm8, (%rsp) L_AES_GCM_decrypt_calc_aad_loop: movzbl (%r12,%rcx,1), %r13d movb %r13b, (%rsp,%rbx,1) @@ -2465,7 +2465,7 @@ L_AES_GCM_decrypt_calc_aad_loop: incl %ebx cmpl %edx, %ecx jl L_AES_GCM_decrypt_calc_aad_loop - movdqa (%rsp), %xmm8 + movdqu (%rsp), %xmm8 addq $16, %rsp pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 pxor %xmm8, %xmm6 @@ -2529,7 +2529,7 @@ L_AES_GCM_decrypt_calc_aad_done: movdqa %xmm5, %xmm9 paddd L_aes_gcm_one(%rip), %xmm4 movdqa %xmm5, %xmm8 - movdqa %xmm4, 128(%rsp) + movdqu %xmm4, 128(%rsp) psrlq $63, %xmm9 psllq $0x01, %xmm8 pslldq $8, %xmm9 @@ -2545,7 +2545,7 @@ L_AES_GCM_decrypt_calc_aad_done: andl $0xffffff80, %r13d movdqa %xmm6, %xmm2 # H ^ 1 - movdqa %xmm5, (%rsp) + movdqu %xmm5, (%rsp) # H ^ 2 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm5, %xmm10 @@ -2587,7 +2587,7 @@ L_AES_GCM_decrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm0 - movdqa %xmm0, 16(%rsp) + movdqu %xmm0, 16(%rsp) # H ^ 3 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -2629,7 +2629,7 @@ L_AES_GCM_decrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm1 - movdqa %xmm1, 32(%rsp) + movdqu %xmm1, 32(%rsp) # H ^ 4 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -2671,7 +2671,7 @@ L_AES_GCM_decrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm3 - movdqa %xmm3, 48(%rsp) + movdqu %xmm3, 48(%rsp) # H ^ 5 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -2713,7 +2713,7 @@ L_AES_GCM_decrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 64(%rsp) + movdqu %xmm7, 64(%rsp) # H ^ 6 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -2755,7 +2755,7 @@ L_AES_GCM_decrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 80(%rsp) + movdqu %xmm7, 80(%rsp) # H ^ 7 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -2797,7 +2797,7 @@ L_AES_GCM_decrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 96(%rsp) + movdqu %xmm7, 96(%rsp) # H ^ 8 pshufd $0x4e, %xmm3, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -2839,11 +2839,11 @@ L_AES_GCM_decrypt_calc_aad_done: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 112(%rsp) + 
movdqu %xmm7, 112(%rsp) L_AES_GCM_decrypt_ghash_128: leaq (%rdi,%rbx,1), %rcx leaq (%rsi,%rbx,1), %rdx - movdqa 128(%rsp), %xmm8 + movdqu 128(%rsp), %xmm8 movdqa L_aes_gcm_bswap_epi64(%rip), %xmm1 movdqa %xmm8, %xmm0 pshufb %xmm1, %xmm8 @@ -2870,7 +2870,7 @@ L_AES_GCM_decrypt_ghash_128: pshufb %xmm1, %xmm15 paddd L_aes_gcm_eight(%rip), %xmm0 movdqa (%r15), %xmm7 - movdqa %xmm0, 128(%rsp) + movdqu %xmm0, 128(%rsp) pxor %xmm7, %xmm8 pxor %xmm7, %xmm9 pxor %xmm7, %xmm10 @@ -2879,7 +2879,7 @@ L_AES_GCM_decrypt_ghash_128: pxor %xmm7, %xmm13 pxor %xmm7, %xmm14 pxor %xmm7, %xmm15 - movdqa 112(%rsp), %xmm7 + movdqu 112(%rsp), %xmm7 movdqu (%rcx), %xmm0 aesenc 16(%r15), %xmm8 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -2902,7 +2902,7 @@ L_AES_GCM_decrypt_ghash_128: aesenc 16(%r15), %xmm15 pxor %xmm2, %xmm1 pxor %xmm3, %xmm1 - movdqa 96(%rsp), %xmm7 + movdqu 96(%rsp), %xmm7 movdqu 16(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -2926,7 +2926,7 @@ L_AES_GCM_decrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 80(%rsp), %xmm7 + movdqu 80(%rsp), %xmm7 movdqu 32(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -2950,7 +2950,7 @@ L_AES_GCM_decrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 64(%rsp), %xmm7 + movdqu 64(%rsp), %xmm7 movdqu 48(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -2974,7 +2974,7 @@ L_AES_GCM_decrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 48(%rsp), %xmm7 + movdqu 48(%rsp), %xmm7 movdqu 64(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -2998,7 +2998,7 @@ L_AES_GCM_decrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 32(%rsp), %xmm7 + movdqu 32(%rsp), %xmm7 movdqu 80(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -3022,7 +3022,7 @@ L_AES_GCM_decrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 16(%rsp), %xmm7 + movdqu 16(%rsp), %xmm7 movdqu 96(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -3046,7 +3046,7 @@ L_AES_GCM_decrypt_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa (%rsp), %xmm7 + movdqu (%rsp), %xmm7 movdqu 112(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -3184,7 +3184,7 @@ L_AES_GCM_decrypt_aesenc_128_ghash_avx_done: cmpl %r13d, %ebx jl L_AES_GCM_decrypt_ghash_128 movdqa %xmm2, %xmm6 - movdqa (%rsp), %xmm5 + movdqu (%rsp), %xmm5 L_AES_GCM_decrypt_done_128: movl %r9d, %edx cmpl %edx, %ebx @@ -3200,12 +3200,12 @@ L_AES_GCM_decrypt_last_block_start: movdqa %xmm5, %xmm0 pshufb L_aes_gcm_bswap_mask(%rip), %xmm1 pxor %xmm6, %xmm1 - movdqa 128(%rsp), %xmm8 + movdqu 128(%rsp), %xmm8 movdqa %xmm8, %xmm9 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 paddd L_aes_gcm_one(%rip), %xmm9 pxor (%r15), %xmm8 - movdqa %xmm9, 128(%rsp) + movdqu %xmm9, 128(%rsp) movdqa %xmm1, %xmm10 pclmulqdq $16, %xmm0, %xmm10 aesenc 16(%r15), %xmm8 @@ -3264,7 +3264,7 @@ L_AES_GCM_decrypt_last_block_done: movl %ecx, %edx andl $15, %ecx jz L_AES_GCM_decrypt_aesenc_last15_dec_avx_done - movdqa 128(%rsp), %xmm4 + movdqu 128(%rsp), %xmm4 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm4 pxor (%r15), %xmm4 aesenc 16(%r15), %xmm4 @@ -3291,9 +3291,9 @@ L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last: aesenclast %xmm9, %xmm4 subq $32, %rsp xorl %ecx, %ecx - movdqa %xmm4, (%rsp) + movdqu 
%xmm4, (%rsp) pxor %xmm0, %xmm0 - movdqa %xmm0, 16(%rsp) + movdqu %xmm0, 16(%rsp) L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop: movzbl (%rdi,%rbx,1), %r13d movb %r13b, 16(%rsp,%rcx,1) @@ -3303,7 +3303,7 @@ L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop: incl %ecx cmpl %edx, %ebx jl L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop - movdqa 16(%rsp), %xmm4 + movdqu 16(%rsp), %xmm4 addq $32, %rsp pshufb L_aes_gcm_bswap_mask(%rip), %xmm4 pxor %xmm4, %xmm6 @@ -3397,14 +3397,14 @@ L_AES_GCM_decrypt_done_dec: pxor %xmm8, %xmm14 pxor %xmm14, %xmm6 pshufb L_aes_gcm_bswap_mask(%rip), %xmm6 - movdqa 144(%rsp), %xmm0 + movdqu 144(%rsp), %xmm0 pxor %xmm6, %xmm0 cmpl $16, %r14d je L_AES_GCM_decrypt_cmp_tag_16 subq $16, %rsp xorq %rcx, %rcx xorq %rbx, %rbx - movdqa %xmm0, (%rsp) + movdqu %xmm0, (%rsp) L_AES_GCM_decrypt_cmp_tag_loop: movzbl (%rsp,%rcx,1), %r13d xorb (%r8,%rcx,1), %r13b @@ -3520,7 +3520,7 @@ L_AES_GCM_init_aesni_calc_iv_12_last: aesenclast %xmm7, %xmm5 aesenclast %xmm7, %xmm1 pshufb L_aes_gcm_bswap_mask(%rip), %xmm5 - movdqa %xmm1, %xmm15 + movdqu %xmm1, %xmm15 jmp L_AES_GCM_init_aesni_iv_done L_AES_GCM_init_aesni_iv_not_12: # Calculate values when IV is not 12 bytes @@ -3625,7 +3625,7 @@ L_AES_GCM_init_aesni_calc_iv_lt16: subq $16, %rsp pxor %xmm8, %xmm8 xorl %r13d, %r13d - movdqa %xmm8, (%rsp) + movdqu %xmm8, (%rsp) L_AES_GCM_init_aesni_calc_iv_loop: movzbl (%r10,%rcx,1), %r12d movb %r12b, (%rsp,%r13,1) @@ -3633,7 +3633,7 @@ L_AES_GCM_init_aesni_calc_iv_loop: incl %r13d cmpl %edx, %ecx jl L_AES_GCM_init_aesni_calc_iv_loop - movdqa (%rsp), %xmm8 + movdqu (%rsp), %xmm8 addq $16, %rsp pshufb L_aes_gcm_bswap_mask(%rip), %xmm8 pxor %xmm8, %xmm4 @@ -3777,7 +3777,7 @@ L_AES_GCM_init_aesni_calc_iv_done: movdqa 224(%rdi), %xmm9 L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last: aesenclast %xmm9, %xmm8 - movdqa %xmm8, %xmm15 + movdqu %xmm8, %xmm15 L_AES_GCM_init_aesni_iv_done: movdqa %xmm15, (%rax) pshufb L_aes_gcm_bswap_epi64(%rip), %xmm4 @@ -3888,12 +3888,12 @@ _AES_GCM_encrypt_block_aesni: #endif /* __APPLE__ */ movq %rdx, %r10 movq %rcx, %r11 - movdqa (%r8), %xmm8 + movdqu (%r8), %xmm8 movdqa %xmm8, %xmm9 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 paddd L_aes_gcm_one(%rip), %xmm9 pxor (%rdi), %xmm8 - movdqa %xmm9, (%r8) + movdqu %xmm9, (%r8) aesenc 16(%rdi), %xmm8 aesenc 32(%rdi), %xmm8 aesenc 48(%rdi), %xmm8 @@ -4039,7 +4039,7 @@ _AES_GCM_encrypt_update_aesni: andl $0xffffff80, %r13d movdqa %xmm6, %xmm2 # H ^ 1 - movdqa %xmm5, (%rsp) + movdqu %xmm5, (%rsp) # H ^ 2 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm5, %xmm10 @@ -4081,7 +4081,7 @@ _AES_GCM_encrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm0 - movdqa %xmm0, 16(%rsp) + movdqu %xmm0, 16(%rsp) # H ^ 3 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -4123,7 +4123,7 @@ _AES_GCM_encrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm1 - movdqa %xmm1, 32(%rsp) + movdqu %xmm1, 32(%rsp) # H ^ 4 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -4165,7 +4165,7 @@ _AES_GCM_encrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm3 - movdqa %xmm3, 48(%rsp) + movdqu %xmm3, 48(%rsp) # H ^ 5 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -4207,7 +4207,7 @@ _AES_GCM_encrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 64(%rsp) + movdqu %xmm7, 64(%rsp) # H ^ 6 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -4249,7 +4249,7 @@ _AES_GCM_encrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, 
%xmm7 - movdqa %xmm7, 80(%rsp) + movdqu %xmm7, 80(%rsp) # H ^ 7 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -4291,7 +4291,7 @@ _AES_GCM_encrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 96(%rsp) + movdqu %xmm7, 96(%rsp) # H ^ 8 pshufd $0x4e, %xmm3, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -4333,9 +4333,9 @@ _AES_GCM_encrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 112(%rsp) + movdqu %xmm7, 112(%rsp) # First 128 bytes of input - movdqa (%r12), %xmm8 + movdqu (%r12), %xmm8 movdqa L_aes_gcm_bswap_epi64(%rip), %xmm1 movdqa %xmm8, %xmm0 pshufb %xmm1, %xmm8 @@ -4362,7 +4362,7 @@ _AES_GCM_encrypt_update_aesni: pshufb %xmm1, %xmm15 paddd L_aes_gcm_eight(%rip), %xmm0 movdqa (%rdi), %xmm7 - movdqa %xmm0, (%r12) + movdqu %xmm0, (%r12) pxor %xmm7, %xmm8 pxor %xmm7, %xmm9 pxor %xmm7, %xmm10 @@ -4533,7 +4533,7 @@ L_AES_GCM_encrypt_update_aesni_enc_done: L_AES_GCM_encrypt_update_aesni_ghash_128: leaq (%r11,%r14,1), %rcx leaq (%r10,%r14,1), %rdx - movdqa (%r12), %xmm8 + movdqu (%r12), %xmm8 movdqa L_aes_gcm_bswap_epi64(%rip), %xmm1 movdqa %xmm8, %xmm0 pshufb %xmm1, %xmm8 @@ -4560,7 +4560,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pshufb %xmm1, %xmm15 paddd L_aes_gcm_eight(%rip), %xmm0 movdqa (%rdi), %xmm7 - movdqa %xmm0, (%r12) + movdqu %xmm0, (%r12) pxor %xmm7, %xmm8 pxor %xmm7, %xmm9 pxor %xmm7, %xmm10 @@ -4569,7 +4569,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor %xmm7, %xmm13 pxor %xmm7, %xmm14 pxor %xmm7, %xmm15 - movdqa 112(%rsp), %xmm7 + movdqu 112(%rsp), %xmm7 movdqu -128(%rdx), %xmm0 aesenc 16(%rdi), %xmm8 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4592,7 +4592,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: aesenc 16(%rdi), %xmm15 pxor %xmm2, %xmm1 pxor %xmm3, %xmm1 - movdqa 96(%rsp), %xmm7 + movdqu 96(%rsp), %xmm7 movdqu -112(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4616,7 +4616,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 80(%rsp), %xmm7 + movdqu 80(%rsp), %xmm7 movdqu -96(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4640,7 +4640,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 64(%rsp), %xmm7 + movdqu 64(%rsp), %xmm7 movdqu -80(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4664,7 +4664,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 48(%rsp), %xmm7 + movdqu 48(%rsp), %xmm7 movdqu -64(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4688,7 +4688,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 32(%rsp), %xmm7 + movdqu 32(%rsp), %xmm7 movdqu -48(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4712,7 +4712,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 16(%rsp), %xmm7 + movdqu 16(%rsp), %xmm7 movdqu -32(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4736,7 +4736,7 @@ L_AES_GCM_encrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa (%rsp), %xmm7 + movdqu (%rsp), %xmm7 movdqu -16(%rdx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -4884,7 +4884,7 @@ 
L_AES_GCM_encrypt_update_aesni_end_128: pshufb %xmm4, %xmm13 pshufb %xmm4, %xmm14 pshufb %xmm4, %xmm15 - movdqa 112(%rsp), %xmm7 + movdqu 112(%rsp), %xmm7 pshufd $0x4e, %xmm8, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -4903,7 +4903,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 96(%rsp), %xmm7 + movdqu 96(%rsp), %xmm7 pshufd $0x4e, %xmm9, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -4922,7 +4922,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 80(%rsp), %xmm7 + movdqu 80(%rsp), %xmm7 pshufd $0x4e, %xmm10, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -4941,7 +4941,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 64(%rsp), %xmm7 + movdqu 64(%rsp), %xmm7 pshufd $0x4e, %xmm11, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -4960,7 +4960,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 48(%rsp), %xmm7 + movdqu 48(%rsp), %xmm7 pshufd $0x4e, %xmm12, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -4979,7 +4979,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 32(%rsp), %xmm7 + movdqu 32(%rsp), %xmm7 pshufd $0x4e, %xmm13, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -4998,7 +4998,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa 16(%rsp), %xmm7 + movdqu 16(%rsp), %xmm7 pshufd $0x4e, %xmm14, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -5017,7 +5017,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: psrldq $8, %xmm1 pxor %xmm2, %xmm4 pxor %xmm1, %xmm6 - movdqa (%rsp), %xmm7 + movdqu (%rsp), %xmm7 pshufd $0x4e, %xmm15, %xmm1 pshufd $0x4e, %xmm7, %xmm2 movdqa %xmm7, %xmm3 @@ -5059,7 +5059,7 @@ L_AES_GCM_encrypt_update_aesni_end_128: pxor %xmm1, %xmm2 pxor %xmm4, %xmm2 pxor %xmm2, %xmm6 - movdqa (%rsp), %xmm5 + movdqu (%rsp), %xmm5 L_AES_GCM_encrypt_update_aesni_done_128: movl %r8d, %edx cmpl %edx, %r14d @@ -5070,12 +5070,12 @@ L_AES_GCM_encrypt_update_aesni_done_128: jge L_AES_GCM_encrypt_update_aesni_last_block_done leaq (%r11,%r14,1), %rcx leaq (%r10,%r14,1), %rdx - movdqa (%r12), %xmm8 + movdqu (%r12), %xmm8 movdqa %xmm8, %xmm9 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 paddd L_aes_gcm_one(%rip), %xmm9 pxor (%rdi), %xmm8 - movdqa %xmm9, (%r12) + movdqu %xmm9, (%r12) aesenc 16(%rdi), %xmm8 aesenc 32(%rdi), %xmm8 aesenc 48(%rdi), %xmm8 @@ -5109,12 +5109,12 @@ L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last: L_AES_GCM_encrypt_update_aesni_last_block_start: leaq (%r11,%r14,1), %rcx leaq (%r10,%r14,1), %rdx - movdqa (%r12), %xmm8 + movdqu (%r12), %xmm8 movdqa %xmm8, %xmm9 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 paddd L_aes_gcm_one(%rip), %xmm9 pxor (%rdi), %xmm8 - movdqa %xmm9, (%r12) + movdqu %xmm9, (%r12) movdqa %xmm6, %xmm10 pclmulqdq $16, %xmm5, %xmm10 aesenc 16(%rdi), %xmm8 @@ -5235,7 +5235,7 @@ AES_GCM_encrypt_final_aesni: _AES_GCM_encrypt_final_aesni: #endif /* __APPLE__ */ pushq %r13 - movq %rdx, %rax + movl %edx, %eax movl %ecx, %r10d movl %r8d, %r11d movq 16(%rsp), %r8 @@ -5301,12 +5301,12 @@ _AES_GCM_encrypt_final_aesni: pxor %xmm8, %xmm14 pxor %xmm14, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm4 - movdqa %xmm6, %xmm0 + movdqu %xmm6, %xmm0 pxor %xmm4, %xmm0 cmpl $16, %eax je L_AES_GCM_encrypt_final_aesni_store_tag_16 xorq %rcx, %rcx - movdqa %xmm0, (%rsp) + movdqu 
%xmm0, (%rsp) L_AES_GCM_encrypt_final_aesni_store_tag_loop: movzbl (%rsp,%rcx,1), %r13d movb %r13b, (%rsi,%rcx,1) @@ -5363,7 +5363,7 @@ _AES_GCM_decrypt_update_aesni: andl $0xffffff80, %r13d movdqa %xmm6, %xmm2 # H ^ 1 - movdqa %xmm5, (%rsp) + movdqu %xmm5, (%rsp) # H ^ 2 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm5, %xmm10 @@ -5405,7 +5405,7 @@ _AES_GCM_decrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm0 - movdqa %xmm0, 16(%rsp) + movdqu %xmm0, 16(%rsp) # H ^ 3 pshufd $0x4e, %xmm5, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -5447,7 +5447,7 @@ _AES_GCM_decrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm1 - movdqa %xmm1, 32(%rsp) + movdqu %xmm1, 32(%rsp) # H ^ 4 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm0, %xmm10 @@ -5489,7 +5489,7 @@ _AES_GCM_decrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm3 - movdqa %xmm3, 48(%rsp) + movdqu %xmm3, 48(%rsp) # H ^ 5 pshufd $0x4e, %xmm0, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -5531,7 +5531,7 @@ _AES_GCM_decrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 64(%rsp) + movdqu %xmm7, 64(%rsp) # H ^ 6 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm1, %xmm10 @@ -5573,7 +5573,7 @@ _AES_GCM_decrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 80(%rsp) + movdqu %xmm7, 80(%rsp) # H ^ 7 pshufd $0x4e, %xmm1, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -5615,7 +5615,7 @@ _AES_GCM_decrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 96(%rsp) + movdqu %xmm7, 96(%rsp) # H ^ 8 pshufd $0x4e, %xmm3, %xmm9 pshufd $0x4e, %xmm3, %xmm10 @@ -5657,11 +5657,11 @@ _AES_GCM_decrypt_update_aesni: pxor %xmm13, %xmm14 pxor %xmm8, %xmm14 pxor %xmm14, %xmm7 - movdqa %xmm7, 112(%rsp) + movdqu %xmm7, 112(%rsp) L_AES_GCM_decrypt_update_aesni_ghash_128: leaq (%r11,%r14,1), %rcx leaq (%r10,%r14,1), %rdx - movdqa (%r12), %xmm8 + movdqu (%r12), %xmm8 movdqa L_aes_gcm_bswap_epi64(%rip), %xmm1 movdqa %xmm8, %xmm0 pshufb %xmm1, %xmm8 @@ -5688,7 +5688,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pshufb %xmm1, %xmm15 paddd L_aes_gcm_eight(%rip), %xmm0 movdqa (%rdi), %xmm7 - movdqa %xmm0, (%r12) + movdqu %xmm0, (%r12) pxor %xmm7, %xmm8 pxor %xmm7, %xmm9 pxor %xmm7, %xmm10 @@ -5697,7 +5697,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor %xmm7, %xmm13 pxor %xmm7, %xmm14 pxor %xmm7, %xmm15 - movdqa 112(%rsp), %xmm7 + movdqu 112(%rsp), %xmm7 movdqu (%rcx), %xmm0 aesenc 16(%rdi), %xmm8 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -5720,7 +5720,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: aesenc 16(%rdi), %xmm15 pxor %xmm2, %xmm1 pxor %xmm3, %xmm1 - movdqa 96(%rsp), %xmm7 + movdqu 96(%rsp), %xmm7 movdqu 16(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -5744,7 +5744,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 80(%rsp), %xmm7 + movdqu 80(%rsp), %xmm7 movdqu 32(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -5768,7 +5768,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 64(%rsp), %xmm7 + movdqu 64(%rsp), %xmm7 movdqu 48(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -5792,7 +5792,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 48(%rsp), %xmm7 + movdqu 48(%rsp), %xmm7 movdqu 64(%rcx), %xmm0 pshufd $0x4e, %xmm7, 
%xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -5816,7 +5816,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 32(%rsp), %xmm7 + movdqu 32(%rsp), %xmm7 movdqu 80(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -5840,7 +5840,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa 16(%rsp), %xmm7 + movdqu 16(%rsp), %xmm7 movdqu 96(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -5864,7 +5864,7 @@ L_AES_GCM_decrypt_update_aesni_ghash_128: pxor %xmm6, %xmm1 pxor %xmm6, %xmm3 pxor %xmm4, %xmm1 - movdqa (%rsp), %xmm7 + movdqu (%rsp), %xmm7 movdqu 112(%rcx), %xmm0 pshufd $0x4e, %xmm7, %xmm4 pshufb L_aes_gcm_bswap_mask(%rip), %xmm0 @@ -6002,7 +6002,7 @@ L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done: cmpl %r13d, %r14d jl L_AES_GCM_decrypt_update_aesni_ghash_128 movdqa %xmm2, %xmm6 - movdqa (%rsp), %xmm5 + movdqu (%rsp), %xmm5 L_AES_GCM_decrypt_update_aesni_done_128: movl %r8d, %edx cmpl %edx, %r14d @@ -6018,12 +6018,12 @@ L_AES_GCM_decrypt_update_aesni_last_block_start: movdqa %xmm5, %xmm0 pshufb L_aes_gcm_bswap_mask(%rip), %xmm1 pxor %xmm6, %xmm1 - movdqa (%r12), %xmm8 + movdqu (%r12), %xmm8 movdqa %xmm8, %xmm9 pshufb L_aes_gcm_bswap_epi64(%rip), %xmm8 paddd L_aes_gcm_one(%rip), %xmm9 pxor (%rdi), %xmm8 - movdqa %xmm9, (%r12) + movdqu %xmm9, (%r12) movdqa %xmm1, %xmm10 pclmulqdq $16, %xmm0, %xmm10 aesenc 16(%rdi), %xmm8 @@ -6104,7 +6104,7 @@ _AES_GCM_decrypt_final_aesni: pushq %r13 pushq %rbp pushq %r12 - movq %rdx, %rax + movl %edx, %eax movl %ecx, %r10d movl %r8d, %r11d movq 32(%rsp), %r8 @@ -6171,14 +6171,14 @@ _AES_GCM_decrypt_final_aesni: pxor %xmm8, %xmm14 pxor %xmm14, %xmm6 pshufb L_aes_gcm_bswap_mask(%rip), %xmm6 - movdqa %xmm15, %xmm0 + movdqu %xmm15, %xmm0 pxor %xmm6, %xmm0 cmpl $16, %eax je L_AES_GCM_decrypt_final_aesni_cmp_tag_16 subq $16, %rsp xorq %rcx, %rcx xorq %r12, %r12 - movdqa %xmm0, (%rsp) + movdqu %xmm0, (%rsp) L_AES_GCM_decrypt_final_aesni_cmp_tag_loop: movzbl (%rsp,%rcx,1), %r13d xorb (%rsi,%rcx,1), %r13b @@ -6376,7 +6376,7 @@ _AES_GCM_encrypt_avx1: # # Calculate values when IV is 12 bytes # Set counter based on IV movl $0x1000000, %ecx - vpinsrq $0x00, (%rax), %xmm4, %xmm4 + vmovq (%rax), %xmm4 vpinsrd $2, 8(%rax), %xmm4, %xmm4 vpinsrd $3, %ecx, %xmm4, %xmm4 # H = Encrypt X(=0) and T = Encrypt counter @@ -6430,7 +6430,7 @@ L_AES_GCM_encrypt_avx1_calc_iv_12_last: vaesenclast %xmm7, %xmm5, %xmm5 vaesenclast %xmm7, %xmm1, %xmm1 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 - vmovdqa %xmm1, 144(%rsp) + vmovdqu %xmm1, 144(%rsp) jmp L_AES_GCM_encrypt_avx1_iv_done L_AES_GCM_encrypt_avx1_iv_not_12: # Calculate values when IV is not 12 bytes @@ -6583,7 +6583,7 @@ L_AES_GCM_encrypt_avx1_calc_iv_done: # T = Encrypt counter vpxor %xmm0, %xmm0, %xmm0 shll $3, %edx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_avx vpshufd $0x4e, %xmm4, %xmm1 @@ -6654,7 +6654,7 @@ L_AES_GCM_encrypt_avx1_calc_iv_done: vmovdqa 224(%r15), %xmm9 L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last: vaesenclast %xmm9, %xmm8, %xmm8 - vmovdqa %xmm8, 144(%rsp) + vmovdqu %xmm8, 144(%rsp) L_AES_GCM_encrypt_avx1_iv_done: # Additional authentication data movl %r11d, %edx @@ -6788,7 +6788,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpand L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 vpaddd L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4 vpxor %xmm8, %xmm5, %xmm5 - vmovdqa %xmm4, 
128(%rsp) + vmovdqu %xmm4, 128(%rsp) xorl %ebx, %ebx cmpl $0x80, %r9d movl %r9d, %r13d @@ -6796,7 +6796,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: andl $0xffffff80, %r13d vmovdqa %xmm6, %xmm2 # H ^ 1 - vmovdqa %xmm5, (%rsp) + vmovdqu %xmm5, (%rsp) # H ^ 2 vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 vpclmulqdq $0x11, %xmm5, %xmm5, %xmm0 @@ -6816,7 +6816,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm0, %xmm0 - vmovdqa %xmm0, 16(%rsp) + vmovdqu %xmm0, 16(%rsp) # H ^ 3 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -6848,7 +6848,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm1, %xmm1 - vmovdqa %xmm1, 32(%rsp) + vmovdqu %xmm1, 32(%rsp) # H ^ 4 vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 vpclmulqdq $0x11, %xmm0, %xmm0, %xmm3 @@ -6868,7 +6868,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm3, %xmm3 - vmovdqa %xmm3, 48(%rsp) + vmovdqu %xmm3, 48(%rsp) # H ^ 5 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm0, %xmm9 @@ -6900,7 +6900,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 64(%rsp) + vmovdqu %xmm7, 64(%rsp) # H ^ 6 vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 vpclmulqdq $0x11, %xmm1, %xmm1, %xmm7 @@ -6920,7 +6920,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 80(%rsp) + vmovdqu %xmm7, 80(%rsp) # H ^ 7 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm1, %xmm9 @@ -6952,7 +6952,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 96(%rsp) + vmovdqu %xmm7, 96(%rsp) # H ^ 8 vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 vpclmulqdq $0x11, %xmm3, %xmm3, %xmm7 @@ -6972,9 +6972,9 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 112(%rsp) + vmovdqu %xmm7, 112(%rsp) # First 128 bytes of input - vmovdqa 128(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm0 vmovdqa L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1 vpshufb %xmm1, %xmm0, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9 @@ -6993,7 +6993,7 @@ L_AES_GCM_encrypt_avx1_calc_aad_done: vpshufb %xmm1, %xmm15, %xmm15 vpaddd L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0 vmovdqa (%r15), %xmm7 - vmovdqa %xmm0, 128(%rsp) + vmovdqu %xmm0, 128(%rsp) vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm7, %xmm9, %xmm9 vpxor %xmm7, %xmm10, %xmm10 @@ -7164,7 +7164,7 @@ L_AES_GCM_encrypt_avx1_aesenc_128_enc_done: L_AES_GCM_encrypt_avx1_ghash_128: leaq (%rdi,%rbx,1), %rcx leaq (%rsi,%rbx,1), %rdx - vmovdqa 128(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm0 vmovdqa L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1 vpshufb %xmm1, %xmm0, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9 @@ -7183,7 +7183,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpshufb %xmm1, %xmm15, %xmm15 vpaddd L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0 vmovdqa (%r15), %xmm7 - vmovdqa %xmm0, 128(%rsp) + vmovdqu %xmm0, 128(%rsp) vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm7, %xmm9, %xmm9 vpxor %xmm7, %xmm10, %xmm10 @@ -7192,7 +7192,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpxor %xmm7, %xmm13, %xmm13 vpxor %xmm7, %xmm14, %xmm14 vpxor %xmm7, %xmm15, %xmm15 - vmovdqa 112(%rsp), %xmm7 + vmovdqu 112(%rsp), %xmm7 vmovdqu -128(%rdx), %xmm0 vaesenc 16(%r15), %xmm8, %xmm8 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ 
-7213,7 +7213,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vaesenc 16(%r15), %xmm15, %xmm15 vpxor %xmm2, %xmm1, %xmm1 vpxor %xmm3, %xmm1, %xmm1 - vmovdqa 96(%rsp), %xmm7 + vmovdqu 96(%rsp), %xmm7 vmovdqu -112(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -7236,7 +7236,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 80(%rsp), %xmm7 + vmovdqu 80(%rsp), %xmm7 vmovdqu -96(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -7259,7 +7259,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 64(%rsp), %xmm7 + vmovdqu 64(%rsp), %xmm7 vmovdqu -80(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -7282,7 +7282,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 48(%rsp), %xmm7 + vmovdqu 48(%rsp), %xmm7 vmovdqu -64(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -7305,7 +7305,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 32(%rsp), %xmm7 + vmovdqu 32(%rsp), %xmm7 vmovdqu -48(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -7328,7 +7328,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 16(%rsp), %xmm7 + vmovdqu 16(%rsp), %xmm7 vmovdqu -32(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -7351,7 +7351,7 @@ L_AES_GCM_encrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa (%rsp), %xmm7 + vmovdqu (%rsp), %xmm7 vmovdqu -16(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -7490,8 +7490,8 @@ L_AES_GCM_encrypt_avx1_end_128: vpshufb %xmm4, %xmm13, %xmm13 vpshufb %xmm4, %xmm14, %xmm14 vpshufb %xmm4, %xmm15, %xmm15 - vmovdqa (%rsp), %xmm7 - vmovdqa 16(%rsp), %xmm5 + vmovdqu (%rsp), %xmm7 + vmovdqu 16(%rsp), %xmm5 # ghash_gfmul_avx vpshufd $0x4e, %xmm15, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ -7524,8 +7524,8 @@ L_AES_GCM_encrypt_avx1_end_128: vpsrldq $8, %xmm1, %xmm1 vpxor %xmm2, %xmm4, %xmm4 vpxor %xmm1, %xmm6, %xmm6 - vmovdqa 32(%rsp), %xmm7 - vmovdqa 48(%rsp), %xmm5 + vmovdqu 32(%rsp), %xmm7 + vmovdqu 48(%rsp), %xmm5 # ghash_gfmul_xor_avx vpshufd $0x4e, %xmm13, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ -7558,8 +7558,8 @@ L_AES_GCM_encrypt_avx1_end_128: vpsrldq $8, %xmm1, %xmm1 vpxor %xmm2, %xmm4, %xmm4 vpxor %xmm1, %xmm6, %xmm6 - vmovdqa 64(%rsp), %xmm7 - vmovdqa 80(%rsp), %xmm5 + vmovdqu 64(%rsp), %xmm7 + vmovdqu 80(%rsp), %xmm5 # ghash_gfmul_xor_avx vpshufd $0x4e, %xmm11, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ -7592,8 +7592,8 @@ L_AES_GCM_encrypt_avx1_end_128: vpsrldq $8, %xmm1, %xmm1 vpxor %xmm2, %xmm4, %xmm4 vpxor %xmm1, %xmm6, %xmm6 - vmovdqa 96(%rsp), %xmm7 - vmovdqa 112(%rsp), %xmm5 + vmovdqu 96(%rsp), %xmm7 + vmovdqu 112(%rsp), %xmm5 # ghash_gfmul_xor_avx vpshufd $0x4e, %xmm9, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ -7643,7 +7643,7 @@ L_AES_GCM_encrypt_avx1_end_128: vpxor %xmm1, %xmm2, %xmm2 vpxor %xmm4, %xmm2, %xmm2 vpxor %xmm2, %xmm6, %xmm6 - vmovdqa (%rsp), %xmm5 + vmovdqu (%rsp), %xmm5 L_AES_GCM_encrypt_avx1_done_128: movl %r9d, %edx cmpl %edx, 
%ebx @@ -7652,10 +7652,10 @@ L_AES_GCM_encrypt_avx1_done_128: andl $0xfffffff0, %r13d cmpl %r13d, %ebx jge L_AES_GCM_encrypt_avx1_last_block_done - vmovdqa 128(%rsp), %xmm9 + vmovdqu 128(%rsp), %xmm9 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqa %xmm9, 128(%rsp) + vmovdqu %xmm9, 128(%rsp) vpxor (%r15), %xmm8, %xmm8 vaesenc 16(%r15), %xmm8, %xmm8 vaesenc 32(%r15), %xmm8, %xmm8 @@ -7689,10 +7689,10 @@ L_AES_GCM_encrypt_avx1_aesenc_block_last: jge L_AES_GCM_encrypt_avx1_last_block_ghash L_AES_GCM_encrypt_avx1_last_block_start: vmovdqu (%rdi,%rbx,1), %xmm13 - vmovdqa 128(%rsp), %xmm9 + vmovdqu 128(%rsp), %xmm9 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqa %xmm9, 128(%rsp) + vmovdqu %xmm9, 128(%rsp) vpxor (%r15), %xmm8, %xmm8 vpclmulqdq $16, %xmm5, %xmm6, %xmm10 vaesenc 16(%r15), %xmm8, %xmm8 @@ -7777,7 +7777,7 @@ L_AES_GCM_encrypt_avx1_last_block_done: movl %ecx, %edx andl $15, %ecx jz L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done - vmovdqa 128(%rsp), %xmm4 + vmovdqu 128(%rsp), %xmm4 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 vpxor (%r15), %xmm4, %xmm4 vaesenc 16(%r15), %xmm4, %xmm4 @@ -7863,8 +7863,9 @@ L_AES_GCM_encrypt_avx1_done_enc: movl %r11d, %ecx shlq $3, %rdx shlq $3, %rcx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 - vpinsrq $0x01, %rcx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm0, %xmm6, %xmm6 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -7897,7 +7898,8 @@ L_AES_GCM_encrypt_avx1_done_enc: vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm6, %xmm6 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 - vpxor 144(%rsp), %xmm6, %xmm0 + vmovdqu 144(%rsp), %xmm0 + vpxor %xmm6, %xmm0, %xmm0 cmpl $16, %r14d je L_AES_GCM_encrypt_avx1_store_tag_16 xorq %rcx, %rcx @@ -7958,7 +7960,7 @@ _AES_GCM_decrypt_avx1: # # Calculate values when IV is 12 bytes # Set counter based on IV movl $0x1000000, %ecx - vpinsrq $0x00, (%rax), %xmm4, %xmm4 + vmovq (%rax), %xmm4 vpinsrd $2, 8(%rax), %xmm4, %xmm4 vpinsrd $3, %ecx, %xmm4, %xmm4 # H = Encrypt X(=0) and T = Encrypt counter @@ -8012,7 +8014,7 @@ L_AES_GCM_decrypt_avx1_calc_iv_12_last: vaesenclast %xmm7, %xmm5, %xmm5 vaesenclast %xmm7, %xmm1, %xmm1 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 - vmovdqa %xmm1, 144(%rsp) + vmovdqu %xmm1, 144(%rsp) jmp L_AES_GCM_decrypt_avx1_iv_done L_AES_GCM_decrypt_avx1_iv_not_12: # Calculate values when IV is not 12 bytes @@ -8165,7 +8167,7 @@ L_AES_GCM_decrypt_avx1_calc_iv_done: # T = Encrypt counter vpxor %xmm0, %xmm0, %xmm0 shll $3, %edx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_avx vpshufd $0x4e, %xmm4, %xmm1 @@ -8236,7 +8238,7 @@ L_AES_GCM_decrypt_avx1_calc_iv_done: vmovdqa 224(%r15), %xmm9 L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last: vaesenclast %xmm9, %xmm8, %xmm8 - vmovdqa %xmm8, 144(%rsp) + vmovdqu %xmm8, 144(%rsp) L_AES_GCM_decrypt_avx1_iv_done: # Additional authentication data movl %r11d, %edx @@ -8370,7 +8372,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpand L_avx1_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 vpaddd L_avx1_aes_gcm_one(%rip), %xmm4, %xmm4 vpxor %xmm8, %xmm5, %xmm5 - vmovdqa %xmm4, 128(%rsp) + vmovdqu %xmm4, 128(%rsp) xorl %ebx, %ebx cmpl $0x80, %r9d movl %r9d, %r13d @@ -8378,7 +8380,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: andl $0xffffff80, %r13d vmovdqa %xmm6, %xmm2 # H ^ 1 - vmovdqa %xmm5, (%rsp) + vmovdqu %xmm5, (%rsp) # H ^ 2 vpclmulqdq 
$0x00, %xmm5, %xmm5, %xmm8 vpclmulqdq $0x11, %xmm5, %xmm5, %xmm0 @@ -8398,7 +8400,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm0, %xmm0 - vmovdqa %xmm0, 16(%rsp) + vmovdqu %xmm0, 16(%rsp) # H ^ 3 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -8430,7 +8432,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm1, %xmm1 - vmovdqa %xmm1, 32(%rsp) + vmovdqu %xmm1, 32(%rsp) # H ^ 4 vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 vpclmulqdq $0x11, %xmm0, %xmm0, %xmm3 @@ -8450,7 +8452,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm3, %xmm3 - vmovdqa %xmm3, 48(%rsp) + vmovdqu %xmm3, 48(%rsp) # H ^ 5 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm0, %xmm9 @@ -8482,7 +8484,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 64(%rsp) + vmovdqu %xmm7, 64(%rsp) # H ^ 6 vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 vpclmulqdq $0x11, %xmm1, %xmm1, %xmm7 @@ -8502,7 +8504,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 80(%rsp) + vmovdqu %xmm7, 80(%rsp) # H ^ 7 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm1, %xmm9 @@ -8534,7 +8536,7 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 96(%rsp) + vmovdqu %xmm7, 96(%rsp) # H ^ 8 vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 vpclmulqdq $0x11, %xmm3, %xmm3, %xmm7 @@ -8554,11 +8556,11 @@ L_AES_GCM_decrypt_avx1_calc_aad_done: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 112(%rsp) + vmovdqu %xmm7, 112(%rsp) L_AES_GCM_decrypt_avx1_ghash_128: leaq (%rdi,%rbx,1), %rcx leaq (%rsi,%rbx,1), %rdx - vmovdqa 128(%rsp), %xmm0 + vmovdqu 128(%rsp), %xmm0 vmovdqa L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1 vpshufb %xmm1, %xmm0, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9 @@ -8577,7 +8579,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpshufb %xmm1, %xmm15, %xmm15 vpaddd L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0 vmovdqa (%r15), %xmm7 - vmovdqa %xmm0, 128(%rsp) + vmovdqu %xmm0, 128(%rsp) vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm7, %xmm9, %xmm9 vpxor %xmm7, %xmm10, %xmm10 @@ -8586,7 +8588,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpxor %xmm7, %xmm13, %xmm13 vpxor %xmm7, %xmm14, %xmm14 vpxor %xmm7, %xmm15, %xmm15 - vmovdqa 112(%rsp), %xmm7 + vmovdqu 112(%rsp), %xmm7 vmovdqu (%rcx), %xmm0 vaesenc 16(%r15), %xmm8, %xmm8 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8607,7 +8609,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vaesenc 16(%r15), %xmm15, %xmm15 vpxor %xmm2, %xmm1, %xmm1 vpxor %xmm3, %xmm1, %xmm1 - vmovdqa 96(%rsp), %xmm7 + vmovdqu 96(%rsp), %xmm7 vmovdqu 16(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8630,7 +8632,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 80(%rsp), %xmm7 + vmovdqu 80(%rsp), %xmm7 vmovdqu 32(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8653,7 +8655,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 64(%rsp), %xmm7 + vmovdqu 64(%rsp), %xmm7 vmovdqu 48(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb 
L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8676,7 +8678,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 48(%rsp), %xmm7 + vmovdqu 48(%rsp), %xmm7 vmovdqu 64(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8699,7 +8701,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 32(%rsp), %xmm7 + vmovdqu 32(%rsp), %xmm7 vmovdqu 80(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8722,7 +8724,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 16(%rsp), %xmm7 + vmovdqu 16(%rsp), %xmm7 vmovdqu 96(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8745,7 +8747,7 @@ L_AES_GCM_decrypt_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa (%rsp), %xmm7 + vmovdqu (%rsp), %xmm7 vmovdqu 112(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -8874,7 +8876,7 @@ L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done: cmpl %r13d, %ebx jl L_AES_GCM_decrypt_avx1_ghash_128 vmovdqa %xmm2, %xmm6 - vmovdqa (%rsp), %xmm5 + vmovdqu (%rsp), %xmm5 L_AES_GCM_decrypt_avx1_done_128: movl %r9d, %edx cmpl %edx, %ebx @@ -8888,10 +8890,10 @@ L_AES_GCM_decrypt_avx1_last_block_start: vmovdqa %xmm5, %xmm0 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1 vpxor %xmm6, %xmm1, %xmm1 - vmovdqa 128(%rsp), %xmm9 + vmovdqu 128(%rsp), %xmm9 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqa %xmm9, 128(%rsp) + vmovdqu %xmm9, 128(%rsp) vpxor (%r15), %xmm8, %xmm8 vpclmulqdq $16, %xmm0, %xmm1, %xmm10 vaesenc 16(%r15), %xmm8, %xmm8 @@ -8943,7 +8945,7 @@ L_AES_GCM_decrypt_avx1_last_block_done: movl %ecx, %edx andl $15, %ecx jz L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done - vmovdqa 128(%rsp), %xmm4 + vmovdqu 128(%rsp), %xmm4 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 vpxor (%r15), %xmm4, %xmm4 vaesenc 16(%r15), %xmm4, %xmm4 @@ -9022,8 +9024,9 @@ L_AES_GCM_decrypt_avx1_done_dec: movl %r11d, %ecx shlq $3, %rdx shlq $3, %rcx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 - vpinsrq $0x01, %rcx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm0, %xmm6, %xmm6 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -9056,7 +9059,8 @@ L_AES_GCM_decrypt_avx1_done_dec: vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm6, %xmm6 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm6, %xmm6 - vpxor 144(%rsp), %xmm6, %xmm0 + vmovdqu 144(%rsp), %xmm0 + vpxor %xmm6, %xmm0, %xmm0 cmpl $16, %r14d je L_AES_GCM_decrypt_avx1_cmp_tag_16 subq $16, %rsp @@ -9123,7 +9127,7 @@ _AES_GCM_init_avx1: # # Calculate values when IV is 12 bytes # Set counter based on IV movl $0x1000000, %ecx - vpinsrq $0x00, (%r10), %xmm4, %xmm4 + vmovq (%r10), %xmm4 vpinsrd $2, 8(%r10), %xmm4, %xmm4 vpinsrd $3, %ecx, %xmm4, %xmm4 # H = Encrypt X(=0) and T = Encrypt counter @@ -9177,7 +9181,7 @@ L_AES_GCM_init_avx1_calc_iv_12_last: vaesenclast %xmm7, %xmm5, %xmm5 vaesenclast %xmm7, %xmm1, %xmm1 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm5, %xmm5 - vmovdqa %xmm1, %xmm15 + vmovdqu %xmm1, %xmm15 jmp L_AES_GCM_init_avx1_iv_done L_AES_GCM_init_avx1_iv_not_12: # Calculate values when IV is not 12 bytes @@ -9330,7 +9334,7 @@ 
L_AES_GCM_init_avx1_calc_iv_done: # T = Encrypt counter vpxor %xmm0, %xmm0, %xmm0 shll $3, %edx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_avx vpshufd $0x4e, %xmm4, %xmm1 @@ -9401,7 +9405,7 @@ L_AES_GCM_init_avx1_calc_iv_done: vmovdqa 224(%rdi), %xmm9 L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last: vaesenclast %xmm9, %xmm8, %xmm8 - vmovdqa %xmm8, %xmm15 + vmovdqu %xmm8, %xmm15 L_AES_GCM_init_avx1_iv_done: vmovdqa %xmm15, (%rax) vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm4, %xmm4 @@ -9502,10 +9506,10 @@ _AES_GCM_encrypt_block_avx1: #endif /* __APPLE__ */ movq %rdx, %r10 movq %rcx, %r11 - vmovdqa (%r8), %xmm9 + vmovdqu (%r8), %xmm9 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqa %xmm9, (%r8) + vmovdqu %xmm9, (%r8) vpxor (%rdi), %xmm8, %xmm8 vaesenc 16(%rdi), %xmm8, %xmm8 vaesenc 32(%rdi), %xmm8, %xmm8 @@ -9641,7 +9645,7 @@ _AES_GCM_encrypt_update_avx1: andl $0xffffff80, %r13d vmovdqa %xmm6, %xmm2 # H ^ 1 - vmovdqa %xmm5, (%rsp) + vmovdqu %xmm5, (%rsp) # H ^ 2 vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 vpclmulqdq $0x11, %xmm5, %xmm5, %xmm0 @@ -9661,7 +9665,7 @@ _AES_GCM_encrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm0, %xmm0 - vmovdqa %xmm0, 16(%rsp) + vmovdqu %xmm0, 16(%rsp) # H ^ 3 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -9693,7 +9697,7 @@ _AES_GCM_encrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm1, %xmm1 - vmovdqa %xmm1, 32(%rsp) + vmovdqu %xmm1, 32(%rsp) # H ^ 4 vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 vpclmulqdq $0x11, %xmm0, %xmm0, %xmm3 @@ -9713,7 +9717,7 @@ _AES_GCM_encrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm3, %xmm3 - vmovdqa %xmm3, 48(%rsp) + vmovdqu %xmm3, 48(%rsp) # H ^ 5 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm0, %xmm9 @@ -9745,7 +9749,7 @@ _AES_GCM_encrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 64(%rsp) + vmovdqu %xmm7, 64(%rsp) # H ^ 6 vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 vpclmulqdq $0x11, %xmm1, %xmm1, %xmm7 @@ -9765,7 +9769,7 @@ _AES_GCM_encrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 80(%rsp) + vmovdqu %xmm7, 80(%rsp) # H ^ 7 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm1, %xmm9 @@ -9797,7 +9801,7 @@ _AES_GCM_encrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 96(%rsp) + vmovdqu %xmm7, 96(%rsp) # H ^ 8 vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 vpclmulqdq $0x11, %xmm3, %xmm3, %xmm7 @@ -9817,9 +9821,9 @@ _AES_GCM_encrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 112(%rsp) + vmovdqu %xmm7, 112(%rsp) # First 128 bytes of input - vmovdqa (%r12), %xmm0 + vmovdqu (%r12), %xmm0 vmovdqa L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1 vpshufb %xmm1, %xmm0, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9 @@ -9838,7 +9842,7 @@ _AES_GCM_encrypt_update_avx1: vpshufb %xmm1, %xmm15, %xmm15 vpaddd L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0 vmovdqa (%rdi), %xmm7 - vmovdqa %xmm0, (%r12) + vmovdqu %xmm0, (%r12) vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm7, %xmm9, %xmm9 vpxor %xmm7, %xmm10, %xmm10 @@ -10009,7 +10013,7 @@ L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done: L_AES_GCM_encrypt_update_avx1_ghash_128: leaq (%r11,%r14,1), %rcx leaq (%r10,%r14,1), %rdx - 
vmovdqa (%r12), %xmm0 + vmovdqu (%r12), %xmm0 vmovdqa L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1 vpshufb %xmm1, %xmm0, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9 @@ -10028,7 +10032,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpshufb %xmm1, %xmm15, %xmm15 vpaddd L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0 vmovdqa (%rdi), %xmm7 - vmovdqa %xmm0, (%r12) + vmovdqu %xmm0, (%r12) vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm7, %xmm9, %xmm9 vpxor %xmm7, %xmm10, %xmm10 @@ -10037,7 +10041,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpxor %xmm7, %xmm13, %xmm13 vpxor %xmm7, %xmm14, %xmm14 vpxor %xmm7, %xmm15, %xmm15 - vmovdqa 112(%rsp), %xmm7 + vmovdqu 112(%rsp), %xmm7 vmovdqu -128(%rdx), %xmm0 vaesenc 16(%rdi), %xmm8, %xmm8 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10058,7 +10062,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vaesenc 16(%rdi), %xmm15, %xmm15 vpxor %xmm2, %xmm1, %xmm1 vpxor %xmm3, %xmm1, %xmm1 - vmovdqa 96(%rsp), %xmm7 + vmovdqu 96(%rsp), %xmm7 vmovdqu -112(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10081,7 +10085,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 80(%rsp), %xmm7 + vmovdqu 80(%rsp), %xmm7 vmovdqu -96(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10104,7 +10108,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 64(%rsp), %xmm7 + vmovdqu 64(%rsp), %xmm7 vmovdqu -80(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10127,7 +10131,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 48(%rsp), %xmm7 + vmovdqu 48(%rsp), %xmm7 vmovdqu -64(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10150,7 +10154,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 32(%rsp), %xmm7 + vmovdqu 32(%rsp), %xmm7 vmovdqu -48(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10173,7 +10177,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 16(%rsp), %xmm7 + vmovdqu 16(%rsp), %xmm7 vmovdqu -32(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10196,7 +10200,7 @@ L_AES_GCM_encrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa (%rsp), %xmm7 + vmovdqu (%rsp), %xmm7 vmovdqu -16(%rdx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10335,8 +10339,8 @@ L_AES_GCM_encrypt_update_avx1_end_128: vpshufb %xmm4, %xmm13, %xmm13 vpshufb %xmm4, %xmm14, %xmm14 vpshufb %xmm4, %xmm15, %xmm15 - vmovdqa (%rsp), %xmm7 - vmovdqa 16(%rsp), %xmm5 + vmovdqu (%rsp), %xmm7 + vmovdqu 16(%rsp), %xmm5 # ghash_gfmul_avx vpshufd $0x4e, %xmm15, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ -10369,8 +10373,8 @@ L_AES_GCM_encrypt_update_avx1_end_128: vpsrldq $8, %xmm1, %xmm1 vpxor %xmm2, %xmm4, %xmm4 vpxor %xmm1, %xmm6, %xmm6 - vmovdqa 32(%rsp), %xmm7 - vmovdqa 48(%rsp), %xmm5 + vmovdqu 32(%rsp), %xmm7 + vmovdqu 48(%rsp), %xmm5 # ghash_gfmul_xor_avx vpshufd $0x4e, %xmm13, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ 
-10403,8 +10407,8 @@ L_AES_GCM_encrypt_update_avx1_end_128: vpsrldq $8, %xmm1, %xmm1 vpxor %xmm2, %xmm4, %xmm4 vpxor %xmm1, %xmm6, %xmm6 - vmovdqa 64(%rsp), %xmm7 - vmovdqa 80(%rsp), %xmm5 + vmovdqu 64(%rsp), %xmm7 + vmovdqu 80(%rsp), %xmm5 # ghash_gfmul_xor_avx vpshufd $0x4e, %xmm11, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ -10437,8 +10441,8 @@ L_AES_GCM_encrypt_update_avx1_end_128: vpsrldq $8, %xmm1, %xmm1 vpxor %xmm2, %xmm4, %xmm4 vpxor %xmm1, %xmm6, %xmm6 - vmovdqa 96(%rsp), %xmm7 - vmovdqa 112(%rsp), %xmm5 + vmovdqu 96(%rsp), %xmm7 + vmovdqu 112(%rsp), %xmm5 # ghash_gfmul_xor_avx vpshufd $0x4e, %xmm9, %xmm1 vpshufd $0x4e, %xmm7, %xmm2 @@ -10488,7 +10492,7 @@ L_AES_GCM_encrypt_update_avx1_end_128: vpxor %xmm1, %xmm2, %xmm2 vpxor %xmm4, %xmm2, %xmm2 vpxor %xmm2, %xmm6, %xmm6 - vmovdqa (%rsp), %xmm5 + vmovdqu (%rsp), %xmm5 L_AES_GCM_encrypt_update_avx1_done_128: movl %r8d, %edx cmpl %edx, %r14d @@ -10497,10 +10501,10 @@ L_AES_GCM_encrypt_update_avx1_done_128: andl $0xfffffff0, %r13d cmpl %r13d, %r14d jge L_AES_GCM_encrypt_update_avx1_last_block_done - vmovdqa (%r12), %xmm9 + vmovdqu (%r12), %xmm9 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqa %xmm9, (%r12) + vmovdqu %xmm9, (%r12) vpxor (%rdi), %xmm8, %xmm8 vaesenc 16(%rdi), %xmm8, %xmm8 vaesenc 32(%rdi), %xmm8, %xmm8 @@ -10534,10 +10538,10 @@ L_AES_GCM_encrypt_update_avx1_aesenc_block_last: jge L_AES_GCM_encrypt_update_avx1_last_block_ghash L_AES_GCM_encrypt_update_avx1_last_block_start: vmovdqu (%r11,%r14,1), %xmm13 - vmovdqa (%r12), %xmm9 + vmovdqu (%r12), %xmm9 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqa %xmm9, (%r12) + vmovdqu %xmm9, (%r12) vpxor (%rdi), %xmm8, %xmm8 vpclmulqdq $16, %xmm5, %xmm6, %xmm10 vaesenc 16(%rdi), %xmm8, %xmm8 @@ -10642,7 +10646,7 @@ AES_GCM_encrypt_final_avx1: _AES_GCM_encrypt_final_avx1: #endif /* __APPLE__ */ pushq %r13 - movq %rdx, %rax + movl %edx, %eax movl %ecx, %r10d movl %r8d, %r11d movq 16(%rsp), %r8 @@ -10662,8 +10666,9 @@ _AES_GCM_encrypt_final_avx1: movl %r11d, %ecx shlq $3, %rdx shlq $3, %rcx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 - vpinsrq $0x01, %rcx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -10755,7 +10760,7 @@ _AES_GCM_decrypt_update_avx1: andl $0xffffff80, %r13d vmovdqa %xmm6, %xmm2 # H ^ 1 - vmovdqa %xmm5, (%rsp) + vmovdqu %xmm5, (%rsp) # H ^ 2 vpclmulqdq $0x00, %xmm5, %xmm5, %xmm8 vpclmulqdq $0x11, %xmm5, %xmm5, %xmm0 @@ -10775,7 +10780,7 @@ _AES_GCM_decrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm0, %xmm0 - vmovdqa %xmm0, 16(%rsp) + vmovdqu %xmm0, 16(%rsp) # H ^ 3 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -10807,7 +10812,7 @@ _AES_GCM_decrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm1, %xmm1 - vmovdqa %xmm1, 32(%rsp) + vmovdqu %xmm1, 32(%rsp) # H ^ 4 vpclmulqdq $0x00, %xmm0, %xmm0, %xmm8 vpclmulqdq $0x11, %xmm0, %xmm0, %xmm3 @@ -10827,7 +10832,7 @@ _AES_GCM_decrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm3, %xmm3 - vmovdqa %xmm3, 48(%rsp) + vmovdqu %xmm3, 48(%rsp) # H ^ 5 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm0, %xmm9 @@ -10859,7 +10864,7 @@ _AES_GCM_decrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 64(%rsp) + vmovdqu 
%xmm7, 64(%rsp) # H ^ 6 vpclmulqdq $0x00, %xmm1, %xmm1, %xmm8 vpclmulqdq $0x11, %xmm1, %xmm1, %xmm7 @@ -10879,7 +10884,7 @@ _AES_GCM_decrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 80(%rsp) + vmovdqu %xmm7, 80(%rsp) # H ^ 7 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm1, %xmm9 @@ -10911,7 +10916,7 @@ _AES_GCM_decrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 96(%rsp) + vmovdqu %xmm7, 96(%rsp) # H ^ 8 vpclmulqdq $0x00, %xmm3, %xmm3, %xmm8 vpclmulqdq $0x11, %xmm3, %xmm3, %xmm7 @@ -10931,11 +10936,11 @@ _AES_GCM_decrypt_update_avx1: vpxor %xmm13, %xmm14, %xmm14 vpxor %xmm8, %xmm14, %xmm14 vpxor %xmm14, %xmm7, %xmm7 - vmovdqa %xmm7, 112(%rsp) + vmovdqu %xmm7, 112(%rsp) L_AES_GCM_decrypt_update_avx1_ghash_128: leaq (%r11,%r14,1), %rcx leaq (%r10,%r14,1), %rdx - vmovdqa (%r12), %xmm0 + vmovdqu (%r12), %xmm0 vmovdqa L_avx1_aes_gcm_bswap_epi64(%rip), %xmm1 vpshufb %xmm1, %xmm0, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm0, %xmm9 @@ -10954,7 +10959,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vpshufb %xmm1, %xmm15, %xmm15 vpaddd L_avx1_aes_gcm_eight(%rip), %xmm0, %xmm0 vmovdqa (%rdi), %xmm7 - vmovdqa %xmm0, (%r12) + vmovdqu %xmm0, (%r12) vpxor %xmm7, %xmm8, %xmm8 vpxor %xmm7, %xmm9, %xmm9 vpxor %xmm7, %xmm10, %xmm10 @@ -10963,7 +10968,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vpxor %xmm7, %xmm13, %xmm13 vpxor %xmm7, %xmm14, %xmm14 vpxor %xmm7, %xmm15, %xmm15 - vmovdqa 112(%rsp), %xmm7 + vmovdqu 112(%rsp), %xmm7 vmovdqu (%rcx), %xmm0 vaesenc 16(%rdi), %xmm8, %xmm8 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -10984,7 +10989,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vaesenc 16(%rdi), %xmm15, %xmm15 vpxor %xmm2, %xmm1, %xmm1 vpxor %xmm3, %xmm1, %xmm1 - vmovdqa 96(%rsp), %xmm7 + vmovdqu 96(%rsp), %xmm7 vmovdqu 16(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -11007,7 +11012,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 80(%rsp), %xmm7 + vmovdqu 80(%rsp), %xmm7 vmovdqu 32(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -11030,7 +11035,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 64(%rsp), %xmm7 + vmovdqu 64(%rsp), %xmm7 vmovdqu 48(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -11053,7 +11058,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 48(%rsp), %xmm7 + vmovdqu 48(%rsp), %xmm7 vmovdqu 64(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -11076,7 +11081,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 32(%rsp), %xmm7 + vmovdqu 32(%rsp), %xmm7 vmovdqu 80(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -11099,7 +11104,7 @@ L_AES_GCM_decrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa 16(%rsp), %xmm7 + vmovdqu 16(%rsp), %xmm7 vmovdqu 96(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -11122,7 +11127,7 @@ 
L_AES_GCM_decrypt_update_avx1_ghash_128: vpxor %xmm6, %xmm1, %xmm1 vpxor %xmm6, %xmm3, %xmm3 vpxor %xmm4, %xmm1, %xmm1 - vmovdqa (%rsp), %xmm7 + vmovdqu (%rsp), %xmm7 vmovdqu 112(%rcx), %xmm0 vpshufd $0x4e, %xmm7, %xmm4 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm0, %xmm0 @@ -11251,7 +11256,7 @@ L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done: cmpl %r13d, %r14d jl L_AES_GCM_decrypt_update_avx1_ghash_128 vmovdqa %xmm2, %xmm6 - vmovdqa (%rsp), %xmm5 + vmovdqu (%rsp), %xmm5 L_AES_GCM_decrypt_update_avx1_done_128: movl %r8d, %edx cmpl %edx, %r14d @@ -11265,10 +11270,10 @@ L_AES_GCM_decrypt_update_avx1_last_block_start: vmovdqa %xmm5, %xmm0 vpshufb L_avx1_aes_gcm_bswap_mask(%rip), %xmm13, %xmm1 vpxor %xmm6, %xmm1, %xmm1 - vmovdqa (%r12), %xmm9 + vmovdqu (%r12), %xmm9 vpshufb L_avx1_aes_gcm_bswap_epi64(%rip), %xmm9, %xmm8 vpaddd L_avx1_aes_gcm_one(%rip), %xmm9, %xmm9 - vmovdqa %xmm9, (%r12) + vmovdqu %xmm9, (%r12) vpxor (%rdi), %xmm8, %xmm8 vpclmulqdq $16, %xmm0, %xmm1, %xmm10 vaesenc 16(%rdi), %xmm8, %xmm8 @@ -11342,7 +11347,7 @@ _AES_GCM_decrypt_final_avx1: pushq %r13 pushq %rbp pushq %r12 - movq %rdx, %rax + movl %edx, %eax movl %ecx, %r10d movl %r8d, %r11d movq 32(%rsp), %r8 @@ -11363,8 +11368,9 @@ _AES_GCM_decrypt_final_avx1: movl %r11d, %ecx shlq $3, %rdx shlq $3, %rcx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 - vpinsrq $0x01, %rcx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm0, %xmm6, %xmm6 # ghash_gfmul_red_avx vpshufd $0x4e, %xmm5, %xmm9 @@ -11737,7 +11743,7 @@ L_AES_GCM_encrypt_avx2_calc_iv_done: # T = Encrypt counter vpxor %xmm0, %xmm0, %xmm0 shll $3, %edx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_avx vpclmulqdq $16, %xmm4, %xmm5, %xmm2 @@ -12840,10 +12846,10 @@ L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc: L_AES_GCM_encrypt_avx2_done_enc: # calc_tag shlq $3, %r10 - vpinsrq $0x00, %r10, %xmm0, %xmm0 shlq $3, %r11 - vpinsrq $0x01, %r11, %xmm1, %xmm1 - vpblendd $12, %xmm1, %xmm0, %xmm0 + vmovq %r10, %xmm0 + vmovq %r11, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm6, %xmm0, %xmm0 # ghash_gfmul_red vpclmulqdq $16, %xmm5, %xmm0, %xmm4 @@ -13047,7 +13053,7 @@ L_AES_GCM_decrypt_avx2_calc_iv_done: # T = Encrypt counter vpxor %xmm0, %xmm0, %xmm0 shll $3, %edx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_avx vpclmulqdq $16, %xmm4, %xmm5, %xmm2 @@ -13799,10 +13805,10 @@ L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop: L_AES_GCM_decrypt_avx2_done_dec: # calc_tag shlq $3, %r10 - vpinsrq $0x00, %r10, %xmm0, %xmm0 shlq $3, %r11 - vpinsrq $0x01, %r11, %xmm1, %xmm1 - vpblendd $12, %xmm1, %xmm0, %xmm0 + vmovq %r10, %xmm0 + vmovq %r11, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm6, %xmm0, %xmm0 # ghash_gfmul_red vpclmulqdq $16, %xmm5, %xmm0, %xmm4 @@ -14006,7 +14012,7 @@ L_AES_GCM_init_avx2_calc_iv_done: # T = Encrypt counter vpxor %xmm0, %xmm0, %xmm0 shll $3, %edx - vpinsrq $0x00, %rdx, %xmm0, %xmm0 + vmovq %rdx, %xmm0 vpxor %xmm0, %xmm4, %xmm4 # ghash_gfmul_avx vpclmulqdq $16, %xmm4, %xmm5, %xmm2 @@ -15182,10 +15188,7 @@ AES_GCM_encrypt_final_avx2: .p2align 4 _AES_GCM_encrypt_final_avx2: #endif /* __APPLE__ */ - pushq %r13 - movl %ecx, %r10d - movl %r8d, %r11d - movq 16(%rsp), %rax + movq 8(%rsp), %rax subq $16, %rsp vmovdqu (%rdi), %xmm4 vmovdqu (%r9), %xmm5 @@ -15199,11 +15202,11 @@ _AES_GCM_encrypt_final_avx2: vpand L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 vpxor %xmm0, %xmm5, %xmm5 # calc_tag - shlq $3, %r10 - 
vpinsrq $0x00, %r10, %xmm0, %xmm0 - shlq $3, %r11 - vpinsrq $0x01, %r11, %xmm1, %xmm1 - vpblendd $12, %xmm1, %xmm0, %xmm0 + shlq $3, %rcx + shlq $3, %r8 + vmovq %rcx, %xmm0 + vmovq %r8, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm4, %xmm0, %xmm0 # ghash_gfmul_red vpclmulqdq $16, %xmm5, %xmm0, %xmm7 @@ -15227,13 +15230,13 @@ _AES_GCM_encrypt_final_avx2: # store_tag cmpl $16, %edx je L_AES_GCM_encrypt_final_avx2_store_tag_16 - xorq %rcx, %rcx + xorq %r10, %r10 vmovdqu %xmm0, (%rsp) L_AES_GCM_encrypt_final_avx2_store_tag_loop: - movzbl (%rsp,%rcx,1), %r13d - movb %r13b, (%rsi,%rcx,1) - incl %ecx - cmpl %edx, %ecx + movzbl (%rsp,%r10,1), %r11d + movb %r11b, (%rsi,%r10,1) + incl %r10d + cmpl %edx, %r10d jne L_AES_GCM_encrypt_final_avx2_store_tag_loop jmp L_AES_GCM_encrypt_final_avx2_store_tag_done L_AES_GCM_encrypt_final_avx2_store_tag_16: @@ -15241,7 +15244,6 @@ L_AES_GCM_encrypt_final_avx2_store_tag_16: L_AES_GCM_encrypt_final_avx2_store_tag_done: vzeroupper addq $16, %rsp - popq %r13 repz retq #ifndef __APPLE__ .size AES_GCM_encrypt_final_avx2,.-AES_GCM_encrypt_final_avx2 @@ -15769,12 +15771,9 @@ AES_GCM_decrypt_final_avx2: .p2align 4 _AES_GCM_decrypt_final_avx2: #endif /* __APPLE__ */ - pushq %r13 - pushq %rbp - movl %ecx, %r10d - movl %r8d, %r11d - movq 24(%rsp), %rax - movq 32(%rsp), %rbp + pushq %r12 + movq 16(%rsp), %rax + movq 24(%rsp), %r10 subq $16, %rsp vmovdqu (%rdi), %xmm4 vmovdqu (%r9), %xmm5 @@ -15788,11 +15787,11 @@ _AES_GCM_decrypt_final_avx2: vpand L_avx2_aes_gcm_mod2_128(%rip), %xmm5, %xmm5 vpxor %xmm0, %xmm5, %xmm5 # calc_tag - shlq $3, %r10 - vpinsrq $0x00, %r10, %xmm0, %xmm0 - shlq $3, %r11 - vpinsrq $0x01, %r11, %xmm1, %xmm1 - vpblendd $12, %xmm1, %xmm0, %xmm0 + shlq $3, %rcx + shlq $3, %r8 + vmovq %rcx, %xmm0 + vmovq %r8, %xmm1 + vpunpcklqdq %xmm1, %xmm0, %xmm0 vpxor %xmm4, %xmm0, %xmm0 # ghash_gfmul_red vpclmulqdq $16, %xmm5, %xmm0, %xmm7 @@ -15816,15 +15815,15 @@ _AES_GCM_decrypt_final_avx2: # cmp_tag cmpl $16, %edx je L_AES_GCM_decrypt_final_avx2_cmp_tag_16 - xorq %rcx, %rcx + xorq %r11, %r11 xorq %r9, %r9 vmovdqu %xmm0, (%rsp) L_AES_GCM_decrypt_final_avx2_cmp_tag_loop: - movzbl (%rsp,%rcx,1), %r13d - xorb (%rsi,%rcx,1), %r13b - orb %r13b, %r9b - incl %ecx - cmpl %edx, %ecx + movzbl (%rsp,%r11,1), %r12d + xorb (%rsi,%r11,1), %r12b + orb %r12b, %r9b + incl %r11d + cmpl %edx, %r11d jne L_AES_GCM_decrypt_final_avx2_cmp_tag_loop cmpb $0x00, %r9b sete %r9b @@ -15832,17 +15831,16 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_loop: L_AES_GCM_decrypt_final_avx2_cmp_tag_16: vmovdqu (%rsi), %xmm1 vpcmpeqb %xmm1, %xmm0, %xmm0 - vpmovmskb %xmm0, %rcx + vpmovmskb %xmm0, %r11 # %%edx == 0xFFFF then return 1 else => return 0 xorl %r9d, %r9d - cmpl $0xffff, %ecx + cmpl $0xffff, %r11d sete %r9b L_AES_GCM_decrypt_final_avx2_cmp_tag_done: - movl %r9d, (%rbp) + movl %r9d, (%r10) vzeroupper addq $16, %rsp - popq %rbp - popq %r13 + popq %r12 repz retq #ifndef __APPLE__ .size AES_GCM_decrypt_final_avx2,.-AES_GCM_decrypt_final_avx2 diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm new file mode 100644 index 000000000..031a0961c --- /dev/null +++ b/wolfcrypt/src/aes_gcm_asm.asm @@ -0,0 +1,15423 @@ +; /* aes_gcm_asm +; * +; * Copyright (C) 2006-2023 wolfSSL Inc. +; * +; * This file is part of wolfSSL. +; * +; * wolfSSL is free software; you can redistribute it and/or modify +; * it under the terms of the GNU General Public License as published by +; * the Free Software Foundation; either version 2 of the License, or +; * (at your option) any later version. 
+; * +; * wolfSSL is distributed in the hope that it will be useful, +; * but WITHOUT ANY WARRANTY; without even the implied warranty of +; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; * GNU General Public License for more details. +; * +; * You should have received a copy of the GNU General Public License +; * along with this program; if not, write to the Free Software +; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA +; */ +IF @Version LT 1200 +; AVX2 instructions not recognized by old versions of MASM +IFNDEF NO_AVX2_SUPPORT +NO_AVX2_SUPPORT = 1 +ENDIF +; MOVBE instruction not recognized by old versions of MASM +IFNDEF NO_MOVBE_SUPPORT +NO_MOVBE_SUPPORT = 1 +ENDIF +ENDIF + +IFNDEF HAVE_INTEL_AVX1 +HAVE_INTEL_AVX1 = 1 +ENDIF +IFNDEF NO_AVX2_SUPPORT +HAVE_INTEL_AVX2 = 1 +ENDIF + +IFNDEF _WIN64 +_WIN64 = 1 +ENDIF + +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_one QWORD 0, 1 +ptr_L_aes_gcm_one QWORD L_aes_gcm_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_two QWORD 0, 2 +ptr_L_aes_gcm_two QWORD L_aes_gcm_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_three QWORD 0, 3 +ptr_L_aes_gcm_three QWORD L_aes_gcm_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_four QWORD 0, 4 +ptr_L_aes_gcm_four QWORD L_aes_gcm_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_five QWORD 0, 5 +ptr_L_aes_gcm_five QWORD L_aes_gcm_five +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_six QWORD 0, 6 +ptr_L_aes_gcm_six QWORD L_aes_gcm_six +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_seven QWORD 0, 7 +ptr_L_aes_gcm_seven QWORD L_aes_gcm_seven +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_eight QWORD 0, 8 +ptr_L_aes_gcm_eight QWORD L_aes_gcm_eight +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567 +ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183 +ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_aes_gcm_mod2_128 QWORD 1, 13979173243358019584 +ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128 +_DATA ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+96] + mov r9d, DWORD PTR [rsp+104] + mov r11d, DWORD PTR [rsp+112] + mov ebx, DWORD PTR [rsp+120] + mov r14d, DWORD PTR [rsp+128] + mov r15, QWORD PTR [rsp+136] + mov r10d, DWORD PTR [rsp+144] + sub rsp, 160 + pxor xmm4, xmm4 + pxor xmm6, xmm6 + cmp ebx, 12 + mov edx, ebx + jne L_AES_GCM_encrypt_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + pinsrq xmm4, QWORD PTR [rax], 0 + pinsrd xmm4, DWORD PTR [rax+8], 2 + pinsrd xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + movdqa xmm1, xmm4 + movdqa xmm5, OWORD PTR [r15] + pxor xmm1, xmm5 + movdqa xmm7, OWORD PTR [r15+16] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+32] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+48] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+64] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+80] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+96] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+112] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+128] + aesenc xmm5, 
xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+144] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + cmp r10d, 11 + movdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+176] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + cmp r10d, 13 + movdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+208] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_encrypt_calc_iv_12_last: + aesenclast xmm5, xmm7 + aesenclast xmm1, xmm7 + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + movdqu [rsp+144], xmm1 + jmp L_AES_GCM_encrypt_iv_done +L_AES_GCM_encrypt_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + movdqa xmm5, OWORD PTR [r15] + aesenc xmm5, [r15+16] + aesenc xmm5, [r15+32] + aesenc xmm5, [r15+48] + aesenc xmm5, [r15+64] + aesenc xmm5, [r15+80] + aesenc xmm5, [r15+96] + aesenc xmm5, [r15+112] + aesenc xmm5, [r15+128] + aesenc xmm5, [r15+144] + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm9 + aesenc xmm5, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm9 + aesenc xmm5, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last: + aesenclast xmm5, xmm9 + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_encrypt_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_calc_iv_16_loop: + movdqu xmm8, [rax+rcx] + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm8 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_encrypt_calc_iv_done +L_AES_GCM_encrypt_calc_iv_lt16: + sub rsp, 16 + pxor xmm8, xmm8 + xor ebx, ebx + movdqu [rsp], xmm8 +L_AES_GCM_encrypt_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_calc_iv_loop + movdqu xmm8, [rsp] + add rsp, 16 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm8 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, 
xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 +L_AES_GCM_encrypt_calc_iv_done: + ; T = Encrypt counter + pxor xmm0, xmm0 + shl edx, 3 + pinsrq xmm0, rdx, 0 + pxor xmm4, xmm0 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + ; Encrypt counter + movdqa xmm8, OWORD PTR [r15] + pxor xmm8, xmm4 + aesenc xmm8, [r15+16] + aesenc xmm8, [r15+32] + aesenc xmm8, [r15+48] + aesenc xmm8, [r15+64] + aesenc xmm8, [r15+80] + aesenc xmm8, [r15+96] + aesenc xmm8, [r15+112] + aesenc xmm8, [r15+128] + aesenc xmm8, [r15+144] + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last: + aesenclast xmm8, xmm9 + movdqu [rsp+144], xmm8 +L_AES_GCM_encrypt_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_encrypt_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_calc_aad_16_loop: + movdqu xmm8, [r12+rcx] + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + pshufd xmm1, xmm6, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm6, 17 + pclmulqdq xmm0, xmm6, 0 + pxor xmm1, xmm6 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm6 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm6, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + 
psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm6, xmm2 + por xmm7, xmm0 + por xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_encrypt_calc_aad_done +L_AES_GCM_encrypt_calc_aad_lt16: + sub rsp, 16 + pxor xmm8, xmm8 + xor ebx, ebx + movdqu [rsp], xmm8 +L_AES_GCM_encrypt_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_calc_aad_loop + movdqu xmm8, [rsp] + add rsp, 16 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + pshufd xmm1, xmm6, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm6, 17 + pclmulqdq xmm0, xmm6, 0 + pxor xmm1, xmm6 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm6 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm6, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm6, xmm2 + por xmm7, xmm0 + por xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm6, xmm2 +L_AES_GCM_encrypt_calc_aad_done: + ; Calculate counter and H + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm9, xmm5 + paddd xmm4, OWORD PTR L_aes_gcm_one + movdqa xmm8, xmm5 + movdqu [rsp+128], xmm4 + psrlq xmm9, 63 + psllq xmm8, 1 + pslldq xmm9, 8 + por xmm8, xmm9 + pshufd xmm5, xmm5, 255 + psrad xmm5, 31 + pand xmm5, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm5, xmm8 + xor rbx, rbx + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_encrypt_done_128 + and r13d, 4294967168 + movdqa xmm2, xmm6 + ; H ^ 1 + movdqu [rsp], xmm5 + ; H ^ 2 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm5, 78 + movdqa xmm11, xmm5 + movdqa xmm8, xmm5 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm5 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm0, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm0, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm0, xmm14 + movdqu [rsp+16], xmm0 + ; H ^ 3 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + 
pxor xmm9, xmm5 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm1, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm1, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm1, xmm14 + movdqu [rsp+32], xmm1 + ; H ^ 4 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm3, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm3, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm3, xmm14 + movdqu [rsp+48], xmm3 + ; H ^ 5 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+64], xmm7 + ; H ^ 6 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+80], xmm7 + ; H ^ 7 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor 
xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+96], xmm7 + ; H ^ 8 + pshufd xmm9, xmm3, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm3, 17 + pclmulqdq xmm8, xmm3, 0 + pxor xmm9, xmm3 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+112], xmm7 + ; First 128 bytes of input + movdqu xmm8, [rsp+128] + movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm0, xmm8 + pshufb xmm8, xmm1 + movdqa xmm9, xmm0 + paddd xmm9, OWORD PTR L_aes_gcm_one + pshufb xmm9, xmm1 + movdqa xmm10, xmm0 + paddd xmm10, OWORD PTR L_aes_gcm_two + pshufb xmm10, xmm1 + movdqa xmm11, xmm0 + paddd xmm11, OWORD PTR L_aes_gcm_three + pshufb xmm11, xmm1 + movdqa xmm12, xmm0 + paddd xmm12, OWORD PTR L_aes_gcm_four + pshufb xmm12, xmm1 + movdqa xmm13, xmm0 + paddd xmm13, OWORD PTR L_aes_gcm_five + pshufb xmm13, xmm1 + movdqa xmm14, xmm0 + paddd xmm14, OWORD PTR L_aes_gcm_six + pshufb xmm14, xmm1 + movdqa xmm15, xmm0 + paddd xmm15, OWORD PTR L_aes_gcm_seven + pshufb xmm15, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_eight + movdqa xmm7, OWORD PTR [r15] + movdqu [rsp+128], xmm0 + pxor xmm8, xmm7 + pxor xmm9, xmm7 + pxor xmm10, xmm7 + pxor xmm11, xmm7 + pxor xmm12, xmm7 + pxor xmm13, xmm7 + pxor xmm14, xmm7 + pxor xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+16] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+32] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+48] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+64] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+80] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+96] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+112] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + 
aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+128] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+144] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r10d, 11 + movdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_enc_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+176] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r10d, 13 + movdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_enc_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+208] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_encrypt_enc_done: + aesenclast xmm8, xmm7 + aesenclast xmm9, xmm7 + movdqu xmm0, [rdi] + movdqu xmm1, [rdi+16] + pxor xmm8, xmm0 + pxor xmm9, xmm1 + movdqu [rsi], xmm8 + movdqu [rsi+16], xmm9 + aesenclast xmm10, xmm7 + aesenclast xmm11, xmm7 + movdqu xmm0, [rdi+32] + movdqu xmm1, [rdi+48] + pxor xmm10, xmm0 + pxor xmm11, xmm1 + movdqu [rsi+32], xmm10 + movdqu [rsi+48], xmm11 + aesenclast xmm12, xmm7 + aesenclast xmm13, xmm7 + movdqu xmm0, [rdi+64] + movdqu xmm1, [rdi+80] + pxor xmm12, xmm0 + pxor xmm13, xmm1 + movdqu [rsi+64], xmm12 + movdqu [rsi+80], xmm13 + aesenclast xmm14, xmm7 + aesenclast xmm15, xmm7 + movdqu xmm0, [rdi+96] + movdqu xmm1, [rdi+112] + pxor xmm14, xmm0 + pxor xmm15, xmm1 + movdqu [rsi+96], xmm14 + movdqu [rsi+112], xmm15 + cmp r13d, 128 + mov ebx, 128 + jle L_AES_GCM_encrypt_end_128 + ; More 128 bytes of input +L_AES_GCM_encrypt_ghash_128: + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + movdqu xmm8, [rsp+128] + movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm0, xmm8 + pshufb xmm8, xmm1 + movdqa xmm9, xmm0 + paddd xmm9, OWORD PTR L_aes_gcm_one + pshufb xmm9, xmm1 + movdqa xmm10, xmm0 + paddd xmm10, OWORD PTR L_aes_gcm_two + pshufb xmm10, xmm1 + movdqa xmm11, xmm0 + paddd xmm11, OWORD PTR L_aes_gcm_three + pshufb xmm11, xmm1 + movdqa xmm12, xmm0 + paddd xmm12, OWORD PTR L_aes_gcm_four + pshufb xmm12, xmm1 + movdqa xmm13, xmm0 + paddd xmm13, OWORD PTR L_aes_gcm_five + pshufb xmm13, xmm1 + movdqa xmm14, xmm0 + paddd xmm14, OWORD PTR L_aes_gcm_six + pshufb xmm14, xmm1 + movdqa xmm15, xmm0 + paddd xmm15, OWORD PTR L_aes_gcm_seven + pshufb xmm15, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_eight + movdqa xmm7, OWORD PTR [r15] + movdqu [rsp+128], xmm0 + pxor xmm8, xmm7 + pxor xmm9, xmm7 + pxor xmm10, xmm7 + pxor xmm11, xmm7 + pxor xmm12, xmm7 + pxor xmm13, xmm7 + pxor xmm14, xmm7 + pxor xmm15, xmm7 + movdqu xmm7, [rsp+112] + movdqu xmm0, [rdx+-128] + aesenc xmm8, [r15+16] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm2 + pshufd xmm1, xmm7, 78 + pshufd xmm5, xmm0, 78 + pxor xmm1, xmm7 + pxor xmm5, 
xmm0 + movdqa xmm3, xmm0 + pclmulqdq xmm3, xmm7, 17 + aesenc xmm9, [r15+16] + aesenc xmm10, [r15+16] + movdqa xmm2, xmm0 + pclmulqdq xmm2, xmm7, 0 + aesenc xmm11, [r15+16] + aesenc xmm12, [r15+16] + pclmulqdq xmm1, xmm5, 0 + aesenc xmm13, [r15+16] + aesenc xmm14, [r15+16] + aesenc xmm15, [r15+16] + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqu xmm7, [rsp+96] + movdqu xmm0, [rdx+-112] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+32] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+32] + aesenc xmm10, [r15+32] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+32] + aesenc xmm12, [r15+32] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+32] + aesenc xmm14, [r15+32] + aesenc xmm15, [r15+32] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+80] + movdqu xmm0, [rdx+-96] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+48] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+48] + aesenc xmm10, [r15+48] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+48] + aesenc xmm12, [r15+48] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+48] + aesenc xmm14, [r15+48] + aesenc xmm15, [r15+48] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+64] + movdqu xmm0, [rdx+-80] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+64] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+64] + aesenc xmm10, [r15+64] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+64] + aesenc xmm12, [r15+64] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+64] + aesenc xmm14, [r15+64] + aesenc xmm15, [r15+64] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+48] + movdqu xmm0, [rdx+-64] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+80] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+80] + aesenc xmm10, [r15+80] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+80] + aesenc xmm12, [r15+80] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+80] + aesenc xmm14, [r15+80] + aesenc xmm15, [r15+80] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+32] + movdqu xmm0, [rdx+-48] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+96] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+96] + aesenc xmm10, [r15+96] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+96] + aesenc xmm12, [r15+96] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+96] + aesenc xmm14, [r15+96] + aesenc xmm15, [r15+96] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+16] + movdqu xmm0, [rdx+-32] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+112] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+112] + aesenc xmm10, [r15+112] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+112] + aesenc xmm12, [r15+112] + 
pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+112] + aesenc xmm14, [r15+112] + aesenc xmm15, [r15+112] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp] + movdqu xmm0, [rdx+-16] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+128] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+128] + aesenc xmm10, [r15+128] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+128] + aesenc xmm12, [r15+128] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+128] + aesenc xmm14, [r15+128] + aesenc xmm15, [r15+128] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqa xmm5, xmm1 + psrldq xmm1, 8 + pslldq xmm5, 8 + aesenc xmm8, [r15+144] + pxor xmm2, xmm5 + pxor xmm3, xmm1 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + movdqa xmm5, xmm2 + aesenc xmm9, [r15+144] + pslld xmm7, 31 + pslld xmm4, 30 + pslld xmm5, 25 + aesenc xmm10, [r15+144] + pxor xmm7, xmm4 + pxor xmm7, xmm5 + aesenc xmm11, [r15+144] + movdqa xmm4, xmm7 + pslldq xmm7, 12 + psrldq xmm4, 4 + aesenc xmm12, [r15+144] + pxor xmm2, xmm7 + movdqa xmm5, xmm2 + movdqa xmm1, xmm2 + movdqa xmm0, xmm2 + aesenc xmm13, [r15+144] + psrld xmm5, 1 + psrld xmm1, 2 + psrld xmm0, 7 + aesenc xmm14, [r15+144] + pxor xmm5, xmm1 + pxor xmm5, xmm0 + aesenc xmm15, [r15+144] + pxor xmm5, xmm4 + pxor xmm2, xmm5 + pxor xmm2, xmm3 + cmp r10d, 11 + movdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+176] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r10d, 13 + movdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+208] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_encrypt_aesenc_128_ghash_avx_done: + aesenclast xmm8, xmm7 + aesenclast xmm9, xmm7 + movdqu xmm0, [rcx] + movdqu xmm1, [rcx+16] + pxor xmm8, xmm0 + pxor xmm9, xmm1 + movdqu [rdx], xmm8 + movdqu [rdx+16], xmm9 + aesenclast xmm10, xmm7 + aesenclast xmm11, xmm7 + movdqu xmm0, [rcx+32] + movdqu xmm1, [rcx+48] + pxor xmm10, xmm0 + pxor xmm11, xmm1 + movdqu [rdx+32], xmm10 + movdqu [rdx+48], xmm11 + aesenclast xmm12, xmm7 + aesenclast xmm13, xmm7 + movdqu xmm0, [rcx+64] + movdqu xmm1, [rcx+80] + pxor xmm12, xmm0 + pxor xmm13, xmm1 + movdqu [rdx+64], xmm12 + movdqu [rdx+80], xmm13 + aesenclast xmm14, xmm7 + aesenclast xmm15, xmm7 + movdqu xmm0, [rcx+96] + movdqu xmm1, [rcx+112] + pxor xmm14, xmm0 + pxor xmm15, xmm1 + movdqu [rdx+96], xmm14 + movdqu [rdx+112], xmm15 + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_encrypt_ghash_128 +L_AES_GCM_encrypt_end_128: + movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask + pshufb xmm8, xmm4 + pshufb xmm9, xmm4 + pshufb xmm10, xmm4 + pshufb xmm11, xmm4 + pxor xmm8, xmm2 + pshufb xmm12, xmm4 + pshufb xmm13, xmm4 + pshufb xmm14, xmm4 + pshufb xmm15, xmm4 + movdqu 
xmm7, [rsp+112] + pshufd xmm1, xmm8, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm8, 17 + pclmulqdq xmm0, xmm8, 0 + pxor xmm1, xmm8 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm4, xmm0 + movdqa xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+96] + pshufd xmm1, xmm9, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm9, 17 + pclmulqdq xmm0, xmm9, 0 + pxor xmm1, xmm9 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+80] + pshufd xmm1, xmm10, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm10, 17 + pclmulqdq xmm0, xmm10, 0 + pxor xmm1, xmm10 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+64] + pshufd xmm1, xmm11, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm11, 17 + pclmulqdq xmm0, xmm11, 0 + pxor xmm1, xmm11 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+48] + pshufd xmm1, xmm12, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm12, 17 + pclmulqdq xmm0, xmm12, 0 + pxor xmm1, xmm12 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+32] + pshufd xmm1, xmm13, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm13, 17 + pclmulqdq xmm0, xmm13, 0 + pxor xmm1, xmm13 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+16] + pshufd xmm1, xmm14, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm14, 17 + pclmulqdq xmm0, xmm14, 0 + pxor xmm1, xmm14 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp] + pshufd xmm1, xmm15, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm15, 17 + pclmulqdq xmm0, xmm15, 0 + pxor xmm1, xmm15 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm4 + movdqa xmm2, xmm4 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm4, xmm0 + movdqa xmm2, xmm4 + movdqa xmm3, xmm4 + movdqa xmm0, xmm4 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm4 + pxor xmm6, xmm2 + 
movdqu xmm5, [rsp] +L_AES_GCM_encrypt_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_encrypt_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_encrypt_last_block_done + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + movdqu xmm8, [rsp+128] + movdqa xmm9, xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm9, OWORD PTR L_aes_gcm_one + pxor xmm8, [r15] + movdqu [rsp+128], xmm9 + aesenc xmm8, [r15+16] + aesenc xmm8, [r15+32] + aesenc xmm8, [r15+48] + aesenc xmm8, [r15+64] + aesenc xmm8, [r15+80] + aesenc xmm8, [r15+96] + aesenc xmm8, [r15+112] + aesenc xmm8, [r15+128] + aesenc xmm8, [r15+144] + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last: + aesenclast xmm8, xmm9 + movdqu xmm9, [rcx] + pxor xmm8, xmm9 + movdqu [rdx], xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + add ebx, 16 + cmp ebx, r13d + jge L_AES_GCM_encrypt_last_block_ghash +L_AES_GCM_encrypt_last_block_start: + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + movdqu xmm8, [rsp+128] + movdqa xmm9, xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm9, OWORD PTR L_aes_gcm_one + pxor xmm8, [r15] + movdqu [rsp+128], xmm9 + movdqa xmm10, xmm6 + pclmulqdq xmm10, xmm5, 16 + aesenc xmm8, [r15+16] + aesenc xmm8, [r15+32] + movdqa xmm11, xmm6 + pclmulqdq xmm11, xmm5, 1 + aesenc xmm8, [r15+48] + aesenc xmm8, [r15+64] + movdqa xmm12, xmm6 + pclmulqdq xmm12, xmm5, 0 + aesenc xmm8, [r15+80] + movdqa xmm1, xmm6 + pclmulqdq xmm1, xmm5, 17 + aesenc xmm8, [r15+96] + pxor xmm10, xmm11 + movdqa xmm2, xmm10 + psrldq xmm10, 8 + pslldq xmm2, 8 + aesenc xmm8, [r15+112] + movdqa xmm3, xmm1 + pxor xmm2, xmm12 + pxor xmm3, xmm10 + movdqa xmm0, OWORD PTR L_aes_gcm_mod2_128 + movdqa xmm11, xmm2 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [r15+128] + pshufd xmm10, xmm2, 78 + pxor xmm10, xmm11 + movdqa xmm11, xmm10 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [r15+144] + pshufd xmm6, xmm10, 78 + pxor xmm6, xmm11 + pxor xmm6, xmm3 + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_aesenc_gfmul_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_aesenc_gfmul_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_aesenc_gfmul_last: + aesenclast xmm8, xmm9 + movdqu xmm9, [rcx] + pxor xmm8, xmm9 + movdqu [rdx], xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_encrypt_last_block_start +L_AES_GCM_encrypt_last_block_ghash: + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm6, 78 + movdqa xmm11, xmm6 + movdqa xmm8, xmm6 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm6 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm6, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm6, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa 
xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm6, xmm14 +L_AES_GCM_encrypt_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_aesenc_last15_enc_avx_done + movdqu xmm4, [rsp+128] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + pxor xmm4, [r15] + aesenc xmm4, [r15+16] + aesenc xmm4, [r15+32] + aesenc xmm4, [r15+48] + aesenc xmm4, [r15+64] + aesenc xmm4, [r15+80] + aesenc xmm4, [r15+96] + aesenc xmm4, [r15+112] + aesenc xmm4, [r15+128] + aesenc xmm4, [r15+144] + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last + aesenc xmm4, xmm9 + aesenc xmm4, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last + aesenc xmm4, xmm9 + aesenc xmm4, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last: + aesenclast xmm4, xmm9 + sub rsp, 16 + xor ecx, ecx + movdqu [rsp], xmm4 +L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + mov BYTE PTR [rsp+rcx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop + xor r13, r13 + cmp ecx, 16 + je L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [rsp+rcx], r13b + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc: + movdqu xmm4, [rsp] + add rsp, 16 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm4 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm6, 78 + movdqa xmm11, xmm6 + movdqa xmm8, xmm6 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm6 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm6, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm6, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm6, xmm14 +L_AES_GCM_encrypt_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_done_enc: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + pinsrq xmm0, rdx, 0 + pinsrq xmm0, rcx, 1 + pxor xmm6, xmm0 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm6, 78 + movdqa xmm11, xmm6 + movdqa xmm8, xmm6 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm6 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm6, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm6, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm6, 
xmm14 + pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm0, [rsp+144] + pxor xmm0, xmm6 + cmp r14d, 16 + je L_AES_GCM_encrypt_store_tag_16 + xor rcx, rcx + movdqu [rsp], xmm0 +L_AES_GCM_encrypt_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r8+rcx], r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_encrypt_store_tag_loop + jmp L_AES_GCM_encrypt_store_tag_done +L_AES_GCM_encrypt_store_tag_16: + movdqu [r8], xmm0 +L_AES_GCM_encrypt_store_tag_done: + add rsp, 160 + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_encrypt ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + push rbp + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+104] + mov r9d, DWORD PTR [rsp+112] + mov r11d, DWORD PTR [rsp+120] + mov ebx, DWORD PTR [rsp+128] + mov r14d, DWORD PTR [rsp+136] + mov r15, QWORD PTR [rsp+144] + mov r10d, DWORD PTR [rsp+152] + mov rbp, QWORD PTR [rsp+160] + sub rsp, 168 + pxor xmm4, xmm4 + pxor xmm6, xmm6 + cmp ebx, 12 + mov edx, ebx + jne L_AES_GCM_decrypt_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + pinsrq xmm4, QWORD PTR [rax], 0 + pinsrd xmm4, DWORD PTR [rax+8], 2 + pinsrd xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + movdqa xmm1, xmm4 + movdqa xmm5, OWORD PTR [r15] + pxor xmm1, xmm5 + movdqa xmm7, OWORD PTR [r15+16] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+32] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+48] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+64] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+80] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+96] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+112] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+128] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+144] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + cmp r10d, 11 + movdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+176] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + cmp r10d, 13 + movdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+208] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_decrypt_calc_iv_12_last: + aesenclast xmm5, xmm7 + aesenclast xmm1, xmm7 + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + movdqu [rsp+144], xmm1 + jmp L_AES_GCM_decrypt_iv_done +L_AES_GCM_decrypt_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + movdqa xmm5, OWORD PTR [r15] + aesenc xmm5, [r15+16] + aesenc xmm5, [r15+32] + aesenc xmm5, [r15+48] + aesenc xmm5, [r15+64] + aesenc xmm5, [r15+80] + aesenc xmm5, [r15+96] + aesenc xmm5, [r15+112] + aesenc xmm5, [r15+128] + aesenc xmm5, [r15+144] + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm9 + aesenc xmm5, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm9 + aesenc xmm5, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last: + aesenclast xmm5, xmm9 + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + ; Calc 
counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_decrypt_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_calc_iv_16_loop: + movdqu xmm8, [rax+rcx] + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm8 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_decrypt_calc_iv_done +L_AES_GCM_decrypt_calc_iv_lt16: + sub rsp, 16 + pxor xmm8, xmm8 + xor ebx, ebx + movdqu [rsp], xmm8 +L_AES_GCM_decrypt_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_calc_iv_loop + movdqu xmm8, [rsp] + add rsp, 16 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm8 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 +L_AES_GCM_decrypt_calc_iv_done: + ; T = Encrypt counter + pxor xmm0, xmm0 + shl edx, 3 + pinsrq xmm0, rdx, 0 + pxor xmm4, xmm0 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + 
movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + ; Encrypt counter + movdqa xmm8, OWORD PTR [r15] + pxor xmm8, xmm4 + aesenc xmm8, [r15+16] + aesenc xmm8, [r15+32] + aesenc xmm8, [r15+48] + aesenc xmm8, [r15+64] + aesenc xmm8, [r15+80] + aesenc xmm8, [r15+96] + aesenc xmm8, [r15+112] + aesenc xmm8, [r15+128] + aesenc xmm8, [r15+144] + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last: + aesenclast xmm8, xmm9 + movdqu [rsp+144], xmm8 +L_AES_GCM_decrypt_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_decrypt_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_calc_aad_16_loop: + movdqu xmm8, [r12+rcx] + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + pshufd xmm1, xmm6, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm6, 17 + pclmulqdq xmm0, xmm6, 0 + pxor xmm1, xmm6 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm6 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm6, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm6, xmm2 + por xmm7, xmm0 + por xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_decrypt_calc_aad_done +L_AES_GCM_decrypt_calc_aad_lt16: + sub rsp, 16 + pxor xmm8, xmm8 + xor ebx, ebx + movdqu [rsp], xmm8 +L_AES_GCM_decrypt_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_calc_aad_loop + movdqu xmm8, [rsp] + add rsp, 16 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + pshufd xmm1, xmm6, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm6, 17 + pclmulqdq xmm0, xmm6, 0 + pxor xmm1, xmm6 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm6 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm6, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + 
psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm6, xmm2 + por xmm7, xmm0 + por xmm6, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm6, xmm2 +L_AES_GCM_decrypt_calc_aad_done: + ; Calculate counter and H + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm9, xmm5 + paddd xmm4, OWORD PTR L_aes_gcm_one + movdqa xmm8, xmm5 + movdqu [rsp+128], xmm4 + psrlq xmm9, 63 + psllq xmm8, 1 + pslldq xmm9, 8 + por xmm8, xmm9 + pshufd xmm5, xmm5, 255 + psrad xmm5, 31 + pand xmm5, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm5, xmm8 + xor ebx, ebx + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_decrypt_done_128 + and r13d, 4294967168 + movdqa xmm2, xmm6 + ; H ^ 1 + movdqu [rsp], xmm5 + ; H ^ 2 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm5, 78 + movdqa xmm11, xmm5 + movdqa xmm8, xmm5 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm5 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm0, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm0, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm0, xmm14 + movdqu [rsp+16], xmm0 + ; H ^ 3 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm1, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm1, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm1, xmm14 + movdqu [rsp+32], xmm1 + ; H ^ 4 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm3, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm3, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm3, xmm14 + movdqu [rsp+48], xmm3 + ; H ^ 
5 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+64], xmm7 + ; H ^ 6 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+80], xmm7 + ; H ^ 7 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+96], xmm7 + ; H ^ 8 + pshufd xmm9, xmm3, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm3, 17 + pclmulqdq xmm8, xmm3, 0 + pxor xmm9, xmm3 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+112], xmm7 +L_AES_GCM_decrypt_ghash_128: + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + movdqu xmm8, [rsp+128] + movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm0, xmm8 + pshufb xmm8, xmm1 + movdqa 
xmm9, xmm0 + paddd xmm9, OWORD PTR L_aes_gcm_one + pshufb xmm9, xmm1 + movdqa xmm10, xmm0 + paddd xmm10, OWORD PTR L_aes_gcm_two + pshufb xmm10, xmm1 + movdqa xmm11, xmm0 + paddd xmm11, OWORD PTR L_aes_gcm_three + pshufb xmm11, xmm1 + movdqa xmm12, xmm0 + paddd xmm12, OWORD PTR L_aes_gcm_four + pshufb xmm12, xmm1 + movdqa xmm13, xmm0 + paddd xmm13, OWORD PTR L_aes_gcm_five + pshufb xmm13, xmm1 + movdqa xmm14, xmm0 + paddd xmm14, OWORD PTR L_aes_gcm_six + pshufb xmm14, xmm1 + movdqa xmm15, xmm0 + paddd xmm15, OWORD PTR L_aes_gcm_seven + pshufb xmm15, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_eight + movdqa xmm7, OWORD PTR [r15] + movdqu [rsp+128], xmm0 + pxor xmm8, xmm7 + pxor xmm9, xmm7 + pxor xmm10, xmm7 + pxor xmm11, xmm7 + pxor xmm12, xmm7 + pxor xmm13, xmm7 + pxor xmm14, xmm7 + pxor xmm15, xmm7 + movdqu xmm7, [rsp+112] + movdqu xmm0, [rcx] + aesenc xmm8, [r15+16] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm2 + pshufd xmm1, xmm7, 78 + pshufd xmm5, xmm0, 78 + pxor xmm1, xmm7 + pxor xmm5, xmm0 + movdqa xmm3, xmm0 + pclmulqdq xmm3, xmm7, 17 + aesenc xmm9, [r15+16] + aesenc xmm10, [r15+16] + movdqa xmm2, xmm0 + pclmulqdq xmm2, xmm7, 0 + aesenc xmm11, [r15+16] + aesenc xmm12, [r15+16] + pclmulqdq xmm1, xmm5, 0 + aesenc xmm13, [r15+16] + aesenc xmm14, [r15+16] + aesenc xmm15, [r15+16] + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqu xmm7, [rsp+96] + movdqu xmm0, [rcx+16] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+32] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+32] + aesenc xmm10, [r15+32] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+32] + aesenc xmm12, [r15+32] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+32] + aesenc xmm14, [r15+32] + aesenc xmm15, [r15+32] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+80] + movdqu xmm0, [rcx+32] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+48] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+48] + aesenc xmm10, [r15+48] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+48] + aesenc xmm12, [r15+48] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+48] + aesenc xmm14, [r15+48] + aesenc xmm15, [r15+48] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+64] + movdqu xmm0, [rcx+48] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+64] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+64] + aesenc xmm10, [r15+64] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+64] + aesenc xmm12, [r15+64] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+64] + aesenc xmm14, [r15+64] + aesenc xmm15, [r15+64] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+48] + movdqu xmm0, [rcx+64] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+80] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+80] + aesenc xmm10, [r15+80] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+80] + aesenc xmm12, [r15+80] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+80] + aesenc xmm14, [r15+80] + aesenc xmm15, [r15+80] + pxor xmm1, xmm7 + pxor xmm2, 
xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+32] + movdqu xmm0, [rcx+80] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+96] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+96] + aesenc xmm10, [r15+96] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+96] + aesenc xmm12, [r15+96] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+96] + aesenc xmm14, [r15+96] + aesenc xmm15, [r15+96] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+16] + movdqu xmm0, [rcx+96] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+112] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+112] + aesenc xmm10, [r15+112] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+112] + aesenc xmm12, [r15+112] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+112] + aesenc xmm14, [r15+112] + aesenc xmm15, [r15+112] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp] + movdqu xmm0, [rcx+112] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [r15+128] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [r15+128] + aesenc xmm10, [r15+128] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [r15+128] + aesenc xmm12, [r15+128] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [r15+128] + aesenc xmm14, [r15+128] + aesenc xmm15, [r15+128] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqa xmm5, xmm1 + psrldq xmm1, 8 + pslldq xmm5, 8 + aesenc xmm8, [r15+144] + pxor xmm2, xmm5 + pxor xmm3, xmm1 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + movdqa xmm5, xmm2 + aesenc xmm9, [r15+144] + pslld xmm7, 31 + pslld xmm4, 30 + pslld xmm5, 25 + aesenc xmm10, [r15+144] + pxor xmm7, xmm4 + pxor xmm7, xmm5 + aesenc xmm11, [r15+144] + movdqa xmm4, xmm7 + pslldq xmm7, 12 + psrldq xmm4, 4 + aesenc xmm12, [r15+144] + pxor xmm2, xmm7 + movdqa xmm5, xmm2 + movdqa xmm1, xmm2 + movdqa xmm0, xmm2 + aesenc xmm13, [r15+144] + psrld xmm5, 1 + psrld xmm1, 2 + psrld xmm0, 7 + aesenc xmm14, [r15+144] + pxor xmm5, xmm1 + pxor xmm5, xmm0 + aesenc xmm15, [r15+144] + pxor xmm5, xmm4 + pxor xmm2, xmm5 + pxor xmm2, xmm3 + cmp r10d, 11 + movdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+176] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r10d, 13 + movdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+208] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_decrypt_aesenc_128_ghash_avx_done: + aesenclast xmm8, xmm7 + aesenclast xmm9, xmm7 + movdqu 
xmm0, [rcx] + movdqu xmm1, [rcx+16] + pxor xmm8, xmm0 + pxor xmm9, xmm1 + movdqu [rdx], xmm8 + movdqu [rdx+16], xmm9 + aesenclast xmm10, xmm7 + aesenclast xmm11, xmm7 + movdqu xmm0, [rcx+32] + movdqu xmm1, [rcx+48] + pxor xmm10, xmm0 + pxor xmm11, xmm1 + movdqu [rdx+32], xmm10 + movdqu [rdx+48], xmm11 + aesenclast xmm12, xmm7 + aesenclast xmm13, xmm7 + movdqu xmm0, [rcx+64] + movdqu xmm1, [rcx+80] + pxor xmm12, xmm0 + pxor xmm13, xmm1 + movdqu [rdx+64], xmm12 + movdqu [rdx+80], xmm13 + aesenclast xmm14, xmm7 + aesenclast xmm15, xmm7 + movdqu xmm0, [rcx+96] + movdqu xmm1, [rcx+112] + pxor xmm14, xmm0 + pxor xmm15, xmm1 + movdqu [rdx+96], xmm14 + movdqu [rdx+112], xmm15 + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_decrypt_ghash_128 + movdqa xmm6, xmm2 + movdqu xmm5, [rsp] +L_AES_GCM_decrypt_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_decrypt_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_decrypt_last_block_done +L_AES_GCM_decrypt_last_block_start: + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + movdqu xmm1, [rcx] + movdqa xmm0, xmm5 + pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm1, xmm6 + movdqu xmm8, [rsp+128] + movdqa xmm9, xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm9, OWORD PTR L_aes_gcm_one + pxor xmm8, [r15] + movdqu [rsp+128], xmm9 + movdqa xmm10, xmm1 + pclmulqdq xmm10, xmm0, 16 + aesenc xmm8, [r15+16] + aesenc xmm8, [r15+32] + movdqa xmm11, xmm1 + pclmulqdq xmm11, xmm0, 1 + aesenc xmm8, [r15+48] + aesenc xmm8, [r15+64] + movdqa xmm12, xmm1 + pclmulqdq xmm12, xmm0, 0 + aesenc xmm8, [r15+80] + movdqa xmm1, xmm1 + pclmulqdq xmm1, xmm0, 17 + aesenc xmm8, [r15+96] + pxor xmm10, xmm11 + movdqa xmm2, xmm10 + psrldq xmm10, 8 + pslldq xmm2, 8 + aesenc xmm8, [r15+112] + movdqa xmm3, xmm1 + pxor xmm2, xmm12 + pxor xmm3, xmm10 + movdqa xmm0, OWORD PTR L_aes_gcm_mod2_128 + movdqa xmm11, xmm2 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [r15+128] + pshufd xmm10, xmm2, 78 + pxor xmm10, xmm11 + movdqa xmm11, xmm10 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [r15+144] + pshufd xmm6, xmm10, 78 + pxor xmm6, xmm11 + pxor xmm6, xmm3 + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_aesenc_gfmul_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_aesenc_gfmul_last + aesenc xmm8, xmm9 + aesenc xmm8, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_aesenc_gfmul_last: + aesenclast xmm8, xmm9 + movdqu xmm9, [rcx] + pxor xmm8, xmm9 + movdqu [rdx], xmm8 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_decrypt_last_block_start +L_AES_GCM_decrypt_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_aesenc_last15_dec_avx_done + movdqu xmm4, [rsp+128] + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_epi64 + pxor xmm4, [r15] + aesenc xmm4, [r15+16] + aesenc xmm4, [r15+32] + aesenc xmm4, [r15+48] + aesenc xmm4, [r15+64] + aesenc xmm4, [r15+80] + aesenc xmm4, [r15+96] + aesenc xmm4, [r15+112] + aesenc xmm4, [r15+128] + aesenc xmm4, [r15+144] + cmp r10d, 11 + movdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last + aesenc xmm4, xmm9 + aesenc xmm4, [r15+176] + cmp r10d, 13 + movdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last + aesenc xmm4, xmm9 + aesenc xmm4, [r15+208] + movdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last: + aesenclast xmm4, xmm9 + sub rsp, 32 + xor ecx, ecx + movdqu [rsp], 
xmm4 + pxor xmm0, xmm0 + movdqu [rsp+16], xmm0 +L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + mov BYTE PTR [rsp+rcx+16], r13b + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop + movdqu xmm4, [rsp+16] + add rsp, 32 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm4 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm6, 78 + movdqa xmm11, xmm6 + movdqa xmm8, xmm6 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm6 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm6, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm6, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm6, xmm14 +L_AES_GCM_decrypt_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_done_dec: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + pinsrq xmm0, rdx, 0 + pinsrq xmm0, rcx, 1 + pxor xmm6, xmm0 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm6, 78 + movdqa xmm11, xmm6 + movdqa xmm8, xmm6 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm6 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm6, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm6, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm6, xmm14 + pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm0, [rsp+144] + pxor xmm0, xmm6 + cmp r14d, 16 + je L_AES_GCM_decrypt_cmp_tag_16 + sub rsp, 16 + xor rcx, rcx + xor rbx, rbx + movdqu [rsp], xmm0 +L_AES_GCM_decrypt_cmp_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + xor r13b, BYTE PTR [r8+rcx] + or bl, r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_decrypt_cmp_tag_loop + cmp rbx, 0 + sete bl + add rsp, 16 + xor rcx, rcx + jmp L_AES_GCM_decrypt_cmp_tag_done +L_AES_GCM_decrypt_cmp_tag_16: + movdqu xmm1, [r8] + pcmpeqb xmm0, xmm1 + pmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor ebx, ebx + cmp edx, 65535 + sete bl +L_AES_GCM_decrypt_cmp_tag_done: + mov DWORD PTR [rbp], ebx + add rsp, 168 + pop rbp + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_decrypt ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_init_aesni PROC + push rdi + push rsi + push r12 + push r13 + push r14 + mov rdi, rcx + mov rsi, rdx + mov r10, r8 + mov r11d, r9d + mov rax, QWORD PTR [rsp+80] + mov r8, QWORD PTR [rsp+88] + mov r9, QWORD PTR [rsp+96] + sub rsp, 16 + pxor xmm4, xmm4 + mov edx, r11d + cmp edx, 12 + jne L_AES_GCM_init_aesni_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + pinsrq xmm4, QWORD PTR [r10], 0 + pinsrd xmm4, 
DWORD PTR [r10+8], 2 + pinsrd xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + movdqa xmm1, xmm4 + movdqa xmm5, OWORD PTR [rdi] + pxor xmm1, xmm5 + movdqa xmm7, OWORD PTR [rdi+16] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+32] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+48] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+64] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+80] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+96] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+112] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+128] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+144] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + cmp esi, 11 + movdqa xmm7, OWORD PTR [rdi+160] + jl L_AES_GCM_init_aesni_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+176] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + cmp esi, 13 + movdqa xmm7, OWORD PTR [rdi+192] + jl L_AES_GCM_init_aesni_calc_iv_12_last + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+208] + aesenc xmm5, xmm7 + aesenc xmm1, xmm7 + movdqa xmm7, OWORD PTR [rdi+224] +L_AES_GCM_init_aesni_calc_iv_12_last: + aesenclast xmm5, xmm7 + aesenclast xmm1, xmm7 + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm15, xmm1 + jmp L_AES_GCM_init_aesni_iv_done +L_AES_GCM_init_aesni_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + movdqa xmm5, OWORD PTR [rdi] + aesenc xmm5, [rdi+16] + aesenc xmm5, [rdi+32] + aesenc xmm5, [rdi+48] + aesenc xmm5, [rdi+64] + aesenc xmm5, [rdi+80] + aesenc xmm5, [rdi+96] + aesenc xmm5, [rdi+112] + aesenc xmm5, [rdi+128] + aesenc xmm5, [rdi+144] + cmp esi, 11 + movdqa xmm9, OWORD PTR [rdi+160] + jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm9 + aesenc xmm5, [rdi+176] + cmp esi, 13 + movdqa xmm9, OWORD PTR [rdi+192] + jl L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last + aesenc xmm5, xmm9 + aesenc xmm5, [rdi+208] + movdqa xmm9, OWORD PTR [rdi+224] +L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last: + aesenclast xmm5, xmm9 + pshufb xmm5, OWORD PTR L_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_init_aesni_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_init_aesni_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_init_aesni_calc_iv_16_loop: + movdqu xmm8, [r10+rcx] + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm8 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + 
pxor xmm2, xmm7 + pxor xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_init_aesni_calc_iv_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_init_aesni_calc_iv_done +L_AES_GCM_init_aesni_calc_iv_lt16: + sub rsp, 16 + pxor xmm8, xmm8 + xor r13d, r13d + movdqu [rsp], xmm8 +L_AES_GCM_init_aesni_calc_iv_loop: + movzx r12d, BYTE PTR [r10+rcx] + mov BYTE PTR [rsp+r13], r12b + inc ecx + inc r13d + cmp ecx, edx + jl L_AES_GCM_init_aesni_calc_iv_loop + movdqu xmm8, [rsp] + add rsp, 16 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm8 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 +L_AES_GCM_init_aesni_calc_iv_done: + ; T = Encrypt counter + pxor xmm0, xmm0 + shl edx, 3 + pinsrq xmm0, rdx, 0 + pxor xmm4, xmm0 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm7, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm7, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por xmm7, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm7 + movdqa xmm1, xmm7 + movdqa xmm2, xmm7 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm7, xmm0 + movdqa xmm2, xmm7 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm7 + pxor xmm4, xmm2 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + ; Encrypt counter + movdqa xmm8, OWORD PTR [rdi] + pxor xmm8, xmm4 + aesenc xmm8, [rdi+16] + aesenc xmm8, [rdi+32] + aesenc xmm8, [rdi+48] + aesenc xmm8, [rdi+64] + aesenc xmm8, [rdi+80] + aesenc xmm8, [rdi+96] + aesenc xmm8, [rdi+112] + aesenc xmm8, [rdi+128] + aesenc xmm8, [rdi+144] + cmp esi, 11 + movdqa xmm9, OWORD PTR [rdi+160] + jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [rdi+176] + cmp esi, 13 + movdqa xmm9, OWORD PTR [rdi+192] + jl L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [rdi+208] + movdqa xmm9, OWORD PTR [rdi+224] +L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last: + aesenclast xmm8, xmm9 + movdqu xmm15, xmm8 +L_AES_GCM_init_aesni_iv_done: + movdqa OWORD PTR [r9], xmm15 + pshufb xmm4, 
OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm4, OWORD PTR L_aes_gcm_one + movdqa OWORD PTR [rax], xmm5 + movdqa OWORD PTR [r8], xmm4 + add rsp, 16 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + ret +AES_GCM_init_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_aad_update_aesni PROC + mov rax, rcx + movdqa xmm5, OWORD PTR [r8] + movdqa xmm6, OWORD PTR [r9] + xor ecx, ecx +L_AES_GCM_aad_update_aesni_16_loop: + movdqu xmm8, [rax+rcx] + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm5, xmm8 + pshufd xmm1, xmm5, 78 + pshufd xmm2, xmm6, 78 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 + pclmulqdq xmm3, xmm5, 17 + pclmulqdq xmm0, xmm5, 0 + pxor xmm1, xmm5 + pxor xmm2, xmm6 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm4, xmm0 + movdqa xmm5, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm5 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm4, 1 + pslld xmm5, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm5, xmm2 + por xmm4, xmm0 + por xmm5, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm4 + movdqa xmm2, xmm4 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm4, xmm0 + movdqa xmm2, xmm4 + movdqa xmm3, xmm4 + movdqa xmm0, xmm4 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm4 + pxor xmm5, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_aad_update_aesni_16_loop + movdqa OWORD PTR [r8], xmm5 + ret +AES_GCM_aad_update_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_block_aesni PROC + mov r10, r8 + mov r11, r9 + mov rax, QWORD PTR [rsp+40] + movdqu xmm8, [rax] + movdqa xmm9, xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm9, OWORD PTR L_aes_gcm_one + pxor xmm8, [rcx] + movdqu [rax], xmm9 + aesenc xmm8, [rcx+16] + aesenc xmm8, [rcx+32] + aesenc xmm8, [rcx+48] + aesenc xmm8, [rcx+64] + aesenc xmm8, [rcx+80] + aesenc xmm8, [rcx+96] + aesenc xmm8, [rcx+112] + aesenc xmm8, [rcx+128] + aesenc xmm8, [rcx+144] + cmp edx, 11 + movdqa xmm9, OWORD PTR [rcx+160] + jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [rcx+176] + cmp edx, 13 + movdqa xmm9, OWORD PTR [rcx+192] + jl L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [rcx+208] + movdqa xmm9, OWORD PTR [rcx+224] +L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last: + aesenclast xmm8, xmm9 + movdqu xmm9, [r11] + pxor xmm8, xmm9 + movdqu [r10], xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + ret +AES_GCM_encrypt_block_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_ghash_block_aesni PROC + movdqa xmm4, OWORD PTR [rdx] + movdqa xmm5, OWORD PTR [r8] + movdqu xmm8, [rcx] + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm4, xmm8 + pshufd xmm1, xmm4, 78 + pshufd xmm2, xmm5, 78 + movdqa xmm3, xmm5 + movdqa xmm0, xmm5 + pclmulqdq xmm3, xmm4, 17 + pclmulqdq xmm0, xmm4, 0 + pxor xmm1, xmm4 + pxor xmm2, xmm5 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm6, xmm0 + movdqa xmm4, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm6, xmm2 + pxor xmm4, xmm1 + movdqa xmm0, xmm6 + movdqa xmm1, xmm4 + psrld xmm0, 31 + psrld xmm1, 31 + pslld xmm6, 1 + pslld xmm4, 1 + movdqa xmm2, xmm0 + pslldq xmm0, 4 + psrldq xmm2, 12 + pslldq xmm1, 4 + por xmm4, xmm2 + por 
xmm6, xmm0 + por xmm4, xmm1 + movdqa xmm0, xmm6 + movdqa xmm1, xmm6 + movdqa xmm2, xmm6 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm6, xmm0 + movdqa xmm2, xmm6 + movdqa xmm3, xmm6 + movdqa xmm0, xmm6 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm6 + pxor xmm4, xmm2 + movdqa OWORD PTR [rdx], xmm4 + ret +AES_GCM_ghash_block_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_update_aesni PROC + push r13 + push r12 + push r14 + push r15 + push rdi + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov r9d, DWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r14, QWORD PTR [rsp+96] + mov r15, QWORD PTR [rsp+104] + sub rsp, 160 + movdqa xmm6, OWORD PTR [r12] + movdqa xmm5, OWORD PTR [r14] + movdqa xmm9, xmm5 + movdqa xmm8, xmm5 + psrlq xmm9, 63 + psllq xmm8, 1 + pslldq xmm9, 8 + por xmm8, xmm9 + pshufd xmm5, xmm5, 255 + psrad xmm5, 31 + pand xmm5, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm5, xmm8 + xor rdi, rdi + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_encrypt_update_aesni_done_128 + and r13d, 4294967168 + movdqa xmm2, xmm6 + ; H ^ 1 + movdqu [rsp], xmm5 + ; H ^ 2 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm5, 78 + movdqa xmm11, xmm5 + movdqa xmm8, xmm5 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm5 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm0, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm0, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm0, xmm14 + movdqu [rsp+16], xmm0 + ; H ^ 3 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm1, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm1, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm1, xmm14 + movdqu [rsp+32], xmm1 + ; H ^ 4 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm3, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm3, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 
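
The `; H ^ 2` through `; H ^ 8` blocks here (and the matching ones in the decrypt paths) precompute powers of the GHASH key H, stored at `[rsp]` through `[rsp+112]`, so the main loops can fold eight blocks per iteration. Each block is the same carry-less Karatsuba multiply (`pclmulqdq` with immediates 17 and 0 plus the `pshufd`/`pxor` middle term) followed by reduction modulo the GHASH polynomial; the `pslld 31/30/25` and `psrld 1/2/7` shift-and-xor runs are that reduction. As a rough reference for what one such multiply computes, here is a minimal portable C sketch of the GF(2^128) multiplication from NIST SP 800-38D and of an eight-entry H-power table. `gf128_mul` and `ghash_build_h_table` are illustrative names only, not wolfSSL API, and this bit-at-a-time form is far slower than the PCLMULQDQ code it mirrors.

```c
#include <stdint.h>
#include <string.h>

/* Z = X * H in GF(2^128) with GHASH's bit ordering (NIST SP 800-38D).
 * This bit-at-a-time shift-and-xor form is the slow equivalent of one
 * pclmulqdq Karatsuba multiply plus reduction in the assembly. */
void gf128_mul(uint8_t x[16], const uint8_t h[16])
{
    uint8_t z[16] = {0};
    uint8_t v[16];
    int i, j;

    memcpy(v, h, 16);
    for (i = 0; i < 128; i++) {
        if ((x[i / 8] >> (7 - (i & 7))) & 1) {   /* bit i of X set? */
            for (j = 0; j < 16; j++)
                z[j] ^= v[j];
        }
        /* V = V * x modulo x^128 + x^7 + x^2 + x + 1 (0xE1 reflected). */
        {
            int carry = v[15] & 1;
            for (j = 15; j > 0; j--)
                v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
            v[0] >>= 1;
            if (carry)
                v[0] ^= 0xE1;
        }
    }
    memcpy(x, z, 16);
}

/* H^1..H^8, matching the table the assembly keeps at [rsp]..[rsp+112]. */
void ghash_build_h_table(uint8_t table[8][16], const uint8_t h[16])
{
    int i;

    memcpy(table[0], h, 16);                 /* H^1 */
    for (i = 1; i < 8; i++) {
        memcpy(table[i], table[i - 1], 16);
        gf128_mul(table[i], h);              /* H^(i+1) = H^i * H */
    }
}
```

The assembly builds the same table from squarings and products (H^2 = H·H, H^4 = H^2·H^2, H^5 = H^2·H^3, and so on) rather than by repeated multiplication by H, but the resulting table entries are identical.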
+ movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm3, xmm14 + movdqu [rsp+48], xmm3 + ; H ^ 5 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+64], xmm7 + ; H ^ 6 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+80], xmm7 + ; H ^ 7 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+96], xmm7 + ; H ^ 8 + pshufd xmm9, xmm3, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm3, 17 + pclmulqdq xmm8, xmm3, 0 + pxor xmm9, xmm3 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, 
xmm14 + movdqu [rsp+112], xmm7 + ; First 128 bytes of input + movdqu xmm8, [r15] + movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm0, xmm8 + pshufb xmm8, xmm1 + movdqa xmm9, xmm0 + paddd xmm9, OWORD PTR L_aes_gcm_one + pshufb xmm9, xmm1 + movdqa xmm10, xmm0 + paddd xmm10, OWORD PTR L_aes_gcm_two + pshufb xmm10, xmm1 + movdqa xmm11, xmm0 + paddd xmm11, OWORD PTR L_aes_gcm_three + pshufb xmm11, xmm1 + movdqa xmm12, xmm0 + paddd xmm12, OWORD PTR L_aes_gcm_four + pshufb xmm12, xmm1 + movdqa xmm13, xmm0 + paddd xmm13, OWORD PTR L_aes_gcm_five + pshufb xmm13, xmm1 + movdqa xmm14, xmm0 + paddd xmm14, OWORD PTR L_aes_gcm_six + pshufb xmm14, xmm1 + movdqa xmm15, xmm0 + paddd xmm15, OWORD PTR L_aes_gcm_seven + pshufb xmm15, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_eight + movdqa xmm7, OWORD PTR [rax] + movdqu [r15], xmm0 + pxor xmm8, xmm7 + pxor xmm9, xmm7 + pxor xmm10, xmm7 + pxor xmm11, xmm7 + pxor xmm12, xmm7 + pxor xmm13, xmm7 + pxor xmm14, xmm7 + pxor xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+16] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+32] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+48] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+64] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+80] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+96] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+112] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+128] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+144] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r8d, 11 + movdqa xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_aesni_enc_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+176] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r8d, 13 + movdqa xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_aesni_enc_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+208] + aesenc xmm8, 
xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_aesni_enc_done: + aesenclast xmm8, xmm7 + aesenclast xmm9, xmm7 + movdqu xmm0, [r11] + movdqu xmm1, [r11+16] + pxor xmm8, xmm0 + pxor xmm9, xmm1 + movdqu [r10], xmm8 + movdqu [r10+16], xmm9 + aesenclast xmm10, xmm7 + aesenclast xmm11, xmm7 + movdqu xmm0, [r11+32] + movdqu xmm1, [r11+48] + pxor xmm10, xmm0 + pxor xmm11, xmm1 + movdqu [r10+32], xmm10 + movdqu [r10+48], xmm11 + aesenclast xmm12, xmm7 + aesenclast xmm13, xmm7 + movdqu xmm0, [r11+64] + movdqu xmm1, [r11+80] + pxor xmm12, xmm0 + pxor xmm13, xmm1 + movdqu [r10+64], xmm12 + movdqu [r10+80], xmm13 + aesenclast xmm14, xmm7 + aesenclast xmm15, xmm7 + movdqu xmm0, [r11+96] + movdqu xmm1, [r11+112] + pxor xmm14, xmm0 + pxor xmm15, xmm1 + movdqu [r10+96], xmm14 + movdqu [r10+112], xmm15 + cmp r13d, 128 + mov edi, 128 + jle L_AES_GCM_encrypt_update_aesni_end_128 + ; More 128 bytes of input +L_AES_GCM_encrypt_update_aesni_ghash_128: + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + movdqu xmm8, [r15] + movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm0, xmm8 + pshufb xmm8, xmm1 + movdqa xmm9, xmm0 + paddd xmm9, OWORD PTR L_aes_gcm_one + pshufb xmm9, xmm1 + movdqa xmm10, xmm0 + paddd xmm10, OWORD PTR L_aes_gcm_two + pshufb xmm10, xmm1 + movdqa xmm11, xmm0 + paddd xmm11, OWORD PTR L_aes_gcm_three + pshufb xmm11, xmm1 + movdqa xmm12, xmm0 + paddd xmm12, OWORD PTR L_aes_gcm_four + pshufb xmm12, xmm1 + movdqa xmm13, xmm0 + paddd xmm13, OWORD PTR L_aes_gcm_five + pshufb xmm13, xmm1 + movdqa xmm14, xmm0 + paddd xmm14, OWORD PTR L_aes_gcm_six + pshufb xmm14, xmm1 + movdqa xmm15, xmm0 + paddd xmm15, OWORD PTR L_aes_gcm_seven + pshufb xmm15, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_eight + movdqa xmm7, OWORD PTR [rax] + movdqu [r15], xmm0 + pxor xmm8, xmm7 + pxor xmm9, xmm7 + pxor xmm10, xmm7 + pxor xmm11, xmm7 + pxor xmm12, xmm7 + pxor xmm13, xmm7 + pxor xmm14, xmm7 + pxor xmm15, xmm7 + movdqu xmm7, [rsp+112] + movdqu xmm0, [rdx+-128] + aesenc xmm8, [rax+16] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm2 + pshufd xmm1, xmm7, 78 + pshufd xmm5, xmm0, 78 + pxor xmm1, xmm7 + pxor xmm5, xmm0 + movdqa xmm3, xmm0 + pclmulqdq xmm3, xmm7, 17 + aesenc xmm9, [rax+16] + aesenc xmm10, [rax+16] + movdqa xmm2, xmm0 + pclmulqdq xmm2, xmm7, 0 + aesenc xmm11, [rax+16] + aesenc xmm12, [rax+16] + pclmulqdq xmm1, xmm5, 0 + aesenc xmm13, [rax+16] + aesenc xmm14, [rax+16] + aesenc xmm15, [rax+16] + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqu xmm7, [rsp+96] + movdqu xmm0, [rdx+-112] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+32] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+32] + aesenc xmm10, [rax+32] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+32] + aesenc xmm12, [rax+32] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+32] + aesenc xmm14, [rax+32] + aesenc xmm15, [rax+32] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+80] + movdqu xmm0, [rdx+-96] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+48] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+48] + aesenc xmm10, [rax+48] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, 
[rax+48] + aesenc xmm12, [rax+48] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+48] + aesenc xmm14, [rax+48] + aesenc xmm15, [rax+48] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+64] + movdqu xmm0, [rdx+-80] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+64] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+64] + aesenc xmm10, [rax+64] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+64] + aesenc xmm12, [rax+64] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+64] + aesenc xmm14, [rax+64] + aesenc xmm15, [rax+64] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+48] + movdqu xmm0, [rdx+-64] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+80] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+80] + aesenc xmm10, [rax+80] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+80] + aesenc xmm12, [rax+80] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+80] + aesenc xmm14, [rax+80] + aesenc xmm15, [rax+80] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+32] + movdqu xmm0, [rdx+-48] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+96] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+96] + aesenc xmm10, [rax+96] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+96] + aesenc xmm12, [rax+96] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+96] + aesenc xmm14, [rax+96] + aesenc xmm15, [rax+96] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+16] + movdqu xmm0, [rdx+-32] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+112] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+112] + aesenc xmm10, [rax+112] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+112] + aesenc xmm12, [rax+112] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+112] + aesenc xmm14, [rax+112] + aesenc xmm15, [rax+112] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp] + movdqu xmm0, [rdx+-16] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+128] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+128] + aesenc xmm10, [rax+128] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+128] + aesenc xmm12, [rax+128] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+128] + aesenc xmm14, [rax+128] + aesenc xmm15, [rax+128] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqa xmm5, xmm1 + psrldq xmm1, 8 + pslldq xmm5, 8 + aesenc xmm8, [rax+144] + pxor xmm2, xmm5 + pxor xmm3, xmm1 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + movdqa xmm5, xmm2 + aesenc xmm9, [rax+144] + pslld xmm7, 31 + pslld xmm4, 30 + pslld xmm5, 25 + aesenc xmm10, [rax+144] + pxor xmm7, xmm4 + pxor xmm7, xmm5 + aesenc xmm11, [rax+144] + movdqa xmm4, xmm7 + pslldq xmm7, 12 + psrldq xmm4, 4 + aesenc xmm12, [rax+144] + pxor xmm2, xmm7 + movdqa xmm5, xmm2 + movdqa xmm1, xmm2 + movdqa 
xmm0, xmm2 + aesenc xmm13, [rax+144] + psrld xmm5, 1 + psrld xmm1, 2 + psrld xmm0, 7 + aesenc xmm14, [rax+144] + pxor xmm5, xmm1 + pxor xmm5, xmm0 + aesenc xmm15, [rax+144] + pxor xmm5, xmm4 + pxor xmm2, xmm5 + pxor xmm2, xmm3 + cmp r8d, 11 + movdqa xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+176] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r8d, 13 + movdqa xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+208] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done: + aesenclast xmm8, xmm7 + aesenclast xmm9, xmm7 + movdqu xmm0, [rcx] + movdqu xmm1, [rcx+16] + pxor xmm8, xmm0 + pxor xmm9, xmm1 + movdqu [rdx], xmm8 + movdqu [rdx+16], xmm9 + aesenclast xmm10, xmm7 + aesenclast xmm11, xmm7 + movdqu xmm0, [rcx+32] + movdqu xmm1, [rcx+48] + pxor xmm10, xmm0 + pxor xmm11, xmm1 + movdqu [rdx+32], xmm10 + movdqu [rdx+48], xmm11 + aesenclast xmm12, xmm7 + aesenclast xmm13, xmm7 + movdqu xmm0, [rcx+64] + movdqu xmm1, [rcx+80] + pxor xmm12, xmm0 + pxor xmm13, xmm1 + movdqu [rdx+64], xmm12 + movdqu [rdx+80], xmm13 + aesenclast xmm14, xmm7 + aesenclast xmm15, xmm7 + movdqu xmm0, [rcx+96] + movdqu xmm1, [rcx+112] + pxor xmm14, xmm0 + pxor xmm15, xmm1 + movdqu [rdx+96], xmm14 + movdqu [rdx+112], xmm15 + add edi, 128 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_aesni_ghash_128 +L_AES_GCM_encrypt_update_aesni_end_128: + movdqa xmm4, OWORD PTR L_aes_gcm_bswap_mask + pshufb xmm8, xmm4 + pshufb xmm9, xmm4 + pshufb xmm10, xmm4 + pshufb xmm11, xmm4 + pxor xmm8, xmm2 + pshufb xmm12, xmm4 + pshufb xmm13, xmm4 + pshufb xmm14, xmm4 + pshufb xmm15, xmm4 + movdqu xmm7, [rsp+112] + pshufd xmm1, xmm8, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm8, 17 + pclmulqdq xmm0, xmm8, 0 + pxor xmm1, xmm8 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm4, xmm0 + movdqa xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+96] + pshufd xmm1, xmm9, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm9, 17 + pclmulqdq xmm0, xmm9, 0 + pxor xmm1, xmm9 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+80] + pshufd xmm1, xmm10, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm10, 17 + pclmulqdq xmm0, xmm10, 0 + pxor xmm1, xmm10 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, 
[rsp+64] + pshufd xmm1, xmm11, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm11, 17 + pclmulqdq xmm0, xmm11, 0 + pxor xmm1, xmm11 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+48] + pshufd xmm1, xmm12, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm12, 17 + pclmulqdq xmm0, xmm12, 0 + pxor xmm1, xmm12 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+32] + pshufd xmm1, xmm13, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm13, 17 + pclmulqdq xmm0, xmm13, 0 + pxor xmm1, xmm13 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp+16] + pshufd xmm1, xmm14, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm14, 17 + pclmulqdq xmm0, xmm14, 0 + pxor xmm1, xmm14 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqu xmm7, [rsp] + pshufd xmm1, xmm15, 78 + pshufd xmm2, xmm7, 78 + movdqa xmm3, xmm7 + movdqa xmm0, xmm7 + pclmulqdq xmm3, xmm15, 17 + pclmulqdq xmm0, xmm15, 0 + pxor xmm1, xmm15 + pxor xmm2, xmm7 + pclmulqdq xmm1, xmm2, 0 + pxor xmm1, xmm0 + pxor xmm1, xmm3 + movdqa xmm2, xmm1 + pxor xmm4, xmm0 + pxor xmm6, xmm3 + pslldq xmm2, 8 + psrldq xmm1, 8 + pxor xmm4, xmm2 + pxor xmm6, xmm1 + movdqa xmm0, xmm4 + movdqa xmm1, xmm4 + movdqa xmm2, xmm4 + pslld xmm0, 31 + pslld xmm1, 30 + pslld xmm2, 25 + pxor xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + pslldq xmm0, 12 + pxor xmm4, xmm0 + movdqa xmm2, xmm4 + movdqa xmm3, xmm4 + movdqa xmm0, xmm4 + psrld xmm2, 1 + psrld xmm3, 2 + psrld xmm0, 7 + pxor xmm2, xmm3 + pxor xmm2, xmm0 + pxor xmm2, xmm1 + pxor xmm2, xmm4 + pxor xmm6, xmm2 + movdqu xmm5, [rsp] +L_AES_GCM_encrypt_update_aesni_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_encrypt_update_aesni_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_aesni_last_block_done + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + movdqu xmm8, [r15] + movdqa xmm9, xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm9, OWORD PTR L_aes_gcm_one + pxor xmm8, [rax] + movdqu [r15], xmm9 + aesenc xmm8, [rax+16] + aesenc xmm8, [rax+32] + aesenc xmm8, [rax+48] + aesenc xmm8, [rax+64] + aesenc xmm8, [rax+80] + aesenc xmm8, [rax+96] + aesenc xmm8, [rax+112] + aesenc xmm8, [rax+128] + aesenc xmm8, [rax+144] + cmp r8d, 11 + movdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [rax+176] + cmp r8d, 13 + movdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last + aesenc xmm8, xmm9 + aesenc xmm8, [rax+208] + movdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last: + aesenclast xmm8, xmm9 + movdqu xmm9, [rcx] + pxor xmm8, xmm9 + movdqu [rdx], 
xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + add edi, 16 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_aesni_last_block_ghash +L_AES_GCM_encrypt_update_aesni_last_block_start: + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + movdqu xmm8, [r15] + movdqa xmm9, xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm9, OWORD PTR L_aes_gcm_one + pxor xmm8, [rax] + movdqu [r15], xmm9 + movdqa xmm10, xmm6 + pclmulqdq xmm10, xmm5, 16 + aesenc xmm8, [rax+16] + aesenc xmm8, [rax+32] + movdqa xmm11, xmm6 + pclmulqdq xmm11, xmm5, 1 + aesenc xmm8, [rax+48] + aesenc xmm8, [rax+64] + movdqa xmm12, xmm6 + pclmulqdq xmm12, xmm5, 0 + aesenc xmm8, [rax+80] + movdqa xmm1, xmm6 + pclmulqdq xmm1, xmm5, 17 + aesenc xmm8, [rax+96] + pxor xmm10, xmm11 + movdqa xmm2, xmm10 + psrldq xmm10, 8 + pslldq xmm2, 8 + aesenc xmm8, [rax+112] + movdqa xmm3, xmm1 + pxor xmm2, xmm12 + pxor xmm3, xmm10 + movdqa xmm0, OWORD PTR L_aes_gcm_mod2_128 + movdqa xmm11, xmm2 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [rax+128] + pshufd xmm10, xmm2, 78 + pxor xmm10, xmm11 + movdqa xmm11, xmm10 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [rax+144] + pshufd xmm6, xmm10, 78 + pxor xmm6, xmm11 + pxor xmm6, xmm3 + cmp r8d, 11 + movdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last + aesenc xmm8, xmm9 + aesenc xmm8, [rax+176] + cmp r8d, 13 + movdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last + aesenc xmm8, xmm9 + aesenc xmm8, [rax+208] + movdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last: + aesenclast xmm8, xmm9 + movdqu xmm9, [rcx] + pxor xmm8, xmm9 + movdqu [rdx], xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm6, xmm8 + add edi, 16 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_aesni_last_block_start +L_AES_GCM_encrypt_update_aesni_last_block_ghash: + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm6, 78 + movdqa xmm11, xmm6 + movdqa xmm8, xmm6 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm6 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm6, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm6, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm6, xmm14 +L_AES_GCM_encrypt_update_aesni_last_block_done: +L_AES_GCM_encrypt_update_aesni_done_enc: + movdqa OWORD PTR [r12], xmm6 + add rsp, 160 + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_encrypt_update_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_final_aesni PROC + push r13 + push r12 + push r14 + mov rax, rcx + mov r10d, r9d + mov r9, rdx + mov r11d, DWORD PTR [rsp+64] + mov r12, QWORD PTR [rsp+72] + mov r14, QWORD PTR [rsp+80] + sub rsp, 16 + movdqa xmm4, OWORD PTR [rax] + movdqa xmm5, OWORD PTR [r12] + movdqa xmm6, OWORD PTR [r14] + movdqa xmm9, xmm5 + movdqa xmm8, xmm5 + psrlq xmm9, 63 + psllq xmm8, 1 + pslldq xmm9, 8 + por xmm8, xmm9 + pshufd xmm5, xmm5, 255 + psrad xmm5, 31 + pand xmm5, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm5, xmm8 + mov edx, r10d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + pinsrq xmm0, 
rdx, 0 + pinsrq xmm0, rcx, 1 + pxor xmm4, xmm0 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm4, 78 + movdqa xmm11, xmm4 + movdqa xmm8, xmm4 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm4 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm4, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm4, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm4, xmm14 + pshufb xmm4, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm0, xmm6 + pxor xmm0, xmm4 + cmp r8d, 16 + je L_AES_GCM_encrypt_final_aesni_store_tag_16 + xor rcx, rcx + movdqu [rsp], xmm0 +L_AES_GCM_encrypt_final_aesni_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r9+rcx], r13b + inc ecx + cmp ecx, r8d + jne L_AES_GCM_encrypt_final_aesni_store_tag_loop + jmp L_AES_GCM_encrypt_final_aesni_store_tag_done +L_AES_GCM_encrypt_final_aesni_store_tag_16: + movdqu [r9], xmm0 +L_AES_GCM_encrypt_final_aesni_store_tag_done: + add rsp, 16 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_encrypt_final_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt_update_aesni PROC + push r13 + push r12 + push r14 + push r15 + push rdi + push rsi + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov r9d, DWORD PTR [rsp+88] + mov r12, QWORD PTR [rsp+96] + mov r14, QWORD PTR [rsp+104] + mov r15, QWORD PTR [rsp+112] + sub rsp, 168 + movdqa xmm6, OWORD PTR [r12] + movdqa xmm5, OWORD PTR [r14] + movdqa xmm9, xmm5 + movdqa xmm8, xmm5 + psrlq xmm9, 63 + psllq xmm8, 1 + pslldq xmm9, 8 + por xmm8, xmm9 + pshufd xmm5, xmm5, 255 + psrad xmm5, 31 + pand xmm5, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm5, xmm8 + xor edi, edi + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_decrypt_update_aesni_done_128 + and r13d, 4294967168 + movdqa xmm2, xmm6 + ; H ^ 1 + movdqu [rsp], xmm5 + ; H ^ 2 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm5, 78 + movdqa xmm11, xmm5 + movdqa xmm8, xmm5 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm5 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm0, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm0, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm0, xmm14 + movdqu [rsp+16], xmm0 + ; H ^ 3 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm1, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm1, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld 
xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm1, xmm14 + movdqu [rsp+32], xmm1 + ; H ^ 4 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm0, 78 + movdqa xmm11, xmm0 + movdqa xmm8, xmm0 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm0 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm3, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm3, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm3, xmm14 + movdqu [rsp+48], xmm3 + ; H ^ 5 + pshufd xmm9, xmm0, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm0, 17 + pclmulqdq xmm8, xmm0, 0 + pxor xmm9, xmm0 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+64], xmm7 + ; H ^ 6 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm1, 78 + movdqa xmm11, xmm1 + movdqa xmm8, xmm1 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm1 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+80], xmm7 + ; H ^ 7 + pshufd xmm9, xmm1, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm1, 17 + pclmulqdq xmm8, xmm1, 0 + pxor xmm9, xmm1 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 
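
Both the encrypt and decrypt `*_ghash_128` loops begin each iteration by expanding the saved counter into eight consecutive counter blocks (`pshufb` with `L_aes_gcm_bswap_epi64`, then `paddd` with the `L_aes_gcm_one` through `L_aes_gcm_eight` constants), running all eight through the AES round chain in parallel, and XORing the resulting keystream into 128 bytes of data. A minimal C sketch of that counter handling follows, assuming a hypothetical `aes_block_fn` callback standing in for the `aesenc`/`aesenclast` chains; it is only an illustration of the control flow, with the GHASH folding omitted.

```c
#include <stdint.h>

/* Hypothetical callback standing in for the aesenc/aesenclast round chains:
 * encrypts one 16-byte block with the expanded key schedule. */
typedef void (*aes_block_fn)(const void *key, const uint8_t in[16],
                             uint8_t out[16]);

/* One iteration of the 128-byte CTR path: form 8 counter blocks, encrypt
 * them, and XOR the keystream into the data. */
static void ctr_crypt_128(aes_block_fn encrypt, const void *key,
                          uint8_t counter[16], const uint8_t *in, uint8_t *out)
{
    int blk, i;

    for (blk = 0; blk < 8; blk++) {
        uint8_t ks[16];

        encrypt(key, counter, ks);
        for (i = 0; i < 16; i++)
            out[blk * 16 + i] = (uint8_t)(in[blk * 16 + i] ^ ks[i]);

        /* inc32: big-endian increment of the low 32 bits of the counter,
         * which is what the epi64 byte swap + paddd sequence achieves. */
        for (i = 15; i >= 12; i--) {
            if (++counter[i] != 0)
                break;
        }
    }
}
```

Keeping eight independent counter blocks in flight is what hides the latency of the AES round instructions; the GHASH multiplies are interleaved between them so the carry-less multiplier and the AES units run in parallel.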
+ psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+96], xmm7 + ; H ^ 8 + pshufd xmm9, xmm3, 78 + pshufd xmm10, xmm3, 78 + movdqa xmm11, xmm3 + movdqa xmm8, xmm3 + pclmulqdq xmm11, xmm3, 17 + pclmulqdq xmm8, xmm3, 0 + pxor xmm9, xmm3 + pxor xmm10, xmm3 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm7, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm7, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm7, xmm14 + movdqu [rsp+112], xmm7 +L_AES_GCM_decrypt_update_aesni_ghash_128: + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + movdqu xmm8, [r15] + movdqa xmm1, OWORD PTR L_aes_gcm_bswap_epi64 + movdqa xmm0, xmm8 + pshufb xmm8, xmm1 + movdqa xmm9, xmm0 + paddd xmm9, OWORD PTR L_aes_gcm_one + pshufb xmm9, xmm1 + movdqa xmm10, xmm0 + paddd xmm10, OWORD PTR L_aes_gcm_two + pshufb xmm10, xmm1 + movdqa xmm11, xmm0 + paddd xmm11, OWORD PTR L_aes_gcm_three + pshufb xmm11, xmm1 + movdqa xmm12, xmm0 + paddd xmm12, OWORD PTR L_aes_gcm_four + pshufb xmm12, xmm1 + movdqa xmm13, xmm0 + paddd xmm13, OWORD PTR L_aes_gcm_five + pshufb xmm13, xmm1 + movdqa xmm14, xmm0 + paddd xmm14, OWORD PTR L_aes_gcm_six + pshufb xmm14, xmm1 + movdqa xmm15, xmm0 + paddd xmm15, OWORD PTR L_aes_gcm_seven + pshufb xmm15, xmm1 + paddd xmm0, OWORD PTR L_aes_gcm_eight + movdqa xmm7, OWORD PTR [rax] + movdqu [r15], xmm0 + pxor xmm8, xmm7 + pxor xmm9, xmm7 + pxor xmm10, xmm7 + pxor xmm11, xmm7 + pxor xmm12, xmm7 + pxor xmm13, xmm7 + pxor xmm14, xmm7 + pxor xmm15, xmm7 + movdqu xmm7, [rsp+112] + movdqu xmm0, [rcx] + aesenc xmm8, [rax+16] + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm0, xmm2 + pshufd xmm1, xmm7, 78 + pshufd xmm5, xmm0, 78 + pxor xmm1, xmm7 + pxor xmm5, xmm0 + movdqa xmm3, xmm0 + pclmulqdq xmm3, xmm7, 17 + aesenc xmm9, [rax+16] + aesenc xmm10, [rax+16] + movdqa xmm2, xmm0 + pclmulqdq xmm2, xmm7, 0 + aesenc xmm11, [rax+16] + aesenc xmm12, [rax+16] + pclmulqdq xmm1, xmm5, 0 + aesenc xmm13, [rax+16] + aesenc xmm14, [rax+16] + aesenc xmm15, [rax+16] + pxor xmm1, xmm2 + pxor xmm1, xmm3 + movdqu xmm7, [rsp+96] + movdqu xmm0, [rcx+16] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+32] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+32] + aesenc xmm10, [rax+32] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+32] + aesenc xmm12, [rax+32] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+32] + aesenc xmm14, [rax+32] + aesenc xmm15, [rax+32] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+80] + movdqu xmm0, [rcx+32] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+48] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+48] + aesenc xmm10, [rax+48] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+48] + aesenc xmm12, [rax+48] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+48] 
+ aesenc xmm14, [rax+48] + aesenc xmm15, [rax+48] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+64] + movdqu xmm0, [rcx+48] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+64] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+64] + aesenc xmm10, [rax+64] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+64] + aesenc xmm12, [rax+64] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+64] + aesenc xmm14, [rax+64] + aesenc xmm15, [rax+64] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+48] + movdqu xmm0, [rcx+64] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+80] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+80] + aesenc xmm10, [rax+80] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+80] + aesenc xmm12, [rax+80] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+80] + aesenc xmm14, [rax+80] + aesenc xmm15, [rax+80] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+32] + movdqu xmm0, [rcx+80] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+96] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+96] + aesenc xmm10, [rax+96] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+96] + aesenc xmm12, [rax+96] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+96] + aesenc xmm14, [rax+96] + aesenc xmm15, [rax+96] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp+16] + movdqu xmm0, [rcx+96] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+112] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+112] + aesenc xmm10, [rax+112] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+112] + aesenc xmm12, [rax+112] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+112] + aesenc xmm14, [rax+112] + aesenc xmm15, [rax+112] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqu xmm7, [rsp] + movdqu xmm0, [rcx+112] + pshufd xmm4, xmm7, 78 + pshufb xmm0, OWORD PTR L_aes_gcm_bswap_mask + aesenc xmm8, [rax+128] + pxor xmm4, xmm7 + pshufd xmm5, xmm0, 78 + pxor xmm5, xmm0 + movdqa xmm6, xmm0 + pclmulqdq xmm6, xmm7, 17 + aesenc xmm9, [rax+128] + aesenc xmm10, [rax+128] + pclmulqdq xmm7, xmm0, 0 + aesenc xmm11, [rax+128] + aesenc xmm12, [rax+128] + pclmulqdq xmm4, xmm5, 0 + aesenc xmm13, [rax+128] + aesenc xmm14, [rax+128] + aesenc xmm15, [rax+128] + pxor xmm1, xmm7 + pxor xmm2, xmm7 + pxor xmm1, xmm6 + pxor xmm3, xmm6 + pxor xmm1, xmm4 + movdqa xmm5, xmm1 + psrldq xmm1, 8 + pslldq xmm5, 8 + aesenc xmm8, [rax+144] + pxor xmm2, xmm5 + pxor xmm3, xmm1 + movdqa xmm7, xmm2 + movdqa xmm4, xmm2 + movdqa xmm5, xmm2 + aesenc xmm9, [rax+144] + pslld xmm7, 31 + pslld xmm4, 30 + pslld xmm5, 25 + aesenc xmm10, [rax+144] + pxor xmm7, xmm4 + pxor xmm7, xmm5 + aesenc xmm11, [rax+144] + movdqa xmm4, xmm7 + pslldq xmm7, 12 + psrldq xmm4, 4 + aesenc xmm12, [rax+144] + pxor xmm2, xmm7 + movdqa xmm5, xmm2 + movdqa xmm1, xmm2 + movdqa xmm0, xmm2 + aesenc xmm13, [rax+144] + psrld xmm5, 1 + psrld xmm1, 2 + psrld xmm0, 7 + 
aesenc xmm14, [rax+144] + pxor xmm5, xmm1 + pxor xmm5, xmm0 + aesenc xmm15, [rax+144] + pxor xmm5, xmm4 + pxor xmm2, xmm5 + pxor xmm2, xmm3 + cmp r8d, 11 + movdqa xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+176] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + cmp r8d, 13 + movdqa xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+208] + aesenc xmm8, xmm7 + aesenc xmm9, xmm7 + aesenc xmm10, xmm7 + aesenc xmm11, xmm7 + aesenc xmm12, xmm7 + aesenc xmm13, xmm7 + aesenc xmm14, xmm7 + aesenc xmm15, xmm7 + movdqa xmm7, OWORD PTR [rax+224] +L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done: + aesenclast xmm8, xmm7 + aesenclast xmm9, xmm7 + movdqu xmm0, [rcx] + movdqu xmm1, [rcx+16] + pxor xmm8, xmm0 + pxor xmm9, xmm1 + movdqu [rdx], xmm8 + movdqu [rdx+16], xmm9 + aesenclast xmm10, xmm7 + aesenclast xmm11, xmm7 + movdqu xmm0, [rcx+32] + movdqu xmm1, [rcx+48] + pxor xmm10, xmm0 + pxor xmm11, xmm1 + movdqu [rdx+32], xmm10 + movdqu [rdx+48], xmm11 + aesenclast xmm12, xmm7 + aesenclast xmm13, xmm7 + movdqu xmm0, [rcx+64] + movdqu xmm1, [rcx+80] + pxor xmm12, xmm0 + pxor xmm13, xmm1 + movdqu [rdx+64], xmm12 + movdqu [rdx+80], xmm13 + aesenclast xmm14, xmm7 + aesenclast xmm15, xmm7 + movdqu xmm0, [rcx+96] + movdqu xmm1, [rcx+112] + pxor xmm14, xmm0 + pxor xmm15, xmm1 + movdqu [rdx+96], xmm14 + movdqu [rdx+112], xmm15 + add edi, 128 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_aesni_ghash_128 + movdqa xmm6, xmm2 + movdqu xmm5, [rsp] +L_AES_GCM_decrypt_update_aesni_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_decrypt_update_aesni_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_aesni_last_block_done +L_AES_GCM_decrypt_update_aesni_last_block_start: + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + movdqu xmm1, [rcx] + movdqa xmm0, xmm5 + pshufb xmm1, OWORD PTR L_aes_gcm_bswap_mask + pxor xmm1, xmm6 + movdqu xmm8, [r15] + movdqa xmm9, xmm8 + pshufb xmm8, OWORD PTR L_aes_gcm_bswap_epi64 + paddd xmm9, OWORD PTR L_aes_gcm_one + pxor xmm8, [rax] + movdqu [r15], xmm9 + movdqa xmm10, xmm1 + pclmulqdq xmm10, xmm0, 16 + aesenc xmm8, [rax+16] + aesenc xmm8, [rax+32] + movdqa xmm11, xmm1 + pclmulqdq xmm11, xmm0, 1 + aesenc xmm8, [rax+48] + aesenc xmm8, [rax+64] + movdqa xmm12, xmm1 + pclmulqdq xmm12, xmm0, 0 + aesenc xmm8, [rax+80] + movdqa xmm1, xmm1 + pclmulqdq xmm1, xmm0, 17 + aesenc xmm8, [rax+96] + pxor xmm10, xmm11 + movdqa xmm2, xmm10 + psrldq xmm10, 8 + pslldq xmm2, 8 + aesenc xmm8, [rax+112] + movdqa xmm3, xmm1 + pxor xmm2, xmm12 + pxor xmm3, xmm10 + movdqa xmm0, OWORD PTR L_aes_gcm_mod2_128 + movdqa xmm11, xmm2 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [rax+128] + pshufd xmm10, xmm2, 78 + pxor xmm10, xmm11 + movdqa xmm11, xmm10 + pclmulqdq xmm11, xmm0, 16 + aesenc xmm8, [rax+144] + pshufd xmm6, xmm10, 78 + pxor xmm6, xmm11 + pxor xmm6, xmm3 + cmp r8d, 11 + movdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last + 
aesenc xmm8, xmm9 + aesenc xmm8, [rax+176] + cmp r8d, 13 + movdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last + aesenc xmm8, xmm9 + aesenc xmm8, [rax+208] + movdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last: + aesenclast xmm8, xmm9 + movdqu xmm9, [rcx] + pxor xmm8, xmm9 + movdqu [rdx], xmm8 + add edi, 16 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_aesni_last_block_start +L_AES_GCM_decrypt_update_aesni_last_block_done: +L_AES_GCM_decrypt_update_aesni_done_dec: + movdqa OWORD PTR [r12], xmm6 + add rsp, 168 + pop rsi + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_update_aesni ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt_final_aesni PROC + push r13 + push r12 + push r14 + push rbp + push r15 + mov rax, rcx + mov r10d, r9d + mov r9, rdx + mov r11d, DWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r14, QWORD PTR [rsp+96] + mov rbp, QWORD PTR [rsp+104] + sub rsp, 16 + movdqa xmm6, OWORD PTR [rax] + movdqa xmm5, OWORD PTR [r12] + movdqa xmm15, OWORD PTR [r14] + movdqa xmm9, xmm5 + movdqa xmm8, xmm5 + psrlq xmm9, 63 + psllq xmm8, 1 + pslldq xmm9, 8 + por xmm8, xmm9 + pshufd xmm5, xmm5, 255 + psrad xmm5, 31 + pand xmm5, OWORD PTR L_aes_gcm_mod2_128 + pxor xmm5, xmm8 + mov edx, r10d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + pinsrq xmm0, rdx, 0 + pinsrq xmm0, rcx, 1 + pxor xmm6, xmm0 + pshufd xmm9, xmm5, 78 + pshufd xmm10, xmm6, 78 + movdqa xmm11, xmm6 + movdqa xmm8, xmm6 + pclmulqdq xmm11, xmm5, 17 + pclmulqdq xmm8, xmm5, 0 + pxor xmm9, xmm5 + pxor xmm10, xmm6 + pclmulqdq xmm9, xmm10, 0 + pxor xmm9, xmm8 + pxor xmm9, xmm11 + movdqa xmm10, xmm9 + movdqa xmm6, xmm11 + pslldq xmm10, 8 + psrldq xmm9, 8 + pxor xmm8, xmm10 + pxor xmm6, xmm9 + movdqa xmm12, xmm8 + movdqa xmm13, xmm8 + movdqa xmm14, xmm8 + pslld xmm12, 31 + pslld xmm13, 30 + pslld xmm14, 25 + pxor xmm12, xmm13 + pxor xmm12, xmm14 + movdqa xmm13, xmm12 + psrldq xmm13, 4 + pslldq xmm12, 12 + pxor xmm8, xmm12 + movdqa xmm14, xmm8 + movdqa xmm10, xmm8 + movdqa xmm9, xmm8 + psrld xmm14, 1 + psrld xmm10, 2 + psrld xmm9, 7 + pxor xmm14, xmm10 + pxor xmm14, xmm9 + pxor xmm14, xmm13 + pxor xmm14, xmm8 + pxor xmm6, xmm14 + pshufb xmm6, OWORD PTR L_aes_gcm_bswap_mask + movdqu xmm0, xmm15 + pxor xmm0, xmm6 + cmp r8d, 16 + je L_AES_GCM_decrypt_final_aesni_cmp_tag_16 + sub rsp, 16 + xor rcx, rcx + xor r15, r15 + movdqu [rsp], xmm0 +L_AES_GCM_decrypt_final_aesni_cmp_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + xor r13b, BYTE PTR [r9+rcx] + or r15b, r13b + inc ecx + cmp ecx, r8d + jne L_AES_GCM_decrypt_final_aesni_cmp_tag_loop + cmp r15, 0 + sete r15b + add rsp, 16 + xor rcx, rcx + jmp L_AES_GCM_decrypt_final_aesni_cmp_tag_done +L_AES_GCM_decrypt_final_aesni_cmp_tag_16: + movdqu xmm1, [r9] + pcmpeqb xmm0, xmm1 + pmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor r15d, r15d + cmp edx, 65535 + sete r15b +L_AES_GCM_decrypt_final_aesni_cmp_tag_done: + mov DWORD PTR [rbp], r15d + add rsp, 16 + pop r15 + pop rbp + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_final_aesni ENDP +_text ENDS +IFDEF HAVE_INTEL_AVX1 +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_one QWORD 0, 1 +ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_two QWORD 0, 2 +ptr_L_avx1_aes_gcm_two QWORD L_avx1_aes_gcm_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_three QWORD 0, 3 +ptr_L_avx1_aes_gcm_three QWORD L_avx1_aes_gcm_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_four QWORD 
0, 4 +ptr_L_avx1_aes_gcm_four QWORD L_avx1_aes_gcm_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_five QWORD 0, 5 +ptr_L_avx1_aes_gcm_five QWORD L_avx1_aes_gcm_five +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_six QWORD 0, 6 +ptr_L_avx1_aes_gcm_six QWORD L_avx1_aes_gcm_six +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_seven QWORD 0, 7 +ptr_L_avx1_aes_gcm_seven QWORD L_avx1_aes_gcm_seven +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_eight QWORD 0, 8 +ptr_L_avx1_aes_gcm_eight QWORD L_avx1_aes_gcm_eight +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567 +ptr_L_avx1_aes_gcm_bswap_epi64 QWORD L_avx1_aes_gcm_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183 +ptr_L_avx1_aes_gcm_bswap_mask QWORD L_avx1_aes_gcm_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx1_aes_gcm_mod2_128 QWORD 1, 13979173243358019584 +ptr_L_avx1_aes_gcm_mod2_128 QWORD L_avx1_aes_gcm_mod2_128 +_DATA ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_avx1 PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+96] + mov r9d, DWORD PTR [rsp+104] + mov r11d, DWORD PTR [rsp+112] + mov ebx, DWORD PTR [rsp+120] + mov r14d, DWORD PTR [rsp+128] + mov r15, QWORD PTR [rsp+136] + mov r10d, DWORD PTR [rsp+144] + sub rsp, 160 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + mov edx, ebx + cmp edx, 12 + jne L_AES_GCM_encrypt_avx1_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [rax] + vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [r15] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [r15+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx1_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+144], xmm1 + jmp L_AES_GCM_encrypt_avx1_iv_done +L_AES_GCM_encrypt_avx1_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, 
OWORD PTR [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm9 + vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_encrypt_avx1_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_avx1_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx1_calc_iv_16_loop: + vmovdqu xmm8, OWORD PTR [rax+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_encrypt_avx1_calc_iv_done +L_AES_GCM_encrypt_avx1_calc_iv_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_encrypt_avx1_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_iv_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 
+ vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_encrypt_avx1_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm8, OWORD PTR [r15] + vpxor xmm8, xmm8, xmm4 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsp+144], xmm8 +L_AES_GCM_encrypt_avx1_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_encrypt_avx1_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_avx1_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx1_calc_aad_16_loop: + vmovdqu xmm8, OWORD PTR [r12+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 
+ vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_encrypt_avx1_calc_aad_done +L_AES_GCM_encrypt_avx1_calc_aad_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_encrypt_avx1_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx1_calc_aad_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 +L_AES_GCM_encrypt_avx1_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one + vpxor xmm5, xmm5, xmm8 + vmovdqu OWORD PTR [rsp+128], xmm4 + xor ebx, ebx + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_encrypt_avx1_done_128 + and r13d, 4294967168 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm0, xmm5, xmm5, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm0, xmm0, xmm14 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm0, 78 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm1, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, 
xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm1, xmm1, xmm14 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm3, xmm0, xmm0, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm3, xmm3, xmm14 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpshufd xmm10, xmm1, 78 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpxor xmm9, xmm9, xmm0 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm7, xmm1, xmm1, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpshufd xmm10, xmm3, 78 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm7, xmm3, xmm3, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, 
xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; First 128 bytes of input + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqa xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven + vpshufb xmm15, xmm15, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight + vmovdqa xmm7, OWORD PTR [r15] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+16] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+32] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+48] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+64] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+80] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+96] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+112] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+128] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+144] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl 
L_AES_GCM_encrypt_avx1_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx1_aesenc_128_enc_done: + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vmovdqu xmm0, OWORD PTR [rdi] + vmovdqu xmm1, OWORD PTR [rdi+16] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vmovdqu OWORD PTR [rsi], xmm8 + vmovdqu OWORD PTR [rsi+16], xmm9 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rdi+32] + vmovdqu xmm1, OWORD PTR [rdi+48] + vpxor xmm10, xmm10, xmm0 + vpxor xmm11, xmm11, xmm1 + vmovdqu OWORD PTR [rsi+32], xmm10 + vmovdqu OWORD PTR [rsi+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vmovdqu xmm0, OWORD PTR [rdi+64] + vmovdqu xmm1, OWORD PTR [rdi+80] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vmovdqu OWORD PTR [rsi+64], xmm12 + vmovdqu OWORD PTR [rsi+80], xmm13 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rdi+96] + vmovdqu xmm1, OWORD PTR [rdi+112] + vpxor xmm14, xmm14, xmm0 + vpxor xmm15, xmm15, xmm1 + vmovdqu OWORD PTR [rsi+96], xmm14 + vmovdqu OWORD PTR [rsi+112], xmm15 + cmp r13d, 128 + mov ebx, 128 + jle L_AES_GCM_encrypt_avx1_end_128 + ; More 128 bytes of input +L_AES_GCM_encrypt_avx1_ghash_128: + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqa xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven + vpshufb xmm15, xmm15, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight + vmovdqa xmm7, OWORD PTR [r15] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsp+112] + vmovdqu xmm0, OWORD PTR [rdx+-128] + vaesenc xmm8, xmm8, [r15+16] + 
vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm0, xmm0, xmm2 + vpshufd xmm1, xmm7, 78 + vpshufd xmm5, xmm0, 78 + vpxor xmm1, xmm1, xmm7 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm3, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+16] + vaesenc xmm10, xmm10, [r15+16] + vpclmulqdq xmm2, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+16] + vaesenc xmm12, xmm12, [r15+16] + vpclmulqdq xmm1, xmm1, xmm5, 0 + vaesenc xmm13, xmm13, [r15+16] + vaesenc xmm14, xmm14, [r15+16] + vaesenc xmm15, xmm15, [r15+16] + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmm7, OWORD PTR [rsp+96] + vmovdqu xmm0, OWORD PTR [rdx+-112] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+32] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+32] + vaesenc xmm10, xmm10, [r15+32] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+32] + vaesenc xmm12, xmm12, [r15+32] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+32] + vaesenc xmm14, xmm14, [r15+32] + vaesenc xmm15, xmm15, [r15+32] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm0, OWORD PTR [rdx+-96] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+48] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+48] + vaesenc xmm10, xmm10, [r15+48] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+48] + vaesenc xmm12, xmm12, [r15+48] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+48] + vaesenc xmm14, xmm14, [r15+48] + vaesenc xmm15, xmm15, [r15+48] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+64] + vmovdqu xmm0, OWORD PTR [rdx+-80] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+64] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+64] + vaesenc xmm10, xmm10, [r15+64] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+64] + vaesenc xmm12, xmm12, [r15+64] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+64] + vaesenc xmm14, xmm14, [r15+64] + vaesenc xmm15, xmm15, [r15+64] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [rdx+-64] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+80] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+80] + vaesenc xmm10, xmm10, [r15+80] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+80] + vaesenc xmm12, xmm12, [r15+80] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+80] + vaesenc xmm14, xmm14, [r15+80] + vaesenc xmm15, xmm15, [r15+80] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm0, OWORD PTR [rdx+-48] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR 
L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+96] + vaesenc xmm10, xmm10, [r15+96] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+96] + vaesenc xmm12, xmm12, [r15+96] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+96] + vaesenc xmm14, xmm14, [r15+96] + vaesenc xmm15, xmm15, [r15+96] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm0, OWORD PTR [rdx+-32] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+112] + vaesenc xmm10, xmm10, [r15+112] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+112] + vaesenc xmm12, xmm12, [r15+112] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+112] + vaesenc xmm14, xmm14, [r15+112] + vaesenc xmm15, xmm15, [r15+112] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp] + vmovdqu xmm0, OWORD PTR [rdx+-16] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+128] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+128] + vaesenc xmm10, xmm10, [r15+128] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+128] + vaesenc xmm12, xmm12, [r15+128] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+128] + vaesenc xmm14, xmm14, [r15+128] + vaesenc xmm15, xmm15, [r15+128] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vpslldq xmm5, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vaesenc xmm8, xmm8, [r15+144] + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm1 + vaesenc xmm9, xmm9, [r15+144] + vpslld xmm7, xmm2, 31 + vpslld xmm4, xmm2, 30 + vpslld xmm5, xmm2, 25 + vaesenc xmm10, xmm10, [r15+144] + vpxor xmm7, xmm7, xmm4 + vpxor xmm7, xmm7, xmm5 + vaesenc xmm11, xmm11, [r15+144] + vpsrldq xmm4, xmm7, 4 + vpslldq xmm7, xmm7, 12 + vaesenc xmm12, xmm12, [r15+144] + vpxor xmm2, xmm2, xmm7 + vpsrld xmm5, xmm2, 1 + vaesenc xmm13, xmm13, [r15+144] + vpsrld xmm1, xmm2, 2 + vpsrld xmm0, xmm2, 7 + vaesenc xmm14, xmm14, [r15+144] + vpxor xmm5, xmm5, xmm1 + vpxor xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, [r15+144] + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpxor xmm2, xmm2, xmm3 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc 
xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done: + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+32] + vmovdqu xmm1, OWORD PTR [rcx+48] + vpxor xmm10, xmm10, xmm0 + vpxor xmm11, xmm11, xmm1 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+96] + vmovdqu xmm1, OWORD PTR [rcx+112] + vpxor xmm14, xmm14, xmm0 + vpxor xmm15, xmm15, xmm1 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx1_ghash_128 +L_AES_GCM_encrypt_avx1_end_128: + vmovdqa xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpshufb xmm8, xmm8, xmm4 + vpshufb xmm9, xmm9, xmm4 + vpshufb xmm10, xmm10, xmm4 + vpshufb xmm11, xmm11, xmm4 + vpxor xmm8, xmm8, xmm2 + vpshufb xmm12, xmm12, xmm4 + vpshufb xmm13, xmm13, xmm4 + vpshufb xmm14, xmm14, xmm4 + vpshufb xmm15, xmm15, xmm4 + vmovdqu xmm7, OWORD PTR [rsp] + vmovdqu xmm5, OWORD PTR [rsp+16] + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm15, 17 + vpclmulqdq xmm0, xmm7, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm14, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm14, 17 + vpclmulqdq xmm0, xmm5, xmm14, 0 + vpxor xmm1, xmm1, xmm14 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm5, OWORD PTR [rsp+48] + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm13, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm13, 17 + vpclmulqdq xmm0, xmm7, xmm13, 0 + vpxor xmm1, xmm1, xmm13 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm12, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm12, 17 + vpclmulqdq xmm0, xmm5, xmm12, 0 + vpxor xmm1, xmm1, xmm12 + vpxor xmm2, xmm2, xmm5 
+ vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+64] + vmovdqu xmm5, OWORD PTR [rsp+80] + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm11, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm11, 17 + vpclmulqdq xmm0, xmm7, xmm11, 0 + vpxor xmm1, xmm1, xmm11 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm10, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm10, 17 + vpclmulqdq xmm0, xmm5, xmm10, 0 + vpxor xmm1, xmm1, xmm10 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+96] + vmovdqu xmm5, OWORD PTR [rsp+112] + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm9, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm9, 17 + vpclmulqdq xmm0, xmm7, xmm9, 0 + vpxor xmm1, xmm1, xmm9 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm8, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm8, 17 + vpclmulqdq xmm0, xmm5, xmm8, 0 + vpxor xmm1, xmm1, xmm8 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm6, xmm6, xmm2 + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_encrypt_avx1_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_encrypt_avx1_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx1_last_block_done + vmovdqu xmm9, OWORD PTR [rsp+128] + vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [rsp+128], xmm9 + vpxor xmm8, xmm8, [r15] + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx1_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] 
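+ ; Note: r10d holds the AES round count (10/12/14 for AES-128/192/256); the
+ ; compares against 11 and 13 above apply the extra rounds only when needed,
+ ; leaving the final round key in xmm9 for the aesenclast that follows.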
+L_AES_GCM_encrypt_avx1_aesenc_block_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm9, OWORD PTR [rdi+rbx] + vpxor xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + add ebx, 16 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx1_last_block_ghash +L_AES_GCM_encrypt_avx1_last_block_start: + vmovdqu xmm13, OWORD PTR [rdi+rbx] + vmovdqu xmm9, OWORD PTR [rsp+128] + vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [rsp+128], xmm9 + vpxor xmm8, xmm8, [r15] + vpclmulqdq xmm10, xmm6, xmm5, 16 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vpclmulqdq xmm11, xmm6, xmm5, 1 + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vpclmulqdq xmm12, xmm6, xmm5, 0 + vaesenc xmm8, xmm8, [r15+80] + vpclmulqdq xmm1, xmm6, xmm5, 17 + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [r15+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [r15+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx1_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + add ebx, 16 + vpxor xmm6, xmm6, xmm8 + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx1_last_block_start +L_AES_GCM_encrypt_avx1_last_block_ghash: + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm6, 78 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm6, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm6, xmm6, xmm14 +L_AES_GCM_encrypt_avx1_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done + vmovdqu xmm4, OWORD PTR [rsp+128] + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [r15] + vaesenc xmm4, xmm4, [r15+16] + vaesenc xmm4, xmm4, [r15+32] + vaesenc xmm4, xmm4, [r15+48] + vaesenc xmm4, xmm4, [r15+64] + vaesenc xmm4, xmm4, [r15+80] + vaesenc xmm4, xmm4, [r15+96] + vaesenc xmm4, xmm4, [r15+112] + vaesenc xmm4, xmm4, [r15+128] + vaesenc xmm4, xmm4, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl 
L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm9 + sub rsp, 16 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm4 +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + mov BYTE PTR [rsp+rcx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop + xor r13, r13 + cmp ecx, 16 + je L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop: + mov BYTE PTR [rsp+rcx], r13b + inc ecx + cmp ecx, 16 + jl L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm4, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm6, 78 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm6, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm6, xmm6, xmm14 +L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done: +L_AES_GCM_encrypt_avx1_done_enc: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm6, 78 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm6, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm6, xmm6, xmm14 + vpshufb xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR [rsp+144] + vpxor xmm0, xmm0, xmm6 + cmp r14d, 16 + je L_AES_GCM_encrypt_avx1_store_tag_16 + xor rcx, rcx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_avx1_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r8+rcx], r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_encrypt_avx1_store_tag_loop + jmp L_AES_GCM_encrypt_avx1_store_tag_done +L_AES_GCM_encrypt_avx1_store_tag_16: + vmovdqu OWORD PTR [r8], xmm0 
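+ ; Note: a tag shorter than 16 bytes (length in r14d) is copied out
+ ; byte-by-byte in the loop above; a full 16-byte tag is stored directly
+ ; with the vmovdqu in the _store_tag_16 path. Both paths join below.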
+L_AES_GCM_encrypt_avx1_store_tag_done: + vzeroupper + add rsp, 160 + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_encrypt_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt_avx1 PROC + push r13 + push rdi + push rsi + push r12 + push rbx + push r14 + push r15 + push rbp + mov rdi, rcx + mov rsi, rdx + mov r12, r8 + mov rax, r9 + mov r8, QWORD PTR [rsp+104] + mov r9d, DWORD PTR [rsp+112] + mov r11d, DWORD PTR [rsp+120] + mov ebx, DWORD PTR [rsp+128] + mov r14d, DWORD PTR [rsp+136] + mov r15, QWORD PTR [rsp+144] + mov r10d, DWORD PTR [rsp+152] + mov rbp, QWORD PTR [rsp+160] + sub rsp, 168 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + cmp ebx, 12 + mov edx, ebx + jne L_AES_GCM_decrypt_avx1_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [rax] + vpinsrd xmm4, xmm4, DWORD PTR [rax+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [r15] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [r15+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx1_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask + vmovdqu OWORD PTR [rsp+144], xmm1 + jmp L_AES_GCM_decrypt_avx1_iv_done +L_AES_GCM_decrypt_avx1_iv_not_12: + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [r15] + vaesenc xmm5, xmm5, [r15+16] + vaesenc xmm5, xmm5, [r15+32] + vaesenc xmm5, xmm5, [r15+48] + vaesenc xmm5, xmm5, [r15+64] + vaesenc xmm5, xmm5, [r15+80] + vaesenc xmm5, xmm5, [r15+96] + vaesenc xmm5, xmm5, [r15+112] + vaesenc xmm5, xmm5, [r15+128] + vaesenc xmm5, xmm5, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm9 + vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask + ; Calc counter + 
; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_decrypt_avx1_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_avx1_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx1_calc_iv_16_loop: + vmovdqu xmm8, OWORD PTR [rax+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx1_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_decrypt_avx1_calc_iv_done +L_AES_GCM_decrypt_avx1_calc_iv_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_decrypt_avx1_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx1_calc_iv_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_decrypt_avx1_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + 
vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm8, OWORD PTR [r15] + vpxor xmm8, xmm8, xmm4 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vaesenc xmm8, xmm8, [r15+80] + vaesenc xmm8, xmm8, [r15+96] + vaesenc xmm8, xmm8, [r15+112] + vaesenc xmm8, xmm8, [r15+128] + vaesenc xmm8, xmm8, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [rsp+144], xmm8 +L_AES_GCM_decrypt_avx1_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_decrypt_avx1_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_avx1_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx1_calc_aad_16_loop: + vmovdqu xmm8, OWORD PTR [r12+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx1_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_decrypt_avx1_calc_aad_done +L_AES_GCM_decrypt_avx1_calc_aad_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_decrypt_avx1_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx1_calc_aad_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR 
L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm6, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpxor xmm1, xmm1, xmm6 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm6, xmm6, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm6, xmm6, xmm2 +L_AES_GCM_decrypt_avx1_calc_aad_done: + ; Calculate counter and H + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one + vpxor xmm5, xmm5, xmm8 + vmovdqu OWORD PTR [rsp+128], xmm4 + xor ebx, ebx + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_decrypt_avx1_done_128 + and r13d, 4294967168 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm0, xmm5, xmm5, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm0, xmm0, xmm14 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm0, 78 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm1, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm1, xmm1, xmm14 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm3, xmm0, xmm0, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, 
xmm8 + vpxor xmm3, xmm3, xmm14 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpshufd xmm10, xmm1, 78 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpxor xmm9, xmm9, xmm0 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm7, xmm1, xmm1, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpshufd xmm10, xmm3, 78 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm7, xmm3, xmm3, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+112], xmm7 +L_AES_GCM_decrypt_avx1_ghash_128: + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [rsi+rbx] + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqa xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm15, xmm0, OWORD PTR 
L_avx1_aes_gcm_seven + vpshufb xmm15, xmm15, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight + vmovdqa xmm7, OWORD PTR [r15] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsp+112] + vmovdqu xmm0, OWORD PTR [rcx] + vaesenc xmm8, xmm8, [r15+16] + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm0, xmm0, xmm2 + vpshufd xmm1, xmm7, 78 + vpshufd xmm5, xmm0, 78 + vpxor xmm1, xmm1, xmm7 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm3, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+16] + vaesenc xmm10, xmm10, [r15+16] + vpclmulqdq xmm2, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+16] + vaesenc xmm12, xmm12, [r15+16] + vpclmulqdq xmm1, xmm1, xmm5, 0 + vaesenc xmm13, xmm13, [r15+16] + vaesenc xmm14, xmm14, [r15+16] + vaesenc xmm15, xmm15, [r15+16] + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmm7, OWORD PTR [rsp+96] + vmovdqu xmm0, OWORD PTR [rcx+16] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+32] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+32] + vaesenc xmm10, xmm10, [r15+32] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+32] + vaesenc xmm12, xmm12, [r15+32] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+32] + vaesenc xmm14, xmm14, [r15+32] + vaesenc xmm15, xmm15, [r15+32] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm0, OWORD PTR [rcx+32] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+48] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+48] + vaesenc xmm10, xmm10, [r15+48] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+48] + vaesenc xmm12, xmm12, [r15+48] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+48] + vaesenc xmm14, xmm14, [r15+48] + vaesenc xmm15, xmm15, [r15+48] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+64] + vmovdqu xmm0, OWORD PTR [rcx+48] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+64] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+64] + vaesenc xmm10, xmm10, [r15+64] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+64] + vaesenc xmm12, xmm12, [r15+64] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+64] + vaesenc xmm14, xmm14, [r15+64] + vaesenc xmm15, xmm15, [r15+64] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [rcx+64] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+80] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+80] + vaesenc xmm10, xmm10, [r15+80] + vpclmulqdq 
xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+80] + vaesenc xmm12, xmm12, [r15+80] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+80] + vaesenc xmm14, xmm14, [r15+80] + vaesenc xmm15, xmm15, [r15+80] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm0, OWORD PTR [rcx+80] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+96] + vaesenc xmm10, xmm10, [r15+96] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+96] + vaesenc xmm12, xmm12, [r15+96] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+96] + vaesenc xmm14, xmm14, [r15+96] + vaesenc xmm15, xmm15, [r15+96] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm0, OWORD PTR [rcx+96] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+112] + vaesenc xmm10, xmm10, [r15+112] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+112] + vaesenc xmm12, xmm12, [r15+112] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+112] + vaesenc xmm14, xmm14, [r15+112] + vaesenc xmm15, xmm15, [r15+112] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp] + vmovdqu xmm0, OWORD PTR [rcx+112] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [r15+128] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [r15+128] + vaesenc xmm10, xmm10, [r15+128] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [r15+128] + vaesenc xmm12, xmm12, [r15+128] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [r15+128] + vaesenc xmm14, xmm14, [r15+128] + vaesenc xmm15, xmm15, [r15+128] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vpslldq xmm5, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vaesenc xmm8, xmm8, [r15+144] + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm1 + vaesenc xmm9, xmm9, [r15+144] + vpslld xmm7, xmm2, 31 + vpslld xmm4, xmm2, 30 + vpslld xmm5, xmm2, 25 + vaesenc xmm10, xmm10, [r15+144] + vpxor xmm7, xmm7, xmm4 + vpxor xmm7, xmm7, xmm5 + vaesenc xmm11, xmm11, [r15+144] + vpsrldq xmm4, xmm7, 4 + vpslldq xmm7, xmm7, 12 + vaesenc xmm12, xmm12, [r15+144] + vpxor xmm2, xmm2, xmm7 + vpsrld xmm5, xmm2, 1 + vaesenc xmm13, xmm13, [r15+144] + vpsrld xmm1, xmm2, 2 + vpsrld xmm0, xmm2, 7 + vaesenc xmm14, xmm14, [r15+144] + vpxor xmm5, xmm5, xmm1 + vpxor xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, [r15+144] + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpxor xmm2, xmm2, xmm3 + cmp r10d, 11 + vmovdqa xmm7, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, 
xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r10d, 13 + vmovdqa xmm7, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done: + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+32] + vmovdqu xmm1, OWORD PTR [rcx+48] + vpxor xmm10, xmm10, xmm0 + vpxor xmm11, xmm11, xmm1 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+96] + vmovdqu xmm1, OWORD PTR [rcx+112] + vpxor xmm14, xmm14, xmm0 + vpxor xmm15, xmm15, xmm1 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx1_ghash_128 + vmovdqa xmm6, xmm2 + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_decrypt_avx1_done_128: + mov edx, r9d + cmp ebx, edx + jge L_AES_GCM_decrypt_avx1_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx1_last_block_done +L_AES_GCM_decrypt_avx1_last_block_start: + vmovdqu xmm13, OWORD PTR [rdi+rbx] + vmovdqa xmm0, xmm5 + vpshufb xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm6 + vmovdqu xmm9, OWORD PTR [rsp+128] + vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [rsp+128], xmm9 + vpxor xmm8, xmm8, [r15] + vpclmulqdq xmm10, xmm1, xmm0, 16 + vaesenc xmm8, xmm8, [r15+16] + vaesenc xmm8, xmm8, [r15+32] + vpclmulqdq xmm11, xmm1, xmm0, 1 + vaesenc xmm8, xmm8, [r15+48] + vaesenc xmm8, xmm8, [r15+64] + vpclmulqdq xmm12, xmm1, xmm0, 0 + vaesenc xmm8, xmm8, [r15+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm8, xmm8, [r15+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [r15+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [r15+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [r15+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r10d, 11 + 
vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx1_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [rsi+rbx], xmm8 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx1_last_block_start +L_AES_GCM_decrypt_avx1_last_block_done: + mov ecx, r9d + mov edx, ecx + and ecx, 15 + jz L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done + vmovdqu xmm4, OWORD PTR [rsp+128] + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [r15] + vaesenc xmm4, xmm4, [r15+16] + vaesenc xmm4, xmm4, [r15+32] + vaesenc xmm4, xmm4, [r15+48] + vaesenc xmm4, xmm4, [r15+64] + vaesenc xmm4, xmm4, [r15+80] + vaesenc xmm4, xmm4, [r15+96] + vaesenc xmm4, xmm4, [r15+112] + vaesenc xmm4, xmm4, [r15+128] + vaesenc xmm4, xmm4, [r15+144] + cmp r10d, 11 + vmovdqa xmm9, OWORD PTR [r15+160] + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+176] + cmp r10d, 13 + vmovdqa xmm9, OWORD PTR [r15+192] + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm9 + vaesenc xmm4, xmm4, [r15+208] + vmovdqa xmm9, OWORD PTR [r15+224] +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm9 + sub rsp, 32 + xor ecx, ecx + vmovdqu OWORD PTR [rsp], xmm4 + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [rsp+16], xmm0 +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + mov BYTE PTR [rsp+rcx+16], r13b + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsi+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop + vmovdqu xmm4, OWORD PTR [rsp+16] + add rsp, 32 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm6, 78 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm6, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm6, xmm6, xmm14 +L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done: +L_AES_GCM_decrypt_avx1_done_dec: + mov edx, r9d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm6, 78 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm6, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + 
vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm6, xmm6, xmm14 + vpshufb xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask + vmovdqu xmm0, OWORD PTR [rsp+144] + vpxor xmm0, xmm0, xmm6 + cmp r14d, 16 + je L_AES_GCM_decrypt_avx1_cmp_tag_16 + sub rsp, 16 + xor rcx, rcx + xor rbx, rbx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_avx1_cmp_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + xor r13b, BYTE PTR [r8+rcx] + or bl, r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_decrypt_avx1_cmp_tag_loop + cmp rbx, 0 + sete bl + add rsp, 16 + xor rcx, rcx + jmp L_AES_GCM_decrypt_avx1_cmp_tag_done +L_AES_GCM_decrypt_avx1_cmp_tag_16: + vmovdqu xmm1, OWORD PTR [r8] + vpcmpeqb xmm0, xmm0, xmm1 + vpmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor ebx, ebx + cmp edx, 65535 + sete bl +L_AES_GCM_decrypt_avx1_cmp_tag_done: + mov DWORD PTR [rbp], ebx + vzeroupper + add rsp, 168 + pop rbp + pop r15 + pop r14 + pop rbx + pop r12 + pop rsi + pop rdi + pop r13 + ret +AES_GCM_decrypt_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_init_avx1 PROC + push rdi + push rsi + push r12 + push r13 + mov rdi, rcx + mov rsi, rdx + mov r10, r8 + mov r11d, r9d + mov rax, QWORD PTR [rsp+72] + mov r8, QWORD PTR [rsp+80] + mov r9, QWORD PTR [rsp+88] + sub rsp, 16 + vpxor xmm4, xmm4, xmm4 + mov edx, r11d + cmp edx, 12 + jne L_AES_GCM_init_avx1_iv_not_12 + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + mov ecx, 16777216 + vmovq xmm4, QWORD PTR [r10] + vpinsrd xmm4, xmm4, DWORD PTR [r10+8], 2 + vpinsrd xmm4, xmm4, ecx, 3 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqa xmm5, OWORD PTR [rdi] + vpxor xmm1, xmm4, xmm5 + vmovdqa xmm7, OWORD PTR [rdi+16] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+32] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+48] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+64] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+80] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+96] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+112] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+128] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+144] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp esi, 11 + vmovdqa xmm7, OWORD PTR [rdi+160] + jl L_AES_GCM_init_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+176] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + cmp esi, 13 + vmovdqa xmm7, OWORD PTR [rdi+192] + jl L_AES_GCM_init_avx1_calc_iv_12_last + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+208] + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm1, xmm1, xmm7 + vmovdqa xmm7, OWORD PTR [rdi+224] +L_AES_GCM_init_avx1_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm7 + vaesenclast xmm1, xmm1, xmm7 + vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask + vmovdqu xmm15, xmm1 + jmp L_AES_GCM_init_avx1_iv_done +L_AES_GCM_init_avx1_iv_not_12: + ; 
Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqa xmm5, OWORD PTR [rdi] + vaesenc xmm5, xmm5, [rdi+16] + vaesenc xmm5, xmm5, [rdi+32] + vaesenc xmm5, xmm5, [rdi+48] + vaesenc xmm5, xmm5, [rdi+64] + vaesenc xmm5, xmm5, [rdi+80] + vaesenc xmm5, xmm5, [rdi+96] + vaesenc xmm5, xmm5, [rdi+112] + vaesenc xmm5, xmm5, [rdi+128] + vaesenc xmm5, xmm5, [rdi+144] + cmp esi, 11 + vmovdqa xmm9, OWORD PTR [rdi+160] + jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [rdi+176] + cmp esi, 13 + vmovdqa xmm9, OWORD PTR [rdi+192] + jl L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm9 + vaesenc xmm5, xmm5, [rdi+208] + vmovdqa xmm9, OWORD PTR [rdi+224] +L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm9 + vpshufb xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_init_avx1_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_init_avx1_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_init_avx1_calc_iv_16_loop: + vmovdqu xmm8, OWORD PTR [r10+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_init_avx1_calc_iv_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_init_avx1_calc_iv_done +L_AES_GCM_init_avx1_calc_iv_lt16: + sub rsp, 16 + vpxor xmm8, xmm8, xmm8 + xor r13d, r13d + vmovdqu OWORD PTR [rsp], xmm8 +L_AES_GCM_init_avx1_calc_iv_loop: + movzx r12d, BYTE PTR [r10+rcx] + mov BYTE PTR [rsp+r13], r12b + inc ecx + inc r13d + cmp ecx, edx + jl L_AES_GCM_init_avx1_calc_iv_loop + vmovdqu xmm8, OWORD PTR [rsp] + add rsp, 16 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + 
vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 +L_AES_GCM_init_avx1_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm7, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm7, xmm7, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm7, 31 + vpslld xmm1, xmm7, 30 + vpslld xmm2, xmm7, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm7, xmm7, xmm0 + vpsrld xmm2, xmm7, 1 + vpsrld xmm3, xmm7, 2 + vpsrld xmm0, xmm7, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm7 + vpxor xmm4, xmm4, xmm2 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqa xmm8, OWORD PTR [rdi] + vpxor xmm8, xmm8, xmm4 + vaesenc xmm8, xmm8, [rdi+16] + vaesenc xmm8, xmm8, [rdi+32] + vaesenc xmm8, xmm8, [rdi+48] + vaesenc xmm8, xmm8, [rdi+64] + vaesenc xmm8, xmm8, [rdi+80] + vaesenc xmm8, xmm8, [rdi+96] + vaesenc xmm8, xmm8, [rdi+112] + vaesenc xmm8, xmm8, [rdi+128] + vaesenc xmm8, xmm8, [rdi+144] + cmp esi, 11 + vmovdqa xmm9, OWORD PTR [rdi+160] + jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rdi+176] + cmp esi, 13 + vmovdqa xmm9, OWORD PTR [rdi+192] + jl L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rdi+208] + vmovdqa xmm9, OWORD PTR [rdi+224] +L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm15, xmm8 +L_AES_GCM_init_avx1_iv_done: + vmovdqa OWORD PTR [r9], xmm15 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one + vmovdqa OWORD PTR [rax], xmm5 + vmovdqa OWORD PTR [r8], xmm4 + vzeroupper + add rsp, 16 + pop r13 + pop r12 + pop rsi + pop rdi + ret +AES_GCM_init_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_aad_update_avx1 PROC + mov rax, rcx + vmovdqa xmm5, OWORD PTR [r8] + vmovdqa xmm6, OWORD PTR [r9] + xor ecx, ecx +L_AES_GCM_aad_update_avx1_16_loop: + vmovdqu xmm8, OWORD PTR [rax+rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm5, 78 + vpshufd xmm2, xmm6, 78 + vpclmulqdq xmm3, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm1, xmm1, xmm5 + vpxor xmm2, xmm2, xmm6 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm5, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm5, xmm5, xmm1 + vpsrld xmm0, xmm4, 31 + vpsrld xmm1, xmm5, 31 + vpslld xmm4, xmm4, 1 + vpslld 
xmm5, xmm5, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm5, xmm5, xmm2 + vpor xmm4, xmm4, xmm0 + vpor xmm5, xmm5, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm5, xmm5, xmm2 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_aad_update_avx1_16_loop + vmovdqa OWORD PTR [r8], xmm5 + vzeroupper + ret +AES_GCM_aad_update_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_block_avx1 PROC + mov r10, r8 + mov r11, r9 + mov rax, QWORD PTR [rsp+40] + vmovdqu xmm9, OWORD PTR [rax] + vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [rax], xmm9 + vpxor xmm8, xmm8, [rcx] + vaesenc xmm8, xmm8, [rcx+16] + vaesenc xmm8, xmm8, [rcx+32] + vaesenc xmm8, xmm8, [rcx+48] + vaesenc xmm8, xmm8, [rcx+64] + vaesenc xmm8, xmm8, [rcx+80] + vaesenc xmm8, xmm8, [rcx+96] + vaesenc xmm8, xmm8, [rcx+112] + vaesenc xmm8, xmm8, [rcx+128] + vaesenc xmm8, xmm8, [rcx+144] + cmp edx, 11 + vmovdqa xmm9, OWORD PTR [rcx+160] + jl L_AES_GCM_encrypt_block_avx1_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rcx+176] + cmp edx, 13 + vmovdqa xmm9, OWORD PTR [rcx+192] + jl L_AES_GCM_encrypt_block_avx1_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rcx+208] + vmovdqa xmm9, OWORD PTR [rcx+224] +L_AES_GCM_encrypt_block_avx1_aesenc_block_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm9, OWORD PTR [r11] + vpxor xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [r10], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vzeroupper + ret +AES_GCM_encrypt_block_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_ghash_block_avx1 PROC + vmovdqa xmm4, OWORD PTR [rdx] + vmovdqa xmm5, OWORD PTR [r8] + vmovdqu xmm8, OWORD PTR [rcx] + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm8 + ; ghash_gfmul_avx + vpshufd xmm1, xmm4, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpxor xmm1, xmm1, xmm4 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm6, xmm0 + vmovdqa xmm4, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm6, xmm6, xmm2 + vpxor xmm4, xmm4, xmm1 + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + vpslld xmm0, xmm6, 31 + vpslld xmm1, xmm6, 30 + vpslld xmm2, xmm6, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm6, xmm6, xmm0 + vpsrld xmm2, xmm6, 1 + vpsrld xmm3, xmm6, 2 + vpsrld xmm0, xmm6, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm6 + vpxor xmm4, xmm4, xmm2 + vmovdqa OWORD PTR [rdx], xmm4 + vzeroupper + ret +AES_GCM_ghash_block_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_update_avx1 PROC + push r13 + push r12 + push r14 + push r15 + push rdi + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov 
r9d, DWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r14, QWORD PTR [rsp+96] + mov r15, QWORD PTR [rsp+104] + sub rsp, 160 + vmovdqa xmm6, OWORD PTR [r12] + vmovdqa xmm5, OWORD PTR [r14] + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm8 + xor edi, edi + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_encrypt_update_avx1_done_128 + and r13d, 4294967168 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm0, xmm5, xmm5, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm0, xmm0, xmm14 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm0, 78 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm1, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm1, xmm1, xmm14 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm3, xmm0, xmm0, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm3, xmm3, xmm14 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpshufd xmm10, xmm1, 78 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpxor xmm9, xmm9, xmm0 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm7, xmm1, xmm1, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + 
vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpshufd xmm10, xmm3, 78 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm7, xmm3, xmm3, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+112], xmm7 + ; First 128 bytes of input + vmovdqu xmm0, OWORD PTR [r15] + vmovdqa xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven + vpshufb xmm15, xmm15, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight + vmovdqa xmm7, OWORD PTR [rax] + vmovdqu OWORD PTR [r15], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+16] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+32] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+48] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+64] + 
vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+80] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+96] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+112] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+128] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+144] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 11 + vmovdqa xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 13 + vmovdqa xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done: + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vmovdqu xmm0, OWORD PTR [r11] + vmovdqu xmm1, OWORD PTR [r11+16] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vmovdqu OWORD PTR [r10], xmm8 + vmovdqu OWORD PTR [r10+16], xmm9 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [r11+32] + vmovdqu xmm1, OWORD PTR [r11+48] + vpxor xmm10, xmm10, xmm0 + vpxor xmm11, xmm11, xmm1 + vmovdqu OWORD PTR [r10+32], xmm10 + vmovdqu OWORD PTR [r10+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vmovdqu xmm0, OWORD PTR [r11+64] + vmovdqu xmm1, OWORD PTR [r11+80] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vmovdqu OWORD PTR [r10+64], 
xmm12 + vmovdqu OWORD PTR [r10+80], xmm13 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [r11+96] + vmovdqu xmm1, OWORD PTR [r11+112] + vpxor xmm14, xmm14, xmm0 + vpxor xmm15, xmm15, xmm1 + vmovdqu OWORD PTR [r10+96], xmm14 + vmovdqu OWORD PTR [r10+112], xmm15 + cmp r13d, 128 + mov edi, 128 + jle L_AES_GCM_encrypt_update_avx1_end_128 + ; More 128 bytes of input +L_AES_GCM_encrypt_update_avx1_ghash_128: + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu xmm0, OWORD PTR [r15] + vmovdqa xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven + vpshufb xmm15, xmm15, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight + vmovdqa xmm7, OWORD PTR [rax] + vmovdqu OWORD PTR [r15], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsp+112] + vmovdqu xmm0, OWORD PTR [rdx+-128] + vaesenc xmm8, xmm8, [rax+16] + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm0, xmm0, xmm2 + vpshufd xmm1, xmm7, 78 + vpshufd xmm5, xmm0, 78 + vpxor xmm1, xmm1, xmm7 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm3, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+16] + vaesenc xmm10, xmm10, [rax+16] + vpclmulqdq xmm2, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+16] + vaesenc xmm12, xmm12, [rax+16] + vpclmulqdq xmm1, xmm1, xmm5, 0 + vaesenc xmm13, xmm13, [rax+16] + vaesenc xmm14, xmm14, [rax+16] + vaesenc xmm15, xmm15, [rax+16] + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmm7, OWORD PTR [rsp+96] + vmovdqu xmm0, OWORD PTR [rdx+-112] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+32] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+32] + vaesenc xmm10, xmm10, [rax+32] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+32] + vaesenc xmm12, xmm12, [rax+32] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+32] + vaesenc xmm14, xmm14, [rax+32] + vaesenc xmm15, xmm15, [rax+32] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm0, OWORD PTR [rdx+-96] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+48] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+48] + vaesenc xmm10, xmm10, [rax+48] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+48] + vaesenc xmm12, xmm12, [rax+48] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+48] + vaesenc xmm14, xmm14, [rax+48] + vaesenc xmm15, xmm15, [rax+48] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, 
xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+64] + vmovdqu xmm0, OWORD PTR [rdx+-80] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+64] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+64] + vaesenc xmm10, xmm10, [rax+64] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+64] + vaesenc xmm12, xmm12, [rax+64] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+64] + vaesenc xmm14, xmm14, [rax+64] + vaesenc xmm15, xmm15, [rax+64] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [rdx+-64] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+80] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+80] + vaesenc xmm10, xmm10, [rax+80] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+80] + vaesenc xmm12, xmm12, [rax+80] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+80] + vaesenc xmm14, xmm14, [rax+80] + vaesenc xmm15, xmm15, [rax+80] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm0, OWORD PTR [rdx+-48] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+96] + vaesenc xmm10, xmm10, [rax+96] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+96] + vaesenc xmm12, xmm12, [rax+96] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+96] + vaesenc xmm14, xmm14, [rax+96] + vaesenc xmm15, xmm15, [rax+96] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm0, OWORD PTR [rdx+-32] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+112] + vaesenc xmm10, xmm10, [rax+112] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+112] + vaesenc xmm12, xmm12, [rax+112] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+112] + vaesenc xmm14, xmm14, [rax+112] + vaesenc xmm15, xmm15, [rax+112] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp] + vmovdqu xmm0, OWORD PTR [rdx+-16] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+128] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+128] + vaesenc xmm10, xmm10, [rax+128] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+128] + vaesenc xmm12, xmm12, [rax+128] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+128] + vaesenc xmm14, xmm14, [rax+128] + vaesenc xmm15, xmm15, [rax+128] + vpxor xmm1, xmm1, xmm7 + vpxor 
xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vpslldq xmm5, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vaesenc xmm8, xmm8, [rax+144] + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm1 + vaesenc xmm9, xmm9, [rax+144] + vpslld xmm7, xmm2, 31 + vpslld xmm4, xmm2, 30 + vpslld xmm5, xmm2, 25 + vaesenc xmm10, xmm10, [rax+144] + vpxor xmm7, xmm7, xmm4 + vpxor xmm7, xmm7, xmm5 + vaesenc xmm11, xmm11, [rax+144] + vpsrldq xmm4, xmm7, 4 + vpslldq xmm7, xmm7, 12 + vaesenc xmm12, xmm12, [rax+144] + vpxor xmm2, xmm2, xmm7 + vpsrld xmm5, xmm2, 1 + vaesenc xmm13, xmm13, [rax+144] + vpsrld xmm1, xmm2, 2 + vpsrld xmm0, xmm2, 7 + vaesenc xmm14, xmm14, [rax+144] + vpxor xmm5, xmm5, xmm1 + vpxor xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, [rax+144] + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpxor xmm2, xmm2, xmm3 + cmp r8d, 11 + vmovdqa xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 13 + vmovdqa xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done: + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+32] + vmovdqu xmm1, OWORD PTR [rcx+48] + vpxor xmm10, xmm10, xmm0 + vpxor xmm11, xmm11, xmm1 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+96] + vmovdqu xmm1, OWORD PTR [rcx+112] + vpxor xmm14, xmm14, xmm0 + vpxor xmm15, xmm15, xmm1 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + add edi, 128 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_avx1_ghash_128 +L_AES_GCM_encrypt_update_avx1_end_128: + vmovdqa xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpshufb xmm8, xmm8, xmm4 + vpshufb xmm9, xmm9, xmm4 + vpshufb xmm10, xmm10, xmm4 + vpshufb xmm11, xmm11, xmm4 + vpxor xmm8, xmm8, xmm2 + vpshufb xmm12, xmm12, xmm4 + vpshufb xmm13, xmm13, xmm4 + vpshufb 
xmm14, xmm14, xmm4 + vpshufb xmm15, xmm15, xmm4 + vmovdqu xmm7, OWORD PTR [rsp] + vmovdqu xmm5, OWORD PTR [rsp+16] + ; ghash_gfmul_avx + vpshufd xmm1, xmm15, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm15, 17 + vpclmulqdq xmm0, xmm7, xmm15, 0 + vpxor xmm1, xmm1, xmm15 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vmovdqa xmm4, xmm0 + vmovdqa xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm14, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm14, 17 + vpclmulqdq xmm0, xmm5, xmm14, 0 + vpxor xmm1, xmm1, xmm14 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm5, OWORD PTR [rsp+48] + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm13, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm13, 17 + vpclmulqdq xmm0, xmm7, xmm13, 0 + vpxor xmm1, xmm1, xmm13 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm12, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm12, 17 + vpclmulqdq xmm0, xmm5, xmm12, 0 + vpxor xmm1, xmm1, xmm12 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+64] + vmovdqu xmm5, OWORD PTR [rsp+80] + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm11, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm11, 17 + vpclmulqdq xmm0, xmm7, xmm11, 0 + vpxor xmm1, xmm1, xmm11 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm10, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm10, 17 + vpclmulqdq xmm0, xmm5, xmm10, 0 + vpxor xmm1, xmm1, xmm10 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+96] + vmovdqu xmm5, OWORD PTR [rsp+112] + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm9, 78 + vpshufd xmm2, xmm7, 78 + vpclmulqdq xmm3, xmm7, xmm9, 17 + vpclmulqdq xmm0, xmm7, xmm9, 0 + vpxor xmm1, xmm1, xmm9 + vpxor xmm2, xmm2, xmm7 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + ; ghash_gfmul_xor_avx + vpshufd xmm1, xmm8, 78 + vpshufd xmm2, xmm5, 78 + vpclmulqdq xmm3, xmm5, xmm8, 17 + vpclmulqdq xmm0, xmm5, xmm8, 0 + vpxor xmm1, xmm1, xmm8 + vpxor xmm2, xmm2, xmm5 + vpclmulqdq xmm1, xmm1, xmm2, 0 + vpxor xmm1, xmm1, xmm0 + vpxor xmm1, 
xmm1, xmm3 + vpxor xmm4, xmm4, xmm0 + vpxor xmm6, xmm6, xmm3 + vpslldq xmm2, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vpxor xmm4, xmm4, xmm2 + vpxor xmm6, xmm6, xmm1 + vpslld xmm0, xmm4, 31 + vpslld xmm1, xmm4, 30 + vpslld xmm2, xmm4, 25 + vpxor xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm2 + vmovdqa xmm1, xmm0 + vpsrldq xmm1, xmm1, 4 + vpslldq xmm0, xmm0, 12 + vpxor xmm4, xmm4, xmm0 + vpsrld xmm2, xmm4, 1 + vpsrld xmm3, xmm4, 2 + vpsrld xmm0, xmm4, 7 + vpxor xmm2, xmm2, xmm3 + vpxor xmm2, xmm2, xmm0 + vpxor xmm2, xmm2, xmm1 + vpxor xmm2, xmm2, xmm4 + vpxor xmm6, xmm6, xmm2 + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_encrypt_update_avx1_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_encrypt_update_avx1_done_enc + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx1_last_block_done + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vaesenc xmm8, xmm8, [rax+80] + vaesenc xmm8, xmm8, [rax+96] + vaesenc xmm8, xmm8, [rax+112] + vaesenc xmm8, xmm8, [rax+128] + vaesenc xmm8, xmm8, [rax+144] + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx1_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx1_aesenc_block_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx1_aesenc_block_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqu xmm9, OWORD PTR [r11+rdi] + vpxor xmm8, xmm8, xmm9 + vmovdqu OWORD PTR [r10+rdi], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm8 + add edi, 16 + cmp edi, r13d + jge L_AES_GCM_encrypt_update_avx1_last_block_ghash +L_AES_GCM_encrypt_update_avx1_last_block_start: + vmovdqu xmm13, OWORD PTR [r11+rdi] + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vpclmulqdq xmm10, xmm6, xmm5, 16 + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vpclmulqdq xmm11, xmm6, xmm5, 1 + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vpclmulqdq xmm12, xmm6, xmm5, 0 + vaesenc xmm8, xmm8, [rax+80] + vpclmulqdq xmm1, xmm6, xmm5, 17 + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [rax+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [rax+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa 
xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [r10+rdi], xmm8 + vpshufb xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask + add edi, 16 + vpxor xmm6, xmm6, xmm8 + cmp edi, r13d + jl L_AES_GCM_encrypt_update_avx1_last_block_start +L_AES_GCM_encrypt_update_avx1_last_block_ghash: + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm6, 78 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm6, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm6, xmm6, xmm14 +L_AES_GCM_encrypt_update_avx1_last_block_done: +L_AES_GCM_encrypt_update_avx1_done_enc: + vmovdqa OWORD PTR [r12], xmm6 + vzeroupper + add rsp, 160 + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_encrypt_update_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_final_avx1 PROC + push r13 + push r12 + push r14 + mov rax, rcx + mov r10d, r9d + mov r9, rdx + mov r11d, DWORD PTR [rsp+64] + mov r12, QWORD PTR [rsp+72] + mov r14, QWORD PTR [rsp+80] + sub rsp, 16 + vmovdqa xmm4, OWORD PTR [rax] + vmovdqa xmm5, OWORD PTR [r12] + vmovdqa xmm6, OWORD PTR [r14] + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm8 + mov edx, r10d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm4, 78 + vpclmulqdq xmm11, xmm4, xmm5, 17 + vpclmulqdq xmm8, xmm4, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm4 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm4, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm4, xmm4, xmm14 + vpshufb xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm0, xmm4, xmm6 + cmp r8d, 16 + je L_AES_GCM_encrypt_final_avx1_store_tag_16 + xor rcx, rcx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_final_avx1_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r9+rcx], r13b + inc ecx + cmp ecx, r8d + jne L_AES_GCM_encrypt_final_avx1_store_tag_loop + jmp L_AES_GCM_encrypt_final_avx1_store_tag_done +L_AES_GCM_encrypt_final_avx1_store_tag_16: + vmovdqu OWORD PTR [r9], xmm0 +L_AES_GCM_encrypt_final_avx1_store_tag_done: + vzeroupper + add rsp, 16 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_encrypt_final_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA 
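For context on how these generated AVX1 routines are reached: assuming the Windows builds enable the streaming AES-GCM API (WOLFSSL_AESGCM_STREAM) together with WOLFSSL_AESNI and HAVE_INTEL_AVX1, wolfCrypt's aes.c is expected to dispatch the wc_AesGcmEncryptUpdate/Final and wc_AesGcmDecryptUpdate/Final calls to AES_GCM_encrypt_update_avx1, AES_GCM_encrypt_final_avx1, AES_GCM_decrypt_update_avx1 and AES_GCM_decrypt_final_avx1 on AVX1-capable CPUs. The sketch below is a minimal caller of that public API, not part of this patch; the function name and buffer parameters are illustrative only.

/* Minimal sketch of the streaming AES-GCM API that exercises the
 * AES_GCM_*_update/_final_avx1 routines in this file (assumes a build
 * with WOLFSSL_AESGCM_STREAM, WOLFSSL_AESNI and HAVE_INTEL_AVX1). */
#include <wolfssl/wolfcrypt/aes.h>

static int gcm_stream_roundtrip(const byte* key, word32 keySz,
                                const byte* iv, word32 ivSz,
                                const byte* aad, word32 aadSz,
                                const byte* msg, word32 msgSz,
                                byte* cipher, byte* plain)
{
    Aes  aes;
    byte tag[AES_BLOCK_SIZE];   /* 16-byte GCM tag */
    int  ret;

    ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
    if (ret != 0) return ret;

    /* Encrypt: init with key/IV, feed AAD first, then plaintext, then take the tag. */
    ret = wc_AesGcmEncryptInit(&aes, key, keySz, iv, ivSz);
    if (ret == 0)
        ret = wc_AesGcmEncryptUpdate(&aes, NULL, NULL, 0, aad, aadSz);
    if (ret == 0)
        ret = wc_AesGcmEncryptUpdate(&aes, cipher, msg, msgSz, NULL, 0);
    if (ret == 0)
        ret = wc_AesGcmEncryptFinal(&aes, tag, sizeof(tag));

    /* Decrypt: same pattern; Final verifies the tag in constant time. */
    if (ret == 0)
        ret = wc_AesGcmDecryptInit(&aes, key, keySz, iv, ivSz);
    if (ret == 0)
        ret = wc_AesGcmDecryptUpdate(&aes, NULL, NULL, 0, aad, aadSz);
    if (ret == 0)
        ret = wc_AesGcmDecryptUpdate(&aes, plain, cipher, msgSz, NULL, 0);
    if (ret == 0)
        ret = wc_AesGcmDecryptFinal(&aes, tag, sizeof(tag));

    wc_AesFree(&aes);
    return ret;
}

On CPUs without AVX1 the same calls should fall back at runtime to the plain AESNI or C paths, so the define set stays portable across hosts.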
+AES_GCM_decrypt_update_avx1 PROC + push r13 + push r12 + push r14 + push r15 + push rdi + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov r9d, DWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r14, QWORD PTR [rsp+96] + mov r15, QWORD PTR [rsp+104] + sub rsp, 168 + vmovdqa xmm6, OWORD PTR [r12] + vmovdqa xmm5, OWORD PTR [r14] + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm8 + xor edi, edi + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_decrypt_update_avx1_done_128 + and r13d, 4294967168 + vmovdqa xmm2, xmm6 + ; H ^ 1 + vmovdqu OWORD PTR [rsp], xmm5 + ; H ^ 2 + vpclmulqdq xmm8, xmm5, xmm5, 0 + vpclmulqdq xmm0, xmm5, xmm5, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm0, xmm0, xmm14 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm0, 78 + vpclmulqdq xmm11, xmm0, xmm5, 17 + vpclmulqdq xmm8, xmm0, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm0 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm1, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm1, xmm1, xmm14 + vmovdqu OWORD PTR [rsp+32], xmm1 + ; H ^ 4 + vpclmulqdq xmm8, xmm0, xmm0, 0 + vpclmulqdq xmm3, xmm0, xmm0, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm3, xmm3, xmm14 + vmovdqu OWORD PTR [rsp+48], xmm3 + ; H ^ 5 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm0, 78 + vpshufd xmm10, xmm1, 78 + vpclmulqdq xmm11, xmm1, xmm0, 17 + vpclmulqdq xmm8, xmm1, xmm0, 0 + vpxor xmm9, xmm9, xmm0 + vpxor xmm10, xmm10, xmm1 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+64], xmm7 + ; H ^ 6 + vpclmulqdq xmm8, xmm1, xmm1, 0 + vpclmulqdq xmm7, 
xmm1, xmm1, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+80], xmm7 + ; H ^ 7 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm1, 78 + vpshufd xmm10, xmm3, 78 + vpclmulqdq xmm11, xmm3, xmm1, 17 + vpclmulqdq xmm8, xmm3, xmm1, 0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm3 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm7, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+96], xmm7 + ; H ^ 8 + vpclmulqdq xmm8, xmm3, xmm3, 0 + vpclmulqdq xmm7, xmm3, xmm3, 17 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm7, xmm7, xmm14 + vmovdqu OWORD PTR [rsp+112], xmm7 +L_AES_GCM_decrypt_update_avx1_ghash_128: + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + vmovdqu xmm0, OWORD PTR [r15] + vmovdqa xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven + vpshufb xmm15, xmm15, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight + vmovdqa xmm7, OWORD PTR [rax] + vmovdqu OWORD PTR [r15], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsp+112] + vmovdqu xmm0, OWORD PTR [rcx] + vaesenc xmm8, xmm8, [rax+16] + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm0, xmm0, xmm2 + vpshufd xmm1, xmm7, 78 + vpshufd xmm5, xmm0, 78 + vpxor xmm1, xmm1, xmm7 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm3, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+16] + vaesenc xmm10, xmm10, [rax+16] + vpclmulqdq xmm2, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+16] + vaesenc xmm12, xmm12, [rax+16] + vpclmulqdq xmm1, xmm1, xmm5, 0 + vaesenc xmm13, xmm13, [rax+16] + vaesenc xmm14, xmm14, [rax+16] + vaesenc xmm15, xmm15, 
[rax+16] + vpxor xmm1, xmm1, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmm7, OWORD PTR [rsp+96] + vmovdqu xmm0, OWORD PTR [rcx+16] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+32] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+32] + vaesenc xmm10, xmm10, [rax+32] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+32] + vaesenc xmm12, xmm12, [rax+32] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+32] + vaesenc xmm14, xmm14, [rax+32] + vaesenc xmm15, xmm15, [rax+32] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+80] + vmovdqu xmm0, OWORD PTR [rcx+32] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+48] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+48] + vaesenc xmm10, xmm10, [rax+48] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+48] + vaesenc xmm12, xmm12, [rax+48] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+48] + vaesenc xmm14, xmm14, [rax+48] + vaesenc xmm15, xmm15, [rax+48] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+64] + vmovdqu xmm0, OWORD PTR [rcx+48] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+64] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+64] + vaesenc xmm10, xmm10, [rax+64] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+64] + vaesenc xmm12, xmm12, [rax+64] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+64] + vaesenc xmm14, xmm14, [rax+64] + vaesenc xmm15, xmm15, [rax+64] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+48] + vmovdqu xmm0, OWORD PTR [rcx+64] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+80] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+80] + vaesenc xmm10, xmm10, [rax+80] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+80] + vaesenc xmm12, xmm12, [rax+80] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+80] + vaesenc xmm14, xmm14, [rax+80] + vaesenc xmm15, xmm15, [rax+80] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+32] + vmovdqu xmm0, OWORD PTR [rcx+80] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+96] + vaesenc xmm10, xmm10, [rax+96] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+96] + vaesenc xmm12, xmm12, [rax+96] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+96] + vaesenc xmm14, xmm14, [rax+96] + vaesenc xmm15, xmm15, [rax+96] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + 
vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp+16] + vmovdqu xmm0, OWORD PTR [rcx+96] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+112] + vaesenc xmm10, xmm10, [rax+112] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+112] + vaesenc xmm12, xmm12, [rax+112] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+112] + vaesenc xmm14, xmm14, [rax+112] + vaesenc xmm15, xmm15, [rax+112] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vmovdqu xmm7, OWORD PTR [rsp] + vmovdqu xmm0, OWORD PTR [rcx+112] + vpshufd xmm4, xmm7, 78 + vpshufb xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask + vaesenc xmm8, xmm8, [rax+128] + vpxor xmm4, xmm4, xmm7 + vpshufd xmm5, xmm0, 78 + vpxor xmm5, xmm5, xmm0 + vpclmulqdq xmm6, xmm0, xmm7, 17 + vaesenc xmm9, xmm9, [rax+128] + vaesenc xmm10, xmm10, [rax+128] + vpclmulqdq xmm7, xmm0, xmm7, 0 + vaesenc xmm11, xmm11, [rax+128] + vaesenc xmm12, xmm12, [rax+128] + vpclmulqdq xmm4, xmm4, xmm5, 0 + vaesenc xmm13, xmm13, [rax+128] + vaesenc xmm14, xmm14, [rax+128] + vaesenc xmm15, xmm15, [rax+128] + vpxor xmm1, xmm1, xmm7 + vpxor xmm2, xmm2, xmm7 + vpxor xmm1, xmm1, xmm6 + vpxor xmm3, xmm3, xmm6 + vpxor xmm1, xmm1, xmm4 + vpslldq xmm5, xmm1, 8 + vpsrldq xmm1, xmm1, 8 + vaesenc xmm8, xmm8, [rax+144] + vpxor xmm2, xmm2, xmm5 + vpxor xmm3, xmm3, xmm1 + vaesenc xmm9, xmm9, [rax+144] + vpslld xmm7, xmm2, 31 + vpslld xmm4, xmm2, 30 + vpslld xmm5, xmm2, 25 + vaesenc xmm10, xmm10, [rax+144] + vpxor xmm7, xmm7, xmm4 + vpxor xmm7, xmm7, xmm5 + vaesenc xmm11, xmm11, [rax+144] + vpsrldq xmm4, xmm7, 4 + vpslldq xmm7, xmm7, 12 + vaesenc xmm12, xmm12, [rax+144] + vpxor xmm2, xmm2, xmm7 + vpsrld xmm5, xmm2, 1 + vaesenc xmm13, xmm13, [rax+144] + vpsrld xmm1, xmm2, 2 + vpsrld xmm0, xmm2, 7 + vaesenc xmm14, xmm14, [rax+144] + vpxor xmm5, xmm5, xmm1 + vpxor xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, [rax+144] + vpxor xmm5, xmm5, xmm4 + vpxor xmm2, xmm2, xmm5 + vpxor xmm2, xmm2, xmm3 + cmp r8d, 11 + vmovdqa xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 13 + vmovdqa xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqa xmm7, OWORD PTR [rax+224] 
+L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done: + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+32] + vmovdqu xmm1, OWORD PTR [rcx+48] + vpxor xmm10, xmm10, xmm0 + vpxor xmm11, xmm11, xmm1 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+96] + vmovdqu xmm1, OWORD PTR [rcx+112] + vpxor xmm14, xmm14, xmm0 + vpxor xmm15, xmm15, xmm1 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + add edi, 128 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_avx1_ghash_128 + vmovdqa xmm6, xmm2 + vmovdqu xmm5, OWORD PTR [rsp] +L_AES_GCM_decrypt_update_avx1_done_128: + mov edx, r9d + cmp edi, edx + jge L_AES_GCM_decrypt_update_avx1_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx1_last_block_done +L_AES_GCM_decrypt_update_avx1_last_block_start: + vmovdqu xmm13, OWORD PTR [r11+rdi] + vmovdqa xmm0, xmm5 + vpshufb xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm1, xmm1, xmm6 + vmovdqu xmm9, OWORD PTR [r15] + vpshufb xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one + vmovdqu OWORD PTR [r15], xmm9 + vpxor xmm8, xmm8, [rax] + vpclmulqdq xmm10, xmm1, xmm0, 16 + vaesenc xmm8, xmm8, [rax+16] + vaesenc xmm8, xmm8, [rax+32] + vpclmulqdq xmm11, xmm1, xmm0, 1 + vaesenc xmm8, xmm8, [rax+48] + vaesenc xmm8, xmm8, [rax+64] + vpclmulqdq xmm12, xmm1, xmm0, 0 + vaesenc xmm8, xmm8, [rax+80] + vpclmulqdq xmm1, xmm1, xmm0, 17 + vaesenc xmm8, xmm8, [rax+96] + vpxor xmm10, xmm10, xmm11 + vpslldq xmm2, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vaesenc xmm8, xmm8, [rax+112] + vpxor xmm2, xmm2, xmm12 + vpxor xmm3, xmm1, xmm10 + vmovdqa xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpclmulqdq xmm11, xmm2, xmm0, 16 + vaesenc xmm8, xmm8, [rax+128] + vpshufd xmm10, xmm2, 78 + vpxor xmm10, xmm10, xmm11 + vpclmulqdq xmm11, xmm10, xmm0, 16 + vaesenc xmm8, xmm8, [rax+144] + vpshufd xmm10, xmm10, 78 + vpxor xmm10, xmm10, xmm11 + vpxor xmm6, xmm10, xmm3 + cmp r8d, 11 + vmovdqa xmm9, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+176] + cmp r8d, 13 + vmovdqa xmm9, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last + vaesenc xmm8, xmm8, xmm9 + vaesenc xmm8, xmm8, [rax+208] + vmovdqa xmm9, OWORD PTR [rax+224] +L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last: + vaesenclast xmm8, xmm8, xmm9 + vmovdqa xmm0, xmm13 + vpxor xmm8, xmm8, xmm0 + vmovdqu OWORD PTR [r10+rdi], xmm8 + add edi, 16 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_avx1_last_block_start +L_AES_GCM_decrypt_update_avx1_last_block_done: +L_AES_GCM_decrypt_update_avx1_done_dec: + vmovdqa OWORD PTR [r12], xmm6 + vzeroupper + add rsp, 168 + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_update_avx1 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt_final_avx1 
PROC + push r13 + push r12 + push r14 + push rbp + push r15 + mov rax, rcx + mov r10d, r9d + mov r9, rdx + mov r11d, DWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r14, QWORD PTR [rsp+96] + mov rbp, QWORD PTR [rsp+104] + sub rsp, 16 + vmovdqa xmm6, OWORD PTR [rax] + vmovdqa xmm5, OWORD PTR [r12] + vmovdqa xmm15, OWORD PTR [r14] + vpsrlq xmm9, xmm5, 63 + vpsllq xmm8, xmm5, 1 + vpslldq xmm9, xmm9, 8 + vpor xmm8, xmm8, xmm9 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm8 + mov edx, r10d + mov ecx, r11d + shl rdx, 3 + shl rcx, 3 + vmovq xmm0, rdx + vmovq xmm1, rcx + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_red_avx + vpshufd xmm9, xmm5, 78 + vpshufd xmm10, xmm6, 78 + vpclmulqdq xmm11, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm9, xmm9, xmm5 + vpxor xmm10, xmm10, xmm6 + vpclmulqdq xmm9, xmm9, xmm10, 0 + vpxor xmm9, xmm9, xmm8 + vpxor xmm9, xmm9, xmm11 + vpslldq xmm10, xmm9, 8 + vpsrldq xmm9, xmm9, 8 + vpxor xmm8, xmm8, xmm10 + vpxor xmm6, xmm11, xmm9 + vpslld xmm12, xmm8, 31 + vpslld xmm13, xmm8, 30 + vpslld xmm14, xmm8, 25 + vpxor xmm12, xmm12, xmm13 + vpxor xmm12, xmm12, xmm14 + vpsrldq xmm13, xmm12, 4 + vpslldq xmm12, xmm12, 12 + vpxor xmm8, xmm8, xmm12 + vpsrld xmm14, xmm8, 1 + vpsrld xmm10, xmm8, 2 + vpsrld xmm9, xmm8, 7 + vpxor xmm14, xmm14, xmm10 + vpxor xmm14, xmm14, xmm9 + vpxor xmm14, xmm14, xmm13 + vpxor xmm14, xmm14, xmm8 + vpxor xmm6, xmm6, xmm14 + vpshufb xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask + vpxor xmm0, xmm6, xmm15 + cmp r8d, 16 + je L_AES_GCM_decrypt_final_avx1_cmp_tag_16 + sub rsp, 16 + xor rcx, rcx + xor r15, r15 + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_final_avx1_cmp_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + xor r13b, BYTE PTR [r9+rcx] + or r15b, r13b + inc ecx + cmp ecx, r8d + jne L_AES_GCM_decrypt_final_avx1_cmp_tag_loop + cmp r15, 0 + sete r15b + add rsp, 16 + xor rcx, rcx + jmp L_AES_GCM_decrypt_final_avx1_cmp_tag_done +L_AES_GCM_decrypt_final_avx1_cmp_tag_16: + vmovdqu xmm1, OWORD PTR [r9] + vpcmpeqb xmm0, xmm0, xmm1 + vpmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor r15d, r15d + cmp edx, 65535 + sete r15b +L_AES_GCM_decrypt_final_avx1_cmp_tag_done: + mov DWORD PTR [rbp], r15d + vzeroupper + add rsp, 16 + pop r15 + pop rbp + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_final_avx1 ENDP +_text ENDS +ENDIF +IFDEF HAVE_INTEL_AVX2 +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_one QWORD 0, 1 +ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_two QWORD 0, 2 +ptr_L_avx2_aes_gcm_two QWORD L_avx2_aes_gcm_two +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_three QWORD 0, 3 +ptr_L_avx2_aes_gcm_three QWORD L_avx2_aes_gcm_three +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_four QWORD 0, 4 +ptr_L_avx2_aes_gcm_four QWORD L_avx2_aes_gcm_four +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_five QWORD 0, 5 +ptr_L_avx2_aes_gcm_five QWORD L_avx2_aes_gcm_five +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_six QWORD 0, 6 +ptr_L_avx2_aes_gcm_six QWORD L_avx2_aes_gcm_six +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_seven QWORD 0, 7 +ptr_L_avx2_aes_gcm_seven QWORD L_avx2_aes_gcm_seven +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_eight QWORD 0, 8 +ptr_L_avx2_aes_gcm_eight QWORD L_avx2_aes_gcm_eight +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_bswap_one QWORD 0, 72057594037927936 +ptr_L_avx2_aes_gcm_bswap_one QWORD 
L_avx2_aes_gcm_bswap_one +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567 +ptr_L_avx2_aes_gcm_bswap_epi64 QWORD L_avx2_aes_gcm_bswap_epi64 +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183 +ptr_L_avx2_aes_gcm_bswap_mask QWORD L_avx2_aes_gcm_bswap_mask +_DATA ENDS +_DATA SEGMENT +ALIGN 16 +L_avx2_aes_gcm_mod2_128 QWORD 1, 13979173243358019584 +ptr_L_avx2_aes_gcm_mod2_128 QWORD L_avx2_aes_gcm_mod2_128 +_DATA ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_avx2 PROC + push r13 + push rdi + push r12 + push r15 + push rbx + push r14 + push rsi + mov rdi, rcx + mov r12, r8 + mov rax, r9 + mov r15, QWORD PTR [rsp+96] + mov r8, rdx + mov r10d, DWORD PTR [rsp+104] + mov r11d, DWORD PTR [rsp+112] + mov ebx, DWORD PTR [rsp+120] + mov r14d, DWORD PTR [rsp+128] + mov rsi, QWORD PTR [rsp+136] + mov r9d, DWORD PTR [rsp+144] + sub rsp, 160 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + mov edx, ebx + cmp edx, 12 + je L_AES_GCM_encrypt_avx2_iv_12 + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqu xmm5, OWORD PTR [rsi] + vaesenc xmm5, xmm5, [rsi+16] + vaesenc xmm5, xmm5, [rsi+32] + vaesenc xmm5, xmm5, [rsi+48] + vaesenc xmm5, xmm5, [rsi+64] + vaesenc xmm5, xmm5, [rsi+80] + vaesenc xmm5, xmm5, [rsi+96] + vaesenc xmm5, xmm5, [rsi+112] + vaesenc xmm5, xmm5, [rsi+128] + vaesenc xmm5, xmm5, [rsi+144] + cmp r9d, 11 + vmovdqu xmm0, OWORD PTR [rsi+160] + jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [rsi+176] + cmp r9d, 13 + vmovdqu xmm0, OWORD PTR [rsi+192] + jl L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [rsi+208] + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_encrypt_avx2_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_encrypt_avx2_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx2_calc_iv_16_loop: + vmovdqu xmm0, OWORD PTR [rax+rcx] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_encrypt_avx2_calc_iv_done +L_AES_GCM_encrypt_avx2_calc_iv_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_avx2_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_iv_loop + vmovdqu xmm0, OWORD PTR [rsp] + 
vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 +L_AES_GCM_encrypt_avx2_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqu xmm15, OWORD PTR [rsi] + vpxor xmm15, xmm15, xmm4 + vaesenc xmm15, xmm15, [rsi+16] + vaesenc xmm15, xmm15, [rsi+32] + vaesenc xmm15, xmm15, [rsi+48] + vaesenc xmm15, xmm15, [rsi+64] + vaesenc xmm15, xmm15, [rsi+80] + vaesenc xmm15, xmm15, [rsi+96] + vaesenc xmm15, xmm15, [rsi+112] + vaesenc xmm15, xmm15, [rsi+128] + vaesenc xmm15, xmm15, [rsi+144] + cmp r9d, 11 + vmovdqu xmm0, OWORD PTR [rsi+160] + jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm15, xmm15, xmm0 + vaesenc xmm15, xmm15, [rsi+176] + cmp r9d, 13 + vmovdqu xmm0, OWORD PTR [rsi+192] + jl L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm15, xmm15, xmm0 + vaesenc xmm15, xmm15, [rsi+208] + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast xmm15, xmm15, xmm0 + jmp L_AES_GCM_encrypt_avx2_iv_done +L_AES_GCM_encrypt_avx2_iv_12: + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one + vmovdqu xmm5, OWORD PTR [rsi] + vpblendd xmm4, xmm4, [rax], 7 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqu xmm7, OWORD PTR [rsi+16] + vpxor xmm15, xmm4, xmm5 + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rsi+32] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+48] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+64] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+80] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+96] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, 
xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+112] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+128] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+144] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + cmp r9d, 11 + vmovdqu xmm0, OWORD PTR [rsi+160] + jl L_AES_GCM_encrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+176] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + cmp r9d, 13 + vmovdqu xmm0, OWORD PTR [rsi+192] + jl L_AES_GCM_encrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+208] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm0 + vaesenclast xmm15, xmm15, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask +L_AES_GCM_encrypt_avx2_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_encrypt_avx2_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_encrypt_avx2_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_encrypt_avx2_calc_aad_16_loop: + vmovdqu xmm0, OWORD PTR [r12+rcx] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_encrypt_avx2_calc_aad_done +L_AES_GCM_encrypt_avx2_calc_aad_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_avx2_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_encrypt_avx2_calc_aad_loop + vmovdqu xmm0, OWORD PTR [rsp] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 +L_AES_GCM_encrypt_avx2_calc_aad_done: + ; Calculate counter 
and H + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one + vpxor xmm5, xmm5, xmm0 + xor ebx, ebx + cmp r10d, 128 + mov r13d, r10d + jl L_AES_GCM_encrypt_avx2_done_128 + and r13d, 4294967168 + vmovdqu OWORD PTR [rsp+128], xmm4 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128 + ; H ^ 1 and H ^ 2 + vpclmulqdq xmm9, xmm5, xmm5, 0 + vpclmulqdq xmm10, xmm5, xmm5, 17 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpxor xmm0, xmm10, xmm9 + vmovdqu OWORD PTR [rsp], xmm5 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 and H ^ 4 + vpclmulqdq xmm11, xmm0, xmm5, 16 + vpclmulqdq xmm10, xmm0, xmm5, 1 + vpclmulqdq xmm9, xmm0, xmm5, 0 + vpclmulqdq xmm12, xmm0, xmm5, 17 + vpclmulqdq xmm13, xmm0, xmm0, 0 + vpclmulqdq xmm14, xmm0, xmm0, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm2, xmm13, xmm14 + vpxor xmm1, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+32], xmm1 + vmovdqu OWORD PTR [rsp+48], xmm2 + ; H ^ 5 and H ^ 6 + vpclmulqdq xmm11, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 0 + vpclmulqdq xmm12, xmm1, xmm0, 17 + vpclmulqdq xmm13, xmm1, xmm1, 0 + vpclmulqdq xmm14, xmm1, xmm1, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+64], xmm7 + vmovdqu OWORD PTR [rsp+80], xmm0 + ; H ^ 7 and H ^ 8 + vpclmulqdq xmm11, xmm2, xmm1, 16 + vpclmulqdq xmm10, xmm2, xmm1, 1 + vpclmulqdq xmm9, xmm2, xmm1, 0 + vpclmulqdq xmm12, xmm2, xmm1, 17 + vpclmulqdq xmm13, xmm2, xmm2, 0 + vpclmulqdq xmm14, xmm2, xmm2, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+96], xmm7 + vmovdqu OWORD PTR [rsp+112], xmm0 + ; First 128 bytes of input + ; aesenc_128 + ; aesenc_ctr + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, 
OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight + vpshufb xmm15, xmm15, xmm1 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [rsi] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+16] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+32] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+48] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+64] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+80] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+96] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+112] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+128] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+144] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r9d, 11 + vmovdqu xmm7, OWORD PTR [rsi+160] + jl L_AES_GCM_encrypt_avx2_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 
+ vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r9d, 13 + vmovdqu xmm7, OWORD PTR [rsi+192] + jl L_AES_GCM_encrypt_avx2_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_aesenc_128_enc_done: + ; aesenc_last + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rdi] + vmovdqu xmm1, OWORD PTR [rdi+16] + vmovdqu xmm2, OWORD PTR [rdi+32] + vmovdqu xmm3, OWORD PTR [rdi+48] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [r8], xmm8 + vmovdqu OWORD PTR [r8+16], xmm9 + vmovdqu OWORD PTR [r8+32], xmm10 + vmovdqu OWORD PTR [r8+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rdi+64] + vmovdqu xmm1, OWORD PTR [rdi+80] + vmovdqu xmm2, OWORD PTR [rdi+96] + vmovdqu xmm3, OWORD PTR [rdi+112] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vpxor xmm14, xmm14, xmm2 + vpxor xmm15, xmm15, xmm3 + vmovdqu OWORD PTR [r8+64], xmm12 + vmovdqu OWORD PTR [r8+80], xmm13 + vmovdqu OWORD PTR [r8+96], xmm14 + vmovdqu OWORD PTR [r8+112], xmm15 + cmp r13d, 128 + mov ebx, 128 + jle L_AES_GCM_encrypt_avx2_end_128 + ; More 128 bytes of input +L_AES_GCM_encrypt_avx2_ghash_128: + ; aesenc_128_ghash + lea rcx, QWORD PTR [rdi+rbx] + lea rdx, QWORD PTR [r8+rbx] + ; aesenc_ctr + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight + vpshufb xmm15, xmm15, xmm1 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [rsi] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + ; aesenc_pclmul_1 + vmovdqu xmm1, OWORD PTR [rdx+-128] + vmovdqu xmm0, OWORD PTR [rsi+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vmovdqu xmm2, OWORD PTR [rsp+112] + vpxor xmm1, xmm1, 
xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_2 + vmovdqu xmm1, OWORD PTR [rdx+-112] + vmovdqu xmm0, OWORD PTR [rsp+96] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+32] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-96] + vmovdqu xmm0, OWORD PTR [rsp+80] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+48] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-80] + vmovdqu xmm0, OWORD PTR [rsp+64] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+64] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-64] + vmovdqu xmm0, OWORD PTR [rsp+48] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+80] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-48] + vmovdqu xmm0, OWORD PTR [rsp+32] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+96] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; 
aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-32] + vmovdqu xmm0, OWORD PTR [rsp+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+112] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-16] + vmovdqu xmm0, OWORD PTR [rsp] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+128] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm4, OWORD PTR [rsi+144] + vmovdqu xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128 + vaesenc xmm8, xmm8, xmm4 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm12, xmm12, xmm4 + vaesenc xmm13, xmm13, xmm4 + vaesenc xmm14, xmm14, xmm4 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + vaesenc xmm15, xmm15, xmm4 + cmp r9d, 11 + vmovdqu xmm7, OWORD PTR [rsi+160] + jl L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r9d, 13 + vmovdqu xmm7, OWORD PTR [rsi+192] + jl L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done: + ; aesenc_last + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm2, OWORD PTR [rcx+32] + vmovdqu xmm3, OWORD PTR [rcx+48] + vpxor 
xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vmovdqu xmm2, OWORD PTR [rcx+96] + vmovdqu xmm3, OWORD PTR [rcx+112] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vpxor xmm14, xmm14, xmm2 + vpxor xmm15, xmm15, xmm3 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + ; aesenc_128_ghash - end + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_encrypt_avx2_ghash_128 +L_AES_GCM_encrypt_avx2_end_128: + vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpshufb xmm8, xmm8, xmm4 + vpshufb xmm9, xmm9, xmm4 + vpshufb xmm10, xmm10, xmm4 + vpshufb xmm11, xmm11, xmm4 + vpshufb xmm12, xmm12, xmm4 + vpshufb xmm13, xmm13, xmm4 + vpshufb xmm14, xmm14, xmm4 + vpshufb xmm15, xmm15, xmm4 + vpxor xmm8, xmm8, xmm6 + vmovdqu xmm7, OWORD PTR [rsp] + vpclmulqdq xmm5, xmm7, xmm15, 16 + vpclmulqdq xmm1, xmm7, xmm15, 1 + vpclmulqdq xmm4, xmm7, xmm15, 0 + vpclmulqdq xmm6, xmm7, xmm15, 17 + vpxor xmm5, xmm5, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+16] + vpclmulqdq xmm2, xmm7, xmm14, 16 + vpclmulqdq xmm1, xmm7, xmm14, 1 + vpclmulqdq xmm0, xmm7, xmm14, 0 + vpclmulqdq xmm3, xmm7, xmm14, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm15, OWORD PTR [rsp+32] + vmovdqu xmm7, OWORD PTR [rsp+48] + vpclmulqdq xmm2, xmm15, xmm13, 16 + vpclmulqdq xmm1, xmm15, xmm13, 1 + vpclmulqdq xmm0, xmm15, xmm13, 0 + vpclmulqdq xmm3, xmm15, xmm13, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpclmulqdq xmm2, xmm7, xmm12, 16 + vpclmulqdq xmm1, xmm7, xmm12, 1 + vpclmulqdq xmm0, xmm7, xmm12, 0 + vpclmulqdq xmm3, xmm7, xmm12, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm15, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vpclmulqdq xmm2, xmm15, xmm11, 16 + vpclmulqdq xmm1, xmm15, xmm11, 1 + vpclmulqdq xmm0, xmm15, xmm11, 0 + vpclmulqdq xmm3, xmm15, xmm11, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpclmulqdq xmm2, xmm7, xmm10, 16 + vpclmulqdq xmm1, xmm7, xmm10, 1 + vpclmulqdq xmm0, xmm7, xmm10, 0 + vpclmulqdq xmm3, xmm7, xmm10, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm15, OWORD PTR [rsp+96] + vmovdqu xmm7, OWORD PTR [rsp+112] + vpclmulqdq xmm2, xmm15, xmm9, 16 + vpclmulqdq xmm1, xmm15, xmm9, 1 + vpclmulqdq xmm0, xmm15, xmm9, 0 + vpclmulqdq xmm3, xmm15, xmm9, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpclmulqdq xmm2, xmm7, xmm8, 16 + vpclmulqdq xmm1, xmm7, xmm8, 1 + vpclmulqdq xmm0, xmm7, xmm8, 0 + vpclmulqdq xmm3, xmm7, xmm8, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpslldq xmm7, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm7 + vpxor xmm6, xmm6, xmm5 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm4, xmm2, 16 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm0 
+ vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm5, OWORD PTR [rsp] + vmovdqu xmm4, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] +L_AES_GCM_encrypt_avx2_done_128: + cmp ebx, r10d + je L_AES_GCM_encrypt_avx2_done_enc + mov r13d, r10d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx2_last_block_done + ; aesenc_block + vmovdqu xmm1, xmm4 + vpshufb xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one + vpxor xmm0, xmm0, [rsi] + vmovdqu xmm2, OWORD PTR [rsi+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+96] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rsi+144] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm4, xmm1 + cmp r9d, 11 + vmovdqu xmm1, OWORD PTR [rsi+160] + jl L_AES_GCM_encrypt_avx2_aesenc_block_last + vaesenc xmm0, xmm0, xmm1 + vmovdqu xmm2, OWORD PTR [rsi+176] + vaesenc xmm0, xmm0, xmm2 + cmp r9d, 13 + vmovdqu xmm1, OWORD PTR [rsi+192] + jl L_AES_GCM_encrypt_avx2_aesenc_block_last + vaesenc xmm0, xmm0, xmm1 + vmovdqu xmm2, OWORD PTR [rsi+208] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm1, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_aesenc_block_last: + vaesenclast xmm0, xmm0, xmm1 + vmovdqu xmm1, OWORD PTR [rdi+rbx] + vpxor xmm0, xmm0, xmm1 + vmovdqu OWORD PTR [r8+rbx], xmm0 + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm0 + add ebx, 16 + cmp ebx, r13d + jge L_AES_GCM_encrypt_avx2_last_block_ghash +L_AES_GCM_encrypt_avx2_last_block_start: + vmovdqu xmm12, OWORD PTR [rdi+rbx] + vpshufb xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one + ; aesenc_gfmul_sb + vpclmulqdq xmm2, xmm6, xmm5, 1 + vpclmulqdq xmm3, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 0 + vpclmulqdq xmm8, xmm6, xmm5, 17 + vpxor xmm11, xmm11, [rsi] + vaesenc xmm11, xmm11, [rsi+16] + vpxor xmm3, xmm3, xmm2 + vpslldq xmm2, xmm3, 8 + vpsrldq xmm3, xmm3, 8 + vaesenc xmm11, xmm11, [rsi+32] + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm11, xmm11, [rsi+48] + vaesenc xmm11, xmm11, [rsi+64] + vaesenc xmm11, xmm11, [rsi+80] + vpshufd xmm2, xmm2, 78 + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm11, xmm11, [rsi+96] + vaesenc xmm11, xmm11, [rsi+112] + vaesenc xmm11, xmm11, [rsi+128] + vpshufd xmm2, xmm2, 78 + vaesenc xmm11, xmm11, [rsi+144] + vpxor xmm8, xmm8, xmm3 + vpxor xmm2, xmm2, xmm8 + vmovdqu xmm0, OWORD PTR [rsi+160] + cmp r9d, 11 + jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm11, xmm11, [rsi+176] + vmovdqu xmm0, OWORD PTR [rsi+192] + cmp r9d, 13 + jl L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm11, xmm11, [rsi+208] + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last: + vaesenclast xmm11, xmm11, xmm0 + vpxor xmm6, xmm2, xmm1 + vpxor xmm11, xmm11, xmm12 + vmovdqu OWORD PTR [r8+rbx], xmm11 + vpshufb xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm11 + add ebx, 16 + cmp ebx, 
r13d + jl L_AES_GCM_encrypt_avx2_last_block_start +L_AES_GCM_encrypt_avx2_last_block_ghash: + ; ghash_gfmul_red + vpclmulqdq xmm10, xmm6, xmm5, 16 + vpclmulqdq xmm9, xmm6, xmm5, 1 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm10, xmm10, xmm9 + vpslldq xmm9, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm6, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm6, xmm6, xmm10 + vpxor xmm6, xmm6, xmm9 + vpxor xmm6, xmm6, xmm8 +L_AES_GCM_encrypt_avx2_last_block_done: + mov ecx, r10d + mov edx, r10d + and ecx, 15 + jz L_AES_GCM_encrypt_avx2_done_enc + ; aesenc_last15_enc + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [rsi] + vaesenc xmm4, xmm4, [rsi+16] + vaesenc xmm4, xmm4, [rsi+32] + vaesenc xmm4, xmm4, [rsi+48] + vaesenc xmm4, xmm4, [rsi+64] + vaesenc xmm4, xmm4, [rsi+80] + vaesenc xmm4, xmm4, [rsi+96] + vaesenc xmm4, xmm4, [rsi+112] + vaesenc xmm4, xmm4, [rsi+128] + vaesenc xmm4, xmm4, [rsi+144] + cmp r9d, 11 + vmovdqu xmm0, OWORD PTR [rsi+160] + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm0 + vaesenc xmm4, xmm4, [rsi+176] + cmp r9d, 13 + vmovdqu xmm0, OWORD PTR [rsi+192] + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm0 + vaesenc xmm4, xmm4, [rsi+208] + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm0 + xor ecx, ecx + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm0 +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [rsp+rcx+16], r13b + mov BYTE PTR [r8+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop +L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc: + vmovdqu xmm4, OWORD PTR [rsp+16] + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red + vpclmulqdq xmm2, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 1 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm6, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm6, xmm6, xmm2 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm0 +L_AES_GCM_encrypt_avx2_done_enc: + ; calc_tag + shl r10, 3 + shl r11, 3 + vmovq xmm0, r10 + vmovq xmm1, r11 + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm6 + ; ghash_gfmul_red + vpclmulqdq xmm4, xmm0, xmm5, 16 + vpclmulqdq xmm3, xmm0, xmm5, 1 + vpclmulqdq xmm2, xmm0, xmm5, 0 + vpxor xmm4, xmm4, xmm3 + vpslldq xmm3, xmm4, 8 + vpsrldq xmm4, xmm4, 8 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm0, xmm0, xmm5, 17 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm0, xmm0, xmm4 + vpxor xmm0, xmm0, xmm3 + vpxor xmm0, xmm0, xmm2 + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm0, xmm0, xmm15 + ; store_tag + cmp r14d, 16 + je L_AES_GCM_encrypt_avx2_store_tag_16 + xor 
rcx, rcx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_avx2_store_tag_loop: + movzx r13d, BYTE PTR [rsp+rcx] + mov BYTE PTR [r15+rcx], r13b + inc ecx + cmp ecx, r14d + jne L_AES_GCM_encrypt_avx2_store_tag_loop + jmp L_AES_GCM_encrypt_avx2_store_tag_done +L_AES_GCM_encrypt_avx2_store_tag_16: + vmovdqu OWORD PTR [r15], xmm0 +L_AES_GCM_encrypt_avx2_store_tag_done: + vzeroupper + add rsp, 160 + pop rsi + pop r14 + pop rbx + pop r15 + pop r12 + pop rdi + pop r13 + ret +AES_GCM_encrypt_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt_avx2 PROC + push r13 + push rdi + push r12 + push r14 + push rbx + push r15 + push rsi + push rbp + mov rdi, rcx + mov r12, r8 + mov rax, r9 + mov r14, QWORD PTR [rsp+104] + mov r8, rdx + mov r10d, DWORD PTR [rsp+112] + mov r11d, DWORD PTR [rsp+120] + mov ebx, DWORD PTR [rsp+128] + mov r15d, DWORD PTR [rsp+136] + mov rsi, QWORD PTR [rsp+144] + mov r9d, DWORD PTR [rsp+152] + mov rbp, QWORD PTR [rsp+160] + sub rsp, 168 + vpxor xmm4, xmm4, xmm4 + vpxor xmm6, xmm6, xmm6 + mov edx, ebx + cmp edx, 12 + je L_AES_GCM_decrypt_avx2_iv_12 + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqu xmm5, OWORD PTR [rsi] + vaesenc xmm5, xmm5, [rsi+16] + vaesenc xmm5, xmm5, [rsi+32] + vaesenc xmm5, xmm5, [rsi+48] + vaesenc xmm5, xmm5, [rsi+64] + vaesenc xmm5, xmm5, [rsi+80] + vaesenc xmm5, xmm5, [rsi+96] + vaesenc xmm5, xmm5, [rsi+112] + vaesenc xmm5, xmm5, [rsi+128] + vaesenc xmm5, xmm5, [rsi+144] + cmp r9d, 11 + vmovdqu xmm0, OWORD PTR [rsi+160] + jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [rsi+176] + cmp r9d, 13 + vmovdqu xmm0, OWORD PTR [rsi+192] + jl L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [rsi+208] + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_decrypt_avx2_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_decrypt_avx2_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx2_calc_iv_16_loop: + vmovdqu xmm0, OWORD PTR [rax+rcx] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_iv_16_loop + mov edx, ebx + cmp ecx, edx + je L_AES_GCM_decrypt_avx2_calc_iv_done +L_AES_GCM_decrypt_avx2_calc_iv_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_avx2_calc_iv_loop: + movzx r13d, BYTE PTR [rax+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_iv_loop + vmovdqu 
xmm0, OWORD PTR [rsp] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 +L_AES_GCM_decrypt_avx2_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqu xmm15, OWORD PTR [rsi] + vpxor xmm15, xmm15, xmm4 + vaesenc xmm15, xmm15, [rsi+16] + vaesenc xmm15, xmm15, [rsi+32] + vaesenc xmm15, xmm15, [rsi+48] + vaesenc xmm15, xmm15, [rsi+64] + vaesenc xmm15, xmm15, [rsi+80] + vaesenc xmm15, xmm15, [rsi+96] + vaesenc xmm15, xmm15, [rsi+112] + vaesenc xmm15, xmm15, [rsi+128] + vaesenc xmm15, xmm15, [rsi+144] + cmp r9d, 11 + vmovdqu xmm0, OWORD PTR [rsi+160] + jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm15, xmm15, xmm0 + vaesenc xmm15, xmm15, [rsi+176] + cmp r9d, 13 + vmovdqu xmm0, OWORD PTR [rsi+192] + jl L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm15, xmm15, xmm0 + vaesenc xmm15, xmm15, [rsi+208] + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast xmm15, xmm15, xmm0 + jmp L_AES_GCM_decrypt_avx2_iv_done +L_AES_GCM_decrypt_avx2_iv_12: + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one + vmovdqu xmm5, OWORD PTR [rsi] + vpblendd xmm4, xmm4, [rax], 7 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqu xmm7, OWORD PTR [rsi+16] + vpxor xmm15, xmm4, xmm5 + vaesenc xmm5, xmm5, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rsi+32] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+48] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+64] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+80] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+96] + vaesenc xmm5, xmm5, 
xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+112] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+128] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+144] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + cmp r9d, 11 + vmovdqu xmm0, OWORD PTR [rsi+160] + jl L_AES_GCM_decrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+176] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + cmp r9d, 13 + vmovdqu xmm0, OWORD PTR [rsi+192] + jl L_AES_GCM_decrypt_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+208] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm15, xmm15, xmm0 + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_decrypt_avx2_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm0 + vaesenclast xmm15, xmm15, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask +L_AES_GCM_decrypt_avx2_iv_done: + ; Additional authentication data + mov edx, r11d + cmp edx, 0 + je L_AES_GCM_decrypt_avx2_calc_aad_done + xor ecx, ecx + cmp edx, 16 + jl L_AES_GCM_decrypt_avx2_calc_aad_lt16 + and edx, 4294967280 +L_AES_GCM_decrypt_avx2_calc_aad_16_loop: + vmovdqu xmm0, OWORD PTR [r12+rcx] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_aad_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_decrypt_avx2_calc_aad_done +L_AES_GCM_decrypt_avx2_calc_aad_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_avx2_calc_aad_loop: + movzx r13d, BYTE PTR [r12+rcx] + mov BYTE PTR [rsp+rbx], r13b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_decrypt_avx2_calc_aad_loop + vmovdqu xmm0, OWORD PTR [rsp] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm6, 16 + vpclmulqdq xmm1, xmm5, xmm6, 1 + vpclmulqdq xmm0, xmm5, xmm6, 0 + vpclmulqdq xmm3, xmm5, xmm6, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm7, xmm0, xmm1 + vpxor xmm6, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm7, 31 + vpsrld xmm1, xmm6, 31 + vpslld xmm7, xmm7, 1 + vpslld xmm6, xmm6, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm6, xmm6, xmm2 + vpor xmm7, xmm7, xmm0 + vpor xmm6, xmm6, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm7, xmm2, 16 + vpshufd xmm1, xmm7, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 +L_AES_GCM_decrypt_avx2_calc_aad_done: + 
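[Editorial note, not part of the patch] The repeated ghash_gfmul_avx / ghash_mid / ghash_red sequences in this hunk (used just above to hash the AAD, and again for the IV and the length block) each perform one GHASH step: X = (X ^ block) * H in GF(2^128), reduced by the GCM polynomial x^128 + x^7 + x^2 + x + 1, using vpclmulqdq plus a shift-and-reduce. The C sketch below shows the same arithmetic bit-serially; it is illustrative only, not the wolfSSL implementation, and the function names (gf128_mul, ghash_aad) are made up for this note.

#include <stdint.h>
#include <string.h>

/* x = x * y in GF(2^128) with the GCM (reflected) bit order and polynomial. */
static void gf128_mul(uint8_t x[16], const uint8_t y[16])
{
    uint8_t z[16] = {0};     /* accumulator Z */
    uint8_t v[16];           /* running multiple V of y */
    memcpy(v, y, 16);

    for (int i = 0; i < 128; i++) {
        /* bit i of x, MSB of byte 0 first */
        if (x[i / 8] & (0x80 >> (i % 8))) {
            for (int j = 0; j < 16; j++)
                z[j] ^= v[j];
        }
        /* V = V >> 1; if a bit fell off, fold in the reduction constant */
        int carry = v[15] & 1;
        for (int j = 15; j > 0; j--)
            v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
        v[0] >>= 1;
        if (carry)
            v[0] ^= 0xe1;    /* x^128 = x^7 + x^2 + x + 1, reflected */
    }
    memcpy(x, z, 16);
}

/* GHASH over full 16-byte AAD blocks: X = (X ^ A_i) * H for each block.
 * Partial blocks are zero-padded first (the assembly's *_calc_aad_lt16 path). */
static void ghash_aad(uint8_t x[16], const uint8_t h[16],
                      const uint8_t *aad, size_t aad_len /* multiple of 16 */)
{
    for (size_t i = 0; i < aad_len; i += 16) {
        for (int j = 0; j < 16; j++)
            x[j] ^= aad[i + j];
        gf128_mul(x, h);
    }
}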
; Calculate counter and H + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpand xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpaddd xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one + vpxor xmm5, xmm5, xmm0 + xor ebx, ebx + cmp r10d, 128 + mov r13d, r10d + jl L_AES_GCM_decrypt_avx2_done_128 + and r13d, 4294967168 + vmovdqu OWORD PTR [rsp+128], xmm4 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128 + ; H ^ 1 and H ^ 2 + vpclmulqdq xmm9, xmm5, xmm5, 0 + vpclmulqdq xmm10, xmm5, xmm5, 17 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpxor xmm0, xmm10, xmm9 + vmovdqu OWORD PTR [rsp], xmm5 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 and H ^ 4 + vpclmulqdq xmm11, xmm0, xmm5, 16 + vpclmulqdq xmm10, xmm0, xmm5, 1 + vpclmulqdq xmm9, xmm0, xmm5, 0 + vpclmulqdq xmm12, xmm0, xmm5, 17 + vpclmulqdq xmm13, xmm0, xmm0, 0 + vpclmulqdq xmm14, xmm0, xmm0, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm2, xmm13, xmm14 + vpxor xmm1, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+32], xmm1 + vmovdqu OWORD PTR [rsp+48], xmm2 + ; H ^ 5 and H ^ 6 + vpclmulqdq xmm11, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 0 + vpclmulqdq xmm12, xmm1, xmm0, 17 + vpclmulqdq xmm13, xmm1, xmm1, 0 + vpclmulqdq xmm14, xmm1, xmm1, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+64], xmm7 + vmovdqu OWORD PTR [rsp+80], xmm0 + ; H ^ 7 and H ^ 8 + vpclmulqdq xmm11, xmm2, xmm1, 16 + vpclmulqdq xmm10, xmm2, xmm1, 1 + vpclmulqdq xmm9, xmm2, xmm1, 0 + vpclmulqdq xmm12, xmm2, xmm1, 17 + vpclmulqdq xmm13, xmm2, xmm2, 0 + vpclmulqdq xmm14, xmm2, xmm2, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+96], xmm7 + vmovdqu OWORD PTR [rsp+112], xmm0 +L_AES_GCM_decrypt_avx2_ghash_128: + ; aesenc_128_ghash + lea rcx, QWORD PTR [rdi+rbx] + 
lea rdx, QWORD PTR [r8+rbx] + ; aesenc_ctr + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight + vpshufb xmm15, xmm15, xmm1 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [rsi] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + ; aesenc_pclmul_1 + vmovdqu xmm1, OWORD PTR [rcx] + vmovdqu xmm0, OWORD PTR [rsi+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vmovdqu xmm2, OWORD PTR [rsp+112] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_2 + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm0, OWORD PTR [rsp+96] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+32] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+32] + vmovdqu xmm0, OWORD PTR [rsp+80] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+48] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+48] + vmovdqu xmm0, OWORD PTR [rsp+64] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+64] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu 
xmm1, OWORD PTR [rcx+64] + vmovdqu xmm0, OWORD PTR [rsp+48] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+80] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+80] + vmovdqu xmm0, OWORD PTR [rsp+32] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+96] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+96] + vmovdqu xmm0, OWORD PTR [rsp+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+112] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+112] + vmovdqu xmm0, OWORD PTR [rsp] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rsi+128] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm4, OWORD PTR [rsi+144] + vmovdqu xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128 + vaesenc xmm8, xmm8, xmm4 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm12, xmm12, xmm4 + vaesenc xmm13, xmm13, xmm4 + vaesenc xmm14, xmm14, xmm4 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + vaesenc xmm15, xmm15, xmm4 + cmp r9d, 11 + vmovdqu xmm7, OWORD PTR [rsi+160] + jl L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + 
vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r9d, 13 + vmovdqu xmm7, OWORD PTR [rsi+192] + jl L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rsi+224] +L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done: + ; aesenc_last + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm2, OWORD PTR [rcx+32] + vmovdqu xmm3, OWORD PTR [rcx+48] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vmovdqu xmm2, OWORD PTR [rcx+96] + vmovdqu xmm3, OWORD PTR [rcx+112] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vpxor xmm14, xmm14, xmm2 + vpxor xmm15, xmm15, xmm3 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + ; aesenc_128_ghash - end + add ebx, 128 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx2_ghash_128 + vmovdqu xmm5, OWORD PTR [rsp] + vmovdqu xmm4, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] +L_AES_GCM_decrypt_avx2_done_128: + cmp ebx, r10d + jge L_AES_GCM_decrypt_avx2_done_dec + mov r13d, r10d + and r13d, 4294967280 + cmp ebx, r13d + jge L_AES_GCM_decrypt_avx2_last_block_done +L_AES_GCM_decrypt_avx2_last_block_start: + vmovdqu xmm11, OWORD PTR [rdi+rbx] + vpshufb xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpshufb xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpaddd xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one + vpxor xmm12, xmm12, xmm6 + ; aesenc_gfmul_sb + vpclmulqdq xmm2, xmm12, xmm5, 1 + vpclmulqdq xmm3, xmm12, xmm5, 16 + vpclmulqdq xmm1, xmm12, xmm5, 0 + vpclmulqdq xmm8, xmm12, xmm5, 17 + vpxor xmm10, xmm10, [rsi] + vaesenc xmm10, xmm10, [rsi+16] + vpxor xmm3, xmm3, xmm2 + vpslldq xmm2, xmm3, 8 + vpsrldq xmm3, xmm3, 8 + vaesenc xmm10, xmm10, [rsi+32] + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm10, xmm10, [rsi+48] + vaesenc xmm10, xmm10, [rsi+64] + vaesenc xmm10, xmm10, [rsi+80] + vpshufd xmm2, xmm2, 78 + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm10, xmm10, [rsi+96] + vaesenc xmm10, xmm10, [rsi+112] + vaesenc xmm10, xmm10, [rsi+128] + vpshufd xmm2, xmm2, 78 + vaesenc xmm10, xmm10, [rsi+144] + vpxor xmm8, xmm8, xmm3 + vpxor xmm2, xmm2, xmm8 + vmovdqu xmm0, OWORD PTR 
[rsi+160] + cmp r9d, 11 + jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm10, xmm10, [rsi+176] + vmovdqu xmm0, OWORD PTR [rsi+192] + cmp r9d, 13 + jl L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm10, xmm10, [rsi+208] + vmovdqu xmm0, OWORD PTR [rsi+224] +L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last: + vaesenclast xmm10, xmm10, xmm0 + vpxor xmm6, xmm2, xmm1 + vpxor xmm10, xmm10, xmm11 + vmovdqu OWORD PTR [r8+rbx], xmm10 + add ebx, 16 + cmp ebx, r13d + jl L_AES_GCM_decrypt_avx2_last_block_start +L_AES_GCM_decrypt_avx2_last_block_done: + mov ecx, r10d + mov edx, r10d + and ecx, 15 + jz L_AES_GCM_decrypt_avx2_done_dec + ; aesenc_last15_dec + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpxor xmm4, xmm4, [rsi] + vaesenc xmm4, xmm4, [rsi+16] + vaesenc xmm4, xmm4, [rsi+32] + vaesenc xmm4, xmm4, [rsi+48] + vaesenc xmm4, xmm4, [rsi+64] + vaesenc xmm4, xmm4, [rsi+80] + vaesenc xmm4, xmm4, [rsi+96] + vaesenc xmm4, xmm4, [rsi+112] + vaesenc xmm4, xmm4, [rsi+128] + vaesenc xmm4, xmm4, [rsi+144] + cmp r9d, 11 + vmovdqu xmm1, OWORD PTR [rsi+160] + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm1 + vaesenc xmm4, xmm4, [rsi+176] + cmp r9d, 13 + vmovdqu xmm1, OWORD PTR [rsi+192] + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last + vaesenc xmm4, xmm4, xmm1 + vaesenc xmm4, xmm4, [rsi+208] + vmovdqu xmm1, OWORD PTR [rsi+224] +L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last: + vaesenclast xmm4, xmm4, xmm1 + xor ecx, ecx + vpxor xmm0, xmm0, xmm0 + vmovdqu OWORD PTR [rsp], xmm4 + vmovdqu OWORD PTR [rsp+16], xmm0 +L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop: + movzx r13d, BYTE PTR [rdi+rbx] + mov BYTE PTR [rsp+rcx+16], r13b + xor r13b, BYTE PTR [rsp+rcx] + mov BYTE PTR [r8+rbx], r13b + inc ebx + inc ecx + cmp ebx, edx + jl L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop + vmovdqu xmm4, OWORD PTR [rsp+16] + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm4 + ; ghash_gfmul_red + vpclmulqdq xmm2, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 1 + vpclmulqdq xmm0, xmm6, xmm5, 0 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm6, xmm6, xmm5, 17 + vpclmulqdq xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm6, xmm6, xmm2 + vpxor xmm6, xmm6, xmm1 + vpxor xmm6, xmm6, xmm0 +L_AES_GCM_decrypt_avx2_done_dec: + ; calc_tag + shl r10, 3 + shl r11, 3 + vmovq xmm0, r10 + vmovq xmm1, r11 + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm6 + ; ghash_gfmul_red + vpclmulqdq xmm4, xmm0, xmm5, 16 + vpclmulqdq xmm3, xmm0, xmm5, 1 + vpclmulqdq xmm2, xmm0, xmm5, 0 + vpxor xmm4, xmm4, xmm3 + vpslldq xmm3, xmm4, 8 + vpsrldq xmm4, xmm4, 8 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm0, xmm0, xmm5, 17 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm0, xmm0, xmm4 + vpxor xmm0, xmm0, xmm3 + vpxor xmm0, xmm0, xmm2 + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm0, xmm0, xmm15 + ; cmp_tag + cmp r15d, 16 + je L_AES_GCM_decrypt_avx2_cmp_tag_16 + xor rdx, rdx + xor rax, rax + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_decrypt_avx2_cmp_tag_loop: + movzx r13d, BYTE 
PTR [rsp+rdx] + xor r13b, BYTE PTR [r14+rdx] + or al, r13b + inc edx + cmp edx, r15d + jne L_AES_GCM_decrypt_avx2_cmp_tag_loop + cmp rax, 0 + sete al + jmp L_AES_GCM_decrypt_avx2_cmp_tag_done +L_AES_GCM_decrypt_avx2_cmp_tag_16: + vmovdqu xmm1, OWORD PTR [r14] + vpcmpeqb xmm0, xmm0, xmm1 + vpmovmskb rdx, xmm0 + ; %%edx == 0xFFFF then return 1 else => return 0 + xor eax, eax + cmp edx, 65535 + sete al +L_AES_GCM_decrypt_avx2_cmp_tag_done: + mov DWORD PTR [rbp], eax + vzeroupper + add rsp, 168 + pop rbp + pop rsi + pop r15 + pop rbx + pop r14 + pop r12 + pop rdi + pop r13 + ret +AES_GCM_decrypt_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_init_avx2 PROC + push rbx + push rdi + push rsi + push r12 + mov rdi, rcx + mov rsi, rdx + mov r10, r8 + mov r11d, r9d + mov rax, QWORD PTR [rsp+72] + mov r8, QWORD PTR [rsp+80] + mov r9, QWORD PTR [rsp+88] + sub rsp, 16 + vpxor xmm4, xmm4, xmm4 + mov edx, r11d + cmp edx, 12 + je L_AES_GCM_init_avx2_iv_12 + ; Calculate values when IV is not 12 bytes + ; H = Encrypt X(=0) + vmovdqu xmm5, OWORD PTR [rdi] + vaesenc xmm5, xmm5, [rdi+16] + vaesenc xmm5, xmm5, [rdi+32] + vaesenc xmm5, xmm5, [rdi+48] + vaesenc xmm5, xmm5, [rdi+64] + vaesenc xmm5, xmm5, [rdi+80] + vaesenc xmm5, xmm5, [rdi+96] + vaesenc xmm5, xmm5, [rdi+112] + vaesenc xmm5, xmm5, [rdi+128] + vaesenc xmm5, xmm5, [rdi+144] + cmp esi, 11 + vmovdqu xmm0, OWORD PTR [rdi+160] + jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [rdi+176] + cmp esi, 13 + vmovdqu xmm0, OWORD PTR [rdi+192] + jl L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm5, xmm5, [rdi+208] + vmovdqu xmm0, OWORD PTR [rdi+224] +L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last: + vaesenclast xmm5, xmm5, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask + ; Calc counter + ; Initialization vector + cmp edx, 0 + mov rcx, 0 + je L_AES_GCM_init_avx2_calc_iv_done + cmp edx, 16 + jl L_AES_GCM_init_avx2_calc_iv_lt16 + and edx, 4294967280 +L_AES_GCM_init_avx2_calc_iv_16_loop: + vmovdqu xmm0, OWORD PTR [r10+rcx] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm6, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm6, xmm2, 16 + vpshufd xmm1, xmm6, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_init_avx2_calc_iv_16_loop + mov edx, r11d + cmp ecx, edx + je L_AES_GCM_init_avx2_calc_iv_done +L_AES_GCM_init_avx2_calc_iv_lt16: + vpxor xmm0, xmm0, xmm0 + xor ebx, ebx + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_init_avx2_calc_iv_loop: + movzx r12d, BYTE PTR [r10+rcx] + mov BYTE PTR [rsp+rbx], r12b + inc ecx + inc ebx + cmp ecx, edx + jl L_AES_GCM_init_avx2_calc_iv_loop + vmovdqu xmm0, OWORD PTR [rsp] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + 
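[Editorial note, not part of the patch] The cmp_tag code at the end of AES_GCM_decrypt_avx2 above compares the computed tag with the caller's tag without branching on individual bytes: the partial-length path XORs each byte pair and ORs the results into a single accumulator before one final test, and the 16-byte path uses vpcmpeqb/vpmovmskb and checks for the all-ones mask 0xFFFF. A portable equivalent of the byte loop is sketched below; tag_equal_ct is a made-up name, not a wolfSSL API.

#include <stddef.h>
#include <stdint.h>

/* Returns 1 if the two tags are equal, 0 otherwise, touching every byte
 * regardless of where the first mismatch occurs. */
static int tag_equal_ct(const uint8_t *calc, const uint8_t *given, size_t len)
{
    uint8_t diff = 0;
    for (size_t i = 0; i < len; i++)
        diff |= (uint8_t)(calc[i] ^ given[i]);  /* accumulate differences */
    return diff == 0;
}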
vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm6, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm6, xmm2, 16 + vpshufd xmm1, xmm6, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 +L_AES_GCM_init_avx2_calc_iv_done: + ; T = Encrypt counter + vpxor xmm0, xmm0, xmm0 + shl edx, 3 + vmovq xmm0, rdx + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm6, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm6, xmm2, 16 + vpshufd xmm1, xmm6, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask + ; Encrypt counter + vmovdqu xmm7, OWORD PTR [rdi] + vpxor xmm7, xmm7, xmm4 + vaesenc xmm7, xmm7, [rdi+16] + vaesenc xmm7, xmm7, [rdi+32] + vaesenc xmm7, xmm7, [rdi+48] + vaesenc xmm7, xmm7, [rdi+64] + vaesenc xmm7, xmm7, [rdi+80] + vaesenc xmm7, xmm7, [rdi+96] + vaesenc xmm7, xmm7, [rdi+112] + vaesenc xmm7, xmm7, [rdi+128] + vaesenc xmm7, xmm7, [rdi+144] + cmp esi, 11 + vmovdqu xmm0, OWORD PTR [rdi+160] + jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [rdi+176] + cmp esi, 13 + vmovdqu xmm0, OWORD PTR [rdi+192] + jl L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last + vaesenc xmm7, xmm7, xmm0 + vaesenc xmm7, xmm7, [rdi+208] + vmovdqu xmm0, OWORD PTR [rdi+224] +L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last: + vaesenclast xmm7, xmm7, xmm0 + jmp L_AES_GCM_init_avx2_iv_done +L_AES_GCM_init_avx2_iv_12: + ; # Calculate values when IV is 12 bytes + ; Set counter based on IV + vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one + vmovdqu xmm5, OWORD PTR [rdi] + vpblendd xmm4, xmm4, [r10], 7 + ; H = Encrypt X(=0) and T = Encrypt counter + vmovdqu xmm6, OWORD PTR [rdi+16] + vpxor xmm7, xmm4, xmm5 + vaesenc xmm5, xmm5, xmm6 + vaesenc xmm7, xmm7, xmm6 + vmovdqu xmm0, OWORD PTR [rdi+32] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+48] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+64] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+80] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+96] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+112] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+128] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu 
xmm0, OWORD PTR [rdi+144] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + cmp esi, 11 + vmovdqu xmm0, OWORD PTR [rdi+160] + jl L_AES_GCM_init_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+176] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + cmp esi, 13 + vmovdqu xmm0, OWORD PTR [rdi+192] + jl L_AES_GCM_init_avx2_calc_iv_12_last + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+208] + vaesenc xmm5, xmm5, xmm0 + vaesenc xmm7, xmm7, xmm0 + vmovdqu xmm0, OWORD PTR [rdi+224] +L_AES_GCM_init_avx2_calc_iv_12_last: + vaesenclast xmm5, xmm5, xmm0 + vaesenclast xmm7, xmm7, xmm0 + vpshufb xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask +L_AES_GCM_init_avx2_iv_done: + vmovdqu OWORD PTR [r9], xmm7 + vpshufb xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one + vmovdqu OWORD PTR [rax], xmm5 + vmovdqu OWORD PTR [r8], xmm4 + vzeroupper + add rsp, 16 + pop r12 + pop rsi + pop rdi + pop rbx + ret +AES_GCM_init_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_aad_update_avx2 PROC + mov rax, rcx + vmovdqu xmm4, OWORD PTR [r8] + vmovdqu xmm5, OWORD PTR [r9] + xor ecx, ecx +L_AES_GCM_aad_update_avx2_16_loop: + vmovdqu xmm0, OWORD PTR [rax+rcx] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm6, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm6, xmm2, 16 + vpshufd xmm1, xmm6, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + add ecx, 16 + cmp ecx, edx + jl L_AES_GCM_aad_update_avx2_16_loop + vmovdqu OWORD PTR [r8], xmm4 + vzeroupper + ret +AES_GCM_aad_update_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_block_avx2 PROC + mov r10, r8 + mov r11, r9 + mov rax, QWORD PTR [rsp+40] + sub rsp, 152 + vmovdqu xmm3, OWORD PTR [rax] + ; aesenc_block + vmovdqu xmm1, xmm3 + vpshufb xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one + vpxor xmm0, xmm0, [rcx] + vmovdqu xmm2, OWORD PTR [rcx+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+96] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rcx+144] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm3, xmm1 + cmp edx, 11 + vmovdqu xmm1, OWORD PTR [rcx+160] + jl L_AES_GCM_encrypt_block_avx2_aesenc_block_last + vaesenc xmm0, xmm0, xmm1 + vmovdqu xmm2, OWORD PTR [rcx+176] + vaesenc xmm0, xmm0, xmm2 + cmp edx, 13 + vmovdqu xmm1, OWORD PTR [rcx+192] + jl L_AES_GCM_encrypt_block_avx2_aesenc_block_last 
+ vaesenc xmm0, xmm0, xmm1 + vmovdqu xmm2, OWORD PTR [rcx+208] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm1, OWORD PTR [rcx+224] +L_AES_GCM_encrypt_block_avx2_aesenc_block_last: + vaesenclast xmm0, xmm0, xmm1 + vmovdqu xmm1, OWORD PTR [r11] + vpxor xmm0, xmm0, xmm1 + vmovdqu OWORD PTR [r10], xmm0 + vmovdqu OWORD PTR [rax], xmm3 + vzeroupper + add rsp, 152 + ret +AES_GCM_encrypt_block_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_ghash_block_avx2 PROC + vmovdqu xmm4, OWORD PTR [rdx] + vmovdqu xmm5, OWORD PTR [r8] + vmovdqu xmm0, OWORD PTR [rcx] + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm4, xmm4, xmm0 + ; ghash_gfmul_avx + vpclmulqdq xmm2, xmm5, xmm4, 16 + vpclmulqdq xmm1, xmm5, xmm4, 1 + vpclmulqdq xmm0, xmm5, xmm4, 0 + vpclmulqdq xmm3, xmm5, xmm4, 17 + vpxor xmm2, xmm2, xmm1 + vpslldq xmm1, xmm2, 8 + vpsrldq xmm2, xmm2, 8 + vpxor xmm6, xmm0, xmm1 + vpxor xmm4, xmm3, xmm2 + ; ghash_mid + vpsrld xmm0, xmm6, 31 + vpsrld xmm1, xmm4, 31 + vpslld xmm6, xmm6, 1 + vpslld xmm4, xmm4, 1 + vpsrldq xmm2, xmm0, 12 + vpslldq xmm0, xmm0, 4 + vpslldq xmm1, xmm1, 4 + vpor xmm4, xmm4, xmm2 + vpor xmm6, xmm6, xmm0 + vpor xmm4, xmm4, xmm1 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm6, xmm2, 16 + vpshufd xmm1, xmm6, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm4, xmm4, xmm1 + vmovdqu OWORD PTR [rdx], xmm4 + vzeroupper + ret +AES_GCM_ghash_block_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_update_avx2 PROC + push r12 + push r13 + push r14 + push r15 + push rdi + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov r9d, DWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r13, QWORD PTR [rsp+96] + mov r14, QWORD PTR [rsp+104] + sub rsp, 152 + vmovdqu xmm6, OWORD PTR [r12] + vmovdqu xmm5, OWORD PTR [r13] + vmovdqu xmm4, OWORD PTR [r14] + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm0 + xor edi, edi + cmp r9d, 128 + mov r15d, r9d + jl L_AES_GCM_encrypt_update_avx2_done_128 + and r15d, 4294967168 + vmovdqu OWORD PTR [rsp+128], xmm4 + vmovdqu xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128 + ; H ^ 1 and H ^ 2 + vpclmulqdq xmm9, xmm5, xmm5, 0 + vpclmulqdq xmm10, xmm5, xmm5, 17 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpxor xmm0, xmm10, xmm9 + vmovdqu OWORD PTR [rsp], xmm5 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 and H ^ 4 + vpclmulqdq xmm11, xmm0, xmm5, 16 + vpclmulqdq xmm10, xmm0, xmm5, 1 + vpclmulqdq xmm9, xmm0, xmm5, 0 + vpclmulqdq xmm12, xmm0, xmm5, 17 + vpclmulqdq xmm13, xmm0, xmm0, 0 + vpclmulqdq xmm14, xmm0, xmm0, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm2, xmm13, xmm14 + vpxor xmm1, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+32], xmm1 + vmovdqu OWORD PTR [rsp+48], xmm2 + ; H ^ 
5 and H ^ 6 + vpclmulqdq xmm11, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 0 + vpclmulqdq xmm12, xmm1, xmm0, 17 + vpclmulqdq xmm13, xmm1, xmm1, 0 + vpclmulqdq xmm14, xmm1, xmm1, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+64], xmm7 + vmovdqu OWORD PTR [rsp+80], xmm0 + ; H ^ 7 and H ^ 8 + vpclmulqdq xmm11, xmm2, xmm1, 16 + vpclmulqdq xmm10, xmm2, xmm1, 1 + vpclmulqdq xmm9, xmm2, xmm1, 0 + vpclmulqdq xmm12, xmm2, xmm1, 17 + vpclmulqdq xmm13, xmm2, xmm2, 0 + vpclmulqdq xmm14, xmm2, xmm2, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+96], xmm7 + vmovdqu OWORD PTR [rsp+112], xmm0 + ; First 128 bytes of input + ; aesenc_128 + ; aesenc_ctr + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight + vpshufb xmm15, xmm15, xmm1 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [rax] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+16] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+32] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+48] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + 
vmovdqu xmm7, OWORD PTR [rax+64] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+80] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+96] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+112] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+128] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+144] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 11 + vmovdqu xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 13 + vmovdqu xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done: + ; aesenc_last + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [r11] + vmovdqu xmm1, OWORD PTR [r11+16] + vmovdqu xmm2, OWORD PTR [r11+32] + vmovdqu xmm3, OWORD PTR [r11+48] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [r10], xmm8 + vmovdqu OWORD PTR [r10+16], xmm9 + vmovdqu OWORD PTR [r10+32], xmm10 + vmovdqu OWORD PTR [r10+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [r11+64] + 
vmovdqu xmm1, OWORD PTR [r11+80] + vmovdqu xmm2, OWORD PTR [r11+96] + vmovdqu xmm3, OWORD PTR [r11+112] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vpxor xmm14, xmm14, xmm2 + vpxor xmm15, xmm15, xmm3 + vmovdqu OWORD PTR [r10+64], xmm12 + vmovdqu OWORD PTR [r10+80], xmm13 + vmovdqu OWORD PTR [r10+96], xmm14 + vmovdqu OWORD PTR [r10+112], xmm15 + cmp r15d, 128 + mov edi, 128 + jle L_AES_GCM_encrypt_update_avx2_end_128 + ; More 128 bytes of input +L_AES_GCM_encrypt_update_avx2_ghash_128: + ; aesenc_128_ghash + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + ; aesenc_ctr + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight + vpshufb xmm15, xmm15, xmm1 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [rax] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + ; aesenc_pclmul_1 + vmovdqu xmm1, OWORD PTR [rdx+-128] + vmovdqu xmm0, OWORD PTR [rax+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vmovdqu xmm2, OWORD PTR [rsp+112] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_2 + vmovdqu xmm1, OWORD PTR [rdx+-112] + vmovdqu xmm0, OWORD PTR [rsp+96] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+32] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-96] + vmovdqu xmm0, OWORD PTR [rsp+80] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+48] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-80] + vmovdqu xmm0, OWORD PTR [rsp+64] + vpshufb xmm1, xmm1, 
OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+64] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-64] + vmovdqu xmm0, OWORD PTR [rsp+48] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+80] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-48] + vmovdqu xmm0, OWORD PTR [rsp+32] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+96] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-32] + vmovdqu xmm0, OWORD PTR [rsp+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+112] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rdx+-16] + vmovdqu xmm0, OWORD PTR [rsp] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+128] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm4, OWORD PTR [rax+144] + vmovdqu xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128 + vaesenc xmm8, xmm8, xmm4 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vpshufd 
xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm12, xmm12, xmm4 + vaesenc xmm13, xmm13, xmm4 + vaesenc xmm14, xmm14, xmm4 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + vaesenc xmm15, xmm15, xmm4 + cmp r8d, 11 + vmovdqu xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 13 + vmovdqu xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done: + ; aesenc_last + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm2, OWORD PTR [rcx+32] + vmovdqu xmm3, OWORD PTR [rcx+48] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vmovdqu xmm2, OWORD PTR [rcx+96] + vmovdqu xmm3, OWORD PTR [rcx+112] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vpxor xmm14, xmm14, xmm2 + vpxor xmm15, xmm15, xmm3 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + ; aesenc_128_ghash - end + add edi, 128 + cmp edi, r15d + jl L_AES_GCM_encrypt_update_avx2_ghash_128 +L_AES_GCM_encrypt_update_avx2_end_128: + vmovdqu xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpshufb xmm8, xmm8, xmm4 + vpshufb xmm9, xmm9, xmm4 + vpshufb xmm10, xmm10, xmm4 + vpshufb xmm11, xmm11, xmm4 + vpshufb xmm12, xmm12, xmm4 + vpshufb xmm13, xmm13, xmm4 + vpshufb xmm14, xmm14, xmm4 + vpshufb xmm15, xmm15, xmm4 + vpxor xmm8, xmm8, xmm6 + vmovdqu xmm7, OWORD PTR [rsp] + vpclmulqdq xmm5, xmm7, xmm15, 16 + vpclmulqdq xmm1, xmm7, xmm15, 1 + vpclmulqdq xmm4, xmm7, xmm15, 0 + vpclmulqdq xmm6, xmm7, xmm15, 17 + vpxor xmm5, xmm5, xmm1 + vmovdqu xmm7, OWORD PTR [rsp+16] + vpclmulqdq xmm2, xmm7, xmm14, 16 + vpclmulqdq xmm1, xmm7, xmm14, 1 + vpclmulqdq xmm0, xmm7, xmm14, 0 + vpclmulqdq xmm3, xmm7, xmm14, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm15, OWORD 
PTR [rsp+32] + vmovdqu xmm7, OWORD PTR [rsp+48] + vpclmulqdq xmm2, xmm15, xmm13, 16 + vpclmulqdq xmm1, xmm15, xmm13, 1 + vpclmulqdq xmm0, xmm15, xmm13, 0 + vpclmulqdq xmm3, xmm15, xmm13, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpclmulqdq xmm2, xmm7, xmm12, 16 + vpclmulqdq xmm1, xmm7, xmm12, 1 + vpclmulqdq xmm0, xmm7, xmm12, 0 + vpclmulqdq xmm3, xmm7, xmm12, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm15, OWORD PTR [rsp+64] + vmovdqu xmm7, OWORD PTR [rsp+80] + vpclmulqdq xmm2, xmm15, xmm11, 16 + vpclmulqdq xmm1, xmm15, xmm11, 1 + vpclmulqdq xmm0, xmm15, xmm11, 0 + vpclmulqdq xmm3, xmm15, xmm11, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpclmulqdq xmm2, xmm7, xmm10, 16 + vpclmulqdq xmm1, xmm7, xmm10, 1 + vpclmulqdq xmm0, xmm7, xmm10, 0 + vpclmulqdq xmm3, xmm7, xmm10, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vmovdqu xmm15, OWORD PTR [rsp+96] + vmovdqu xmm7, OWORD PTR [rsp+112] + vpclmulqdq xmm2, xmm15, xmm9, 16 + vpclmulqdq xmm1, xmm15, xmm9, 1 + vpclmulqdq xmm0, xmm15, xmm9, 0 + vpclmulqdq xmm3, xmm15, xmm9, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpclmulqdq xmm2, xmm7, xmm8, 16 + vpclmulqdq xmm1, xmm7, xmm8, 1 + vpclmulqdq xmm0, xmm7, xmm8, 0 + vpclmulqdq xmm3, xmm7, xmm8, 17 + vpxor xmm2, xmm2, xmm1 + vpxor xmm6, xmm6, xmm3 + vpxor xmm5, xmm5, xmm2 + vpxor xmm4, xmm4, xmm0 + vpslldq xmm7, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vpxor xmm4, xmm4, xmm7 + vpxor xmm6, xmm6, xmm5 + ; ghash_red + vmovdqu xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpclmulqdq xmm0, xmm4, xmm2, 16 + vpshufd xmm1, xmm4, 78 + vpxor xmm1, xmm1, xmm0 + vpclmulqdq xmm0, xmm1, xmm2, 16 + vpshufd xmm1, xmm1, 78 + vpxor xmm1, xmm1, xmm0 + vpxor xmm6, xmm6, xmm1 + vmovdqu xmm5, OWORD PTR [rsp] + vmovdqu xmm4, OWORD PTR [rsp+128] +L_AES_GCM_encrypt_update_avx2_done_128: + cmp edi, r9d + je L_AES_GCM_encrypt_update_avx2_done_enc + mov r15d, r9d + and r15d, 4294967280 + cmp edi, r15d + jge L_AES_GCM_encrypt_update_avx2_last_block_done + ; aesenc_block + vmovdqu xmm1, xmm4 + vpshufb xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one + vpxor xmm0, xmm0, [rax] + vmovdqu xmm2, OWORD PTR [rax+16] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+32] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+48] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+64] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+80] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+96] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+112] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+128] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm2, OWORD PTR [rax+144] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm4, xmm1 + cmp r8d, 11 + vmovdqu xmm1, OWORD PTR [rax+160] + jl L_AES_GCM_encrypt_update_avx2_aesenc_block_last + vaesenc xmm0, xmm0, xmm1 + vmovdqu xmm2, OWORD PTR [rax+176] + vaesenc xmm0, xmm0, xmm2 + cmp r8d, 13 + vmovdqu xmm1, OWORD PTR [rax+192] + jl L_AES_GCM_encrypt_update_avx2_aesenc_block_last + vaesenc xmm0, xmm0, xmm1 + vmovdqu xmm2, OWORD PTR [rax+208] + vaesenc xmm0, xmm0, xmm2 + vmovdqu xmm1, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx2_aesenc_block_last: + vaesenclast xmm0, xmm0, xmm1 + vmovdqu xmm1, OWORD PTR [r11+rdi] + vpxor 
xmm0, xmm0, xmm1 + vmovdqu OWORD PTR [r10+rdi], xmm0 + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm0 + add edi, 16 + cmp edi, r15d + jge L_AES_GCM_encrypt_update_avx2_last_block_ghash +L_AES_GCM_encrypt_update_avx2_last_block_start: + vmovdqu xmm12, OWORD PTR [r11+rdi] + vpshufb xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one + ; aesenc_gfmul_sb + vpclmulqdq xmm2, xmm6, xmm5, 1 + vpclmulqdq xmm3, xmm6, xmm5, 16 + vpclmulqdq xmm1, xmm6, xmm5, 0 + vpclmulqdq xmm8, xmm6, xmm5, 17 + vpxor xmm11, xmm11, [rax] + vaesenc xmm11, xmm11, [rax+16] + vpxor xmm3, xmm3, xmm2 + vpslldq xmm2, xmm3, 8 + vpsrldq xmm3, xmm3, 8 + vaesenc xmm11, xmm11, [rax+32] + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm11, xmm11, [rax+48] + vaesenc xmm11, xmm11, [rax+64] + vaesenc xmm11, xmm11, [rax+80] + vpshufd xmm2, xmm2, 78 + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm11, xmm11, [rax+96] + vaesenc xmm11, xmm11, [rax+112] + vaesenc xmm11, xmm11, [rax+128] + vpshufd xmm2, xmm2, 78 + vaesenc xmm11, xmm11, [rax+144] + vpxor xmm8, xmm8, xmm3 + vpxor xmm2, xmm2, xmm8 + vmovdqu xmm0, OWORD PTR [rax+160] + cmp r8d, 11 + jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm11, xmm11, [rax+176] + vmovdqu xmm0, OWORD PTR [rax+192] + cmp r8d, 13 + jl L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm11, xmm11, [rax+208] + vmovdqu xmm0, OWORD PTR [rax+224] +L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last: + vaesenclast xmm11, xmm11, xmm0 + vpxor xmm6, xmm2, xmm1 + vpxor xmm11, xmm11, xmm12 + vmovdqu OWORD PTR [r10+rdi], xmm11 + vpshufb xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm6, xmm6, xmm11 + add edi, 16 + cmp edi, r15d + jl L_AES_GCM_encrypt_update_avx2_last_block_start +L_AES_GCM_encrypt_update_avx2_last_block_ghash: + ; ghash_gfmul_red + vpclmulqdq xmm10, xmm6, xmm5, 16 + vpclmulqdq xmm9, xmm6, xmm5, 1 + vpclmulqdq xmm8, xmm6, xmm5, 0 + vpxor xmm10, xmm10, xmm9 + vpslldq xmm9, xmm10, 8 + vpsrldq xmm10, xmm10, 8 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm6, xmm6, xmm5, 17 + vpclmulqdq xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm6, xmm6, xmm10 + vpxor xmm6, xmm6, xmm9 + vpxor xmm6, xmm6, xmm8 +L_AES_GCM_encrypt_update_avx2_last_block_done: +L_AES_GCM_encrypt_update_avx2_done_enc: + vmovdqu OWORD PTR [r12], xmm6 + vmovdqu OWORD PTR [r14], xmm4 + vzeroupper + add rsp, 152 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + ret +AES_GCM_encrypt_update_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_encrypt_final_avx2 PROC + push r12 + push r13 + mov eax, DWORD PTR [rsp+56] + mov r10, QWORD PTR [rsp+64] + mov r11, QWORD PTR [rsp+72] + sub rsp, 16 + vmovdqu xmm4, OWORD PTR [rcx] + vmovdqu xmm5, OWORD PTR [r10] + vmovdqu xmm6, OWORD PTR [r11] + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm0 + ; calc_tag + shl r9, 3 + shl rax, 3 + vmovq xmm0, r9 + vmovq xmm1, rax + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_red + vpclmulqdq xmm7, xmm0, xmm5, 16 + vpclmulqdq xmm3, xmm0, xmm5, 1 
+ vpclmulqdq xmm2, xmm0, xmm5, 0 + vpxor xmm7, xmm7, xmm3 + vpslldq xmm3, xmm7, 8 + vpsrldq xmm7, xmm7, 8 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm0, xmm0, xmm5, 17 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm0, xmm0, xmm7 + vpxor xmm0, xmm0, xmm3 + vpxor xmm0, xmm0, xmm2 + vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm0, xmm0, xmm6 + ; store_tag + cmp r8d, 16 + je L_AES_GCM_encrypt_final_avx2_store_tag_16 + xor r12, r12 + vmovdqu OWORD PTR [rsp], xmm0 +L_AES_GCM_encrypt_final_avx2_store_tag_loop: + movzx r13d, BYTE PTR [rsp+r12] + mov BYTE PTR [rdx+r12], r13b + inc r12d + cmp r12d, r8d + jne L_AES_GCM_encrypt_final_avx2_store_tag_loop + jmp L_AES_GCM_encrypt_final_avx2_store_tag_done +L_AES_GCM_encrypt_final_avx2_store_tag_16: + vmovdqu OWORD PTR [rdx], xmm0 +L_AES_GCM_encrypt_final_avx2_store_tag_done: + vzeroupper + add rsp, 16 + pop r13 + pop r12 + ret +AES_GCM_encrypt_final_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt_update_avx2 PROC + push r13 + push r12 + push r14 + push r15 + push rdi + mov rax, rcx + mov r10, r8 + mov r8d, edx + mov r11, r9 + mov r9d, DWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + mov r14, QWORD PTR [rsp+96] + mov r15, QWORD PTR [rsp+104] + sub rsp, 168 + vmovdqu xmm6, OWORD PTR [r12] + vmovdqu xmm5, OWORD PTR [r14] + vmovdqu xmm4, OWORD PTR [r15] + ; Calculate H + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm0 + xor edi, edi + cmp r9d, 128 + mov r13d, r9d + jl L_AES_GCM_decrypt_update_avx2_done_128 + and r13d, 4294967168 + vmovdqu OWORD PTR [rsp+128], xmm4 + vmovdqu OWORD PTR [rsp+144], xmm15 + vmovdqu xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128 + ; H ^ 1 and H ^ 2 + vpclmulqdq xmm9, xmm5, xmm5, 0 + vpclmulqdq xmm10, xmm5, xmm5, 17 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpclmulqdq xmm8, xmm9, xmm3, 16 + vpshufd xmm9, xmm9, 78 + vpxor xmm9, xmm9, xmm8 + vpxor xmm0, xmm10, xmm9 + vmovdqu OWORD PTR [rsp], xmm5 + vmovdqu OWORD PTR [rsp+16], xmm0 + ; H ^ 3 and H ^ 4 + vpclmulqdq xmm11, xmm0, xmm5, 16 + vpclmulqdq xmm10, xmm0, xmm5, 1 + vpclmulqdq xmm9, xmm0, xmm5, 0 + vpclmulqdq xmm12, xmm0, xmm5, 17 + vpclmulqdq xmm13, xmm0, xmm0, 0 + vpclmulqdq xmm14, xmm0, xmm0, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm2, xmm13, xmm14 + vpxor xmm1, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+32], xmm1 + vmovdqu OWORD PTR [rsp+48], xmm2 + ; H ^ 5 and H ^ 6 + vpclmulqdq xmm11, xmm1, xmm0, 16 + vpclmulqdq xmm10, xmm1, xmm0, 1 + vpclmulqdq xmm9, xmm1, xmm0, 0 + vpclmulqdq xmm12, xmm1, xmm0, 17 + vpclmulqdq xmm13, xmm1, xmm1, 0 + vpclmulqdq xmm14, xmm1, xmm1, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + 
vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+64], xmm7 + vmovdqu OWORD PTR [rsp+80], xmm0 + ; H ^ 7 and H ^ 8 + vpclmulqdq xmm11, xmm2, xmm1, 16 + vpclmulqdq xmm10, xmm2, xmm1, 1 + vpclmulqdq xmm9, xmm2, xmm1, 0 + vpclmulqdq xmm12, xmm2, xmm1, 17 + vpclmulqdq xmm13, xmm2, xmm2, 0 + vpclmulqdq xmm14, xmm2, xmm2, 17 + vpxor xmm11, xmm11, xmm10 + vpslldq xmm10, xmm11, 8 + vpsrldq xmm11, xmm11, 8 + vpxor xmm10, xmm10, xmm9 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm10, xmm10, xmm9 + vpxor xmm13, xmm13, xmm8 + vpclmulqdq xmm9, xmm10, xmm3, 16 + vpclmulqdq xmm8, xmm13, xmm3, 16 + vpshufd xmm10, xmm10, 78 + vpshufd xmm13, xmm13, 78 + vpxor xmm12, xmm12, xmm11 + vpxor xmm13, xmm13, xmm8 + vpxor xmm10, xmm10, xmm12 + vpxor xmm0, xmm13, xmm14 + vpxor xmm7, xmm10, xmm9 + vmovdqu OWORD PTR [rsp+96], xmm7 + vmovdqu OWORD PTR [rsp+112], xmm0 +L_AES_GCM_decrypt_update_avx2_ghash_128: + ; aesenc_128_ghash + lea rcx, QWORD PTR [r11+rdi] + lea rdx, QWORD PTR [r10+rdi] + ; aesenc_ctr + vmovdqu xmm0, OWORD PTR [rsp+128] + vmovdqu xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpaddd xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one + vpshufb xmm8, xmm0, xmm1 + vpaddd xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two + vpshufb xmm9, xmm9, xmm1 + vpaddd xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three + vpshufb xmm10, xmm10, xmm1 + vpaddd xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four + vpshufb xmm11, xmm11, xmm1 + vpaddd xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five + vpshufb xmm12, xmm12, xmm1 + vpaddd xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six + vpshufb xmm13, xmm13, xmm1 + vpaddd xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven + vpshufb xmm14, xmm14, xmm1 + vpaddd xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight + vpshufb xmm15, xmm15, xmm1 + ; aesenc_xor + vmovdqu xmm7, OWORD PTR [rax] + vmovdqu OWORD PTR [rsp+128], xmm0 + vpxor xmm8, xmm8, xmm7 + vpxor xmm9, xmm9, xmm7 + vpxor xmm10, xmm10, xmm7 + vpxor xmm11, xmm11, xmm7 + vpxor xmm12, xmm12, xmm7 + vpxor xmm13, xmm13, xmm7 + vpxor xmm14, xmm14, xmm7 + vpxor xmm15, xmm15, xmm7 + ; aesenc_pclmul_1 + vmovdqu xmm1, OWORD PTR [rcx] + vmovdqu xmm0, OWORD PTR [rax+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vmovdqu xmm2, OWORD PTR [rsp+112] + vpxor xmm1, xmm1, xmm6 + vpclmulqdq xmm5, xmm1, xmm2, 16 + vpclmulqdq xmm3, xmm1, xmm2, 1 + vpclmulqdq xmm6, xmm1, xmm2, 0 + vpclmulqdq xmm7, xmm1, xmm2, 17 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_2 + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm0, OWORD PTR [rsp+96] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+32] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 
+ vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+32] + vmovdqu xmm0, OWORD PTR [rsp+80] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+48] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+48] + vmovdqu xmm0, OWORD PTR [rsp+64] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+64] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+64] + vmovdqu xmm0, OWORD PTR [rsp+48] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+80] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+80] + vmovdqu xmm0, OWORD PTR [rsp+32] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+96] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+96] + vmovdqu xmm0, OWORD PTR [rsp+16] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+112] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_n + vmovdqu xmm1, OWORD PTR [rcx+112] + vmovdqu xmm0, OWORD PTR [rsp] + vpshufb xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpxor xmm5, xmm5, xmm2 + vpclmulqdq xmm2, xmm1, xmm0, 16 + vpxor xmm5, xmm5, xmm3 + 
vpclmulqdq xmm3, xmm1, xmm0, 1 + vpxor xmm6, xmm6, xmm4 + vpclmulqdq xmm4, xmm1, xmm0, 0 + vpclmulqdq xmm1, xmm1, xmm0, 17 + vmovdqu xmm0, OWORD PTR [rax+128] + vpxor xmm7, xmm7, xmm1 + vaesenc xmm8, xmm8, xmm0 + vaesenc xmm9, xmm9, xmm0 + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm11, xmm11, xmm0 + vaesenc xmm12, xmm12, xmm0 + vaesenc xmm13, xmm13, xmm0 + vaesenc xmm14, xmm14, xmm0 + vaesenc xmm15, xmm15, xmm0 + ; aesenc_pclmul_l + vpxor xmm5, xmm5, xmm2 + vpxor xmm6, xmm6, xmm4 + vpxor xmm5, xmm5, xmm3 + vpslldq xmm1, xmm5, 8 + vpsrldq xmm5, xmm5, 8 + vmovdqu xmm4, OWORD PTR [rax+144] + vmovdqu xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128 + vaesenc xmm8, xmm8, xmm4 + vpxor xmm6, xmm6, xmm1 + vpxor xmm7, xmm7, xmm5 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm9, xmm9, xmm4 + vaesenc xmm10, xmm10, xmm4 + vaesenc xmm11, xmm11, xmm4 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpclmulqdq xmm3, xmm6, xmm0, 16 + vaesenc xmm12, xmm12, xmm4 + vaesenc xmm13, xmm13, xmm4 + vaesenc xmm14, xmm14, xmm4 + vpshufd xmm6, xmm6, 78 + vpxor xmm6, xmm6, xmm3 + vpxor xmm6, xmm6, xmm7 + vaesenc xmm15, xmm15, xmm4 + cmp r8d, 11 + vmovdqu xmm7, OWORD PTR [rax+160] + jl L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+176] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + cmp r8d, 13 + vmovdqu xmm7, OWORD PTR [rax+192] + jl L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+208] + vaesenc xmm8, xmm8, xmm7 + vaesenc xmm9, xmm9, xmm7 + vaesenc xmm10, xmm10, xmm7 + vaesenc xmm11, xmm11, xmm7 + vaesenc xmm12, xmm12, xmm7 + vaesenc xmm13, xmm13, xmm7 + vaesenc xmm14, xmm14, xmm7 + vaesenc xmm15, xmm15, xmm7 + vmovdqu xmm7, OWORD PTR [rax+224] +L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done: + ; aesenc_last + vaesenclast xmm8, xmm8, xmm7 + vaesenclast xmm9, xmm9, xmm7 + vaesenclast xmm10, xmm10, xmm7 + vaesenclast xmm11, xmm11, xmm7 + vmovdqu xmm0, OWORD PTR [rcx] + vmovdqu xmm1, OWORD PTR [rcx+16] + vmovdqu xmm2, OWORD PTR [rcx+32] + vmovdqu xmm3, OWORD PTR [rcx+48] + vpxor xmm8, xmm8, xmm0 + vpxor xmm9, xmm9, xmm1 + vpxor xmm10, xmm10, xmm2 + vpxor xmm11, xmm11, xmm3 + vmovdqu OWORD PTR [rdx], xmm8 + vmovdqu OWORD PTR [rdx+16], xmm9 + vmovdqu OWORD PTR [rdx+32], xmm10 + vmovdqu OWORD PTR [rdx+48], xmm11 + vaesenclast xmm12, xmm12, xmm7 + vaesenclast xmm13, xmm13, xmm7 + vaesenclast xmm14, xmm14, xmm7 + vaesenclast xmm15, xmm15, xmm7 + vmovdqu xmm0, OWORD PTR [rcx+64] + vmovdqu xmm1, OWORD PTR [rcx+80] + vmovdqu xmm2, OWORD PTR [rcx+96] + vmovdqu xmm3, OWORD PTR [rcx+112] + vpxor xmm12, xmm12, xmm0 + vpxor xmm13, xmm13, xmm1 + vpxor xmm14, xmm14, xmm2 + vpxor xmm15, xmm15, xmm3 + vmovdqu OWORD PTR [rdx+64], xmm12 + vmovdqu OWORD PTR [rdx+80], xmm13 + vmovdqu OWORD PTR [rdx+96], xmm14 + vmovdqu OWORD PTR [rdx+112], xmm15 + ; aesenc_128_ghash - end + add edi, 128 + cmp edi, r13d + jl 
L_AES_GCM_decrypt_update_avx2_ghash_128 + vmovdqu xmm5, OWORD PTR [rsp] + vmovdqu xmm4, OWORD PTR [rsp+128] + vmovdqu xmm15, OWORD PTR [rsp+144] +L_AES_GCM_decrypt_update_avx2_done_128: + cmp edi, r9d + jge L_AES_GCM_decrypt_update_avx2_done_dec + mov r13d, r9d + and r13d, 4294967280 + cmp edi, r13d + jge L_AES_GCM_decrypt_update_avx2_last_block_done +L_AES_GCM_decrypt_update_avx2_last_block_start: + vmovdqu xmm11, OWORD PTR [r11+rdi] + vpshufb xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64 + vpshufb xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask + vpaddd xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one + vpxor xmm12, xmm12, xmm6 + ; aesenc_gfmul_sb + vpclmulqdq xmm2, xmm12, xmm5, 1 + vpclmulqdq xmm3, xmm12, xmm5, 16 + vpclmulqdq xmm1, xmm12, xmm5, 0 + vpclmulqdq xmm8, xmm12, xmm5, 17 + vpxor xmm10, xmm10, [rax] + vaesenc xmm10, xmm10, [rax+16] + vpxor xmm3, xmm3, xmm2 + vpslldq xmm2, xmm3, 8 + vpsrldq xmm3, xmm3, 8 + vaesenc xmm10, xmm10, [rax+32] + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm10, xmm10, [rax+48] + vaesenc xmm10, xmm10, [rax+64] + vaesenc xmm10, xmm10, [rax+80] + vpshufd xmm2, xmm2, 78 + vpxor xmm2, xmm2, xmm1 + vpclmulqdq xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vaesenc xmm10, xmm10, [rax+96] + vaesenc xmm10, xmm10, [rax+112] + vaesenc xmm10, xmm10, [rax+128] + vpshufd xmm2, xmm2, 78 + vaesenc xmm10, xmm10, [rax+144] + vpxor xmm8, xmm8, xmm3 + vpxor xmm2, xmm2, xmm8 + vmovdqu xmm0, OWORD PTR [rax+160] + cmp r8d, 11 + jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm10, xmm10, [rax+176] + vmovdqu xmm0, OWORD PTR [rax+192] + cmp r8d, 13 + jl L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last + vaesenc xmm10, xmm10, xmm0 + vaesenc xmm10, xmm10, [rax+208] + vmovdqu xmm0, OWORD PTR [rax+224] +L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last: + vaesenclast xmm10, xmm10, xmm0 + vpxor xmm6, xmm2, xmm1 + vpxor xmm10, xmm10, xmm11 + vmovdqu OWORD PTR [r10+rdi], xmm10 + add edi, 16 + cmp edi, r13d + jl L_AES_GCM_decrypt_update_avx2_last_block_start +L_AES_GCM_decrypt_update_avx2_last_block_done: +L_AES_GCM_decrypt_update_avx2_done_dec: + vmovdqu OWORD PTR [r12], xmm6 + vmovdqu OWORD PTR [r15], xmm4 + vzeroupper + add rsp, 168 + pop rdi + pop r15 + pop r14 + pop r12 + pop r13 + ret +AES_GCM_decrypt_update_avx2 ENDP +_text ENDS +_text SEGMENT READONLY PARA +AES_GCM_decrypt_final_avx2 PROC + push r12 + push r13 + push r14 + mov eax, DWORD PTR [rsp+64] + mov r10, QWORD PTR [rsp+72] + mov r11, QWORD PTR [rsp+80] + mov r12, QWORD PTR [rsp+88] + sub rsp, 16 + vmovdqu xmm4, OWORD PTR [rcx] + vmovdqu xmm5, OWORD PTR [r10] + vmovdqu xmm6, OWORD PTR [r11] + vpsrlq xmm1, xmm5, 63 + vpsllq xmm0, xmm5, 1 + vpslldq xmm1, xmm1, 8 + vpor xmm0, xmm0, xmm1 + vpshufd xmm5, xmm5, 255 + vpsrad xmm5, xmm5, 31 + vpand xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128 + vpxor xmm5, xmm5, xmm0 + ; calc_tag + shl r9, 3 + shl rax, 3 + vmovq xmm0, r9 + vmovq xmm1, rax + vpunpcklqdq xmm0, xmm0, xmm1 + vpxor xmm0, xmm0, xmm4 + ; ghash_gfmul_red + vpclmulqdq xmm7, xmm0, xmm5, 16 + vpclmulqdq xmm3, xmm0, xmm5, 1 + vpclmulqdq xmm2, xmm0, xmm5, 0 + vpxor xmm7, xmm7, xmm3 + vpslldq xmm3, xmm7, 8 + vpsrldq xmm7, xmm7, 8 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm0, xmm0, xmm5, 17 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm3, xmm3, xmm2 + vpclmulqdq xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16 + vpshufd xmm3, xmm3, 78 + vpxor xmm0, xmm0, xmm7 + vpxor 
xmm0, xmm0, xmm3
+        vpxor xmm0, xmm0, xmm2
+        vpshufb xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor xmm0, xmm0, xmm6
+        ; cmp_tag
+        cmp r8d, 16
+        je L_AES_GCM_decrypt_final_avx2_cmp_tag_16
+        xor r13, r13
+        xor r10, r10
+        vmovdqu OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
+        movzx r14d, BYTE PTR [rsp+r13]
+        xor r14b, BYTE PTR [rdx+r13]
+        or r10b, r14b
+        inc r13d
+        cmp r13d, r8d
+        jne L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
+        cmp r10, 0
+        sete r10b
+        jmp L_AES_GCM_decrypt_final_avx2_cmp_tag_done
+L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
+        vmovdqu xmm1, OWORD PTR [rdx]
+        vpcmpeqb xmm0, xmm0, xmm1
+        vpmovmskb r13, xmm0
+        ; r13d == 0xFFFF then return 1 else => return 0
+        xor r10d, r10d
+        cmp r13d, 65535
+        sete r10b
+L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
+        mov DWORD PTR [r12], r10d
+        vzeroupper
+        add rsp, 16
+        pop r14
+        pop r13
+        pop r12
+        ret
+AES_GCM_decrypt_final_avx2 ENDP
+_text ENDS
+ENDIF
+END
diff --git a/wolfcrypt/src/include.am b/wolfcrypt/src/include.am
index c29887a4c..5dfec3714 100644
--- a/wolfcrypt/src/include.am
+++ b/wolfcrypt/src/include.am
@@ -14,6 +14,7 @@ EXTRA_DIST += wolfcrypt/src/misc.c
 EXTRA_DIST += wolfcrypt/src/evp.c
 EXTRA_DIST += wolfcrypt/src/asm.c
 EXTRA_DIST += wolfcrypt/src/aes_asm.asm
+EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm
 EXTRA_DIST += wolfcrypt/src/wc_dsp.c
 EXTRA_DIST += wolfcrypt/src/sp_dsp32.c
 EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm
diff --git a/wolfcrypt/src/sp_int.c b/wolfcrypt/src/sp_int.c
index 6f5b455ff..fdc15a912 100644
--- a/wolfcrypt/src/sp_int.c
+++ b/wolfcrypt/src/sp_int.c
@@ -618,7 +618,12 @@ This library provides single precision (SP) integer math functions.
     while (0)
 /* Index of highest bit set. */
 #define SP_ASM_HI_BIT_SET_IDX(va, vi)    \
-    vi = _BitScanReverse64(va)
+    do {                                 \
+        unsigned long idx;               \
+        _BitScanReverse64(&idx, va);     \
+        vi = idx;                        \
+    }                                    \
+    while (0)
 #endif

 #if !defined(WOLFSSL_SP_DIV_WORD_HALF) && (!defined(_MSC_VER) || \
diff --git a/wolfcrypt/src/sp_x86_64_asm.asm b/wolfcrypt/src/sp_x86_64_asm.asm
index 7e4d39362..66a7f003b 100644
--- a/wolfcrypt/src/sp_x86_64_asm.asm
+++ b/wolfcrypt/src/sp_x86_64_asm.asm
@@ -20598,7 +20598,7 @@ _text SEGMENT READONLY PARA
 sp_2048_lshift_32 PROC
         push r12
         push r13
-        mov rcx, r8
+        mov cl, r8b
         mov rax, rcx
         mov r12, 0
         mov r13, QWORD PTR [rdx+216]
@@ -40446,7 +40446,7 @@ _text SEGMENT READONLY PARA
 sp_3072_lshift_48 PROC
         push r12
         push r13
-        mov rcx, r8
+        mov cl, r8b
         mov rax, rcx
         mov r12, 0
         mov r13, QWORD PTR [rdx+344]
@@ -53973,7 +53973,7 @@ _text SEGMENT READONLY PARA
 sp_4096_lshift_64 PROC
         push r12
         push r13
-        mov rcx, r8
+        mov cl, r8b
         mov rax, rcx
         mov r12, 0
         mov r13, QWORD PTR [rdx+472]
@@ -67878,7 +67878,7 @@ _text SEGMENT READONLY PARA
 sp_521_lshift_9 PROC
         push r12
         push r13
-        mov rcx, r8
+        mov cl, r8b
         mov rax, rcx
         mov r12, 0
         mov r13, QWORD PTR [rdx+32]
@@ -67925,7 +67925,7 @@ _text SEGMENT READONLY PARA
 sp_521_lshift_18 PROC
         push r12
         push r13
-        mov rcx, r8
+        mov cl, r8b
        mov rax, rcx
         mov r12, 0
         mov r13, QWORD PTR [rdx+104]
diff --git a/wolfssl.vcxproj b/wolfssl.vcxproj
index 442710d8f..e3a80499f 100644
--- a/wolfssl.vcxproj
+++ b/wolfssl.vcxproj
@@ -362,6 +362,20 @@ $(OutDir)%(Filename).obj $(IntDir)%(Filename).obj + + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + $(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false + false + ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity) + ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity) + 
$(OutDir)%(Filename).obj + $(IntDir)%(Filename).obj + false false
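
Note on the sp_int.c hunk above: the old macro body "vi = _BitScanReverse64(va)" misused the MSVC intrinsic. _BitScanReverse64() does not return the bit index; it returns a nonzero "found" flag and writes the index of the highest set bit through its first (pointer) argument, which is why the patch switches to a do/while block with a local unsigned long idx. The snippet below is a minimal, MSVC-on-x64-only sketch of the corrected pattern; it is illustrative rather than part of the patch, and the variable names are arbitrary.

#include <intrin.h>
#include <stdio.h>

/* Illustrative only: mirrors the corrected SP_ASM_HI_BIT_SET_IDX pattern. */
int main(void)
{
    unsigned __int64 va = 0x0000080000000000ULL; /* only bit 43 is set */
    unsigned long idx = 0;
    int vi = -1;

    /* _BitScanReverse64() returns nonzero when va has any bit set and
     * stores the index of the highest set bit in idx. */
    if (_BitScanReverse64(&idx, va)) {
        vi = (int)idx;
    }
    printf("highest set bit: %d\n", vi); /* prints 43 */
    return 0;
}

The patched macro ignores the return flag; the sketch checks it only to keep the example self-contained.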