diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 4178e18f7..9040e3487 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -3580,169 +3580,8 @@ static const __m128i EIGHT = M128_INIT(0x0, 0x8); static const __m128i BSWAP_EPI64 = M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f); static const __m128i BSWAP_MASK = M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607); -#define aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T) \ -do \ -{ \ - word32 iv12[4]; \ - iv12[0] = *(word32*)&ivec[0]; \ - iv12[1] = *(word32*)&ivec[4]; \ - iv12[2] = *(word32*)&ivec[8]; \ - iv12[3] = 0x01000000; \ - Y = _mm_loadu_si128((__m128i*)iv12); \ - \ - /* (Compute E[ZERO, KS] and E[Y0, KS] together */ \ - tmp1 = _mm_load_si128(&KEY[0]); \ - tmp2 = _mm_xor_si128(Y, KEY[0]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); \ - lastKey = KEY[10]; \ - if (nr > 10) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); \ - lastKey = KEY[12]; \ - if (nr > 12) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); \ - lastKey = KEY[14]; \ - } \ - } \ - H = _mm_aesenclast_si128(tmp1, lastKey); \ - T = _mm_aesenclast_si128(tmp2, lastKey); \ - H = _mm_shuffle_epi8(H, BSWAP_MASK); \ -} \ -while (0) - -#define aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T) \ -do \ -{ \ - if (ibytes % 16) { \ - i = ibytes / 16; \ - for (j=0; j < (int)(ibytes%16); j++) \ - ((unsigned char*)&last_block)[j] = ivec[i*16+j]; \ - } \ - tmp1 = _mm_load_si128(&KEY[0]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ - lastKey = KEY[10]; \ - if (nr > 10) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ - lastKey = KEY[12]; \ - if (nr > 12) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ - lastKey = KEY[14]; \ - } \ - } \ - H = _mm_aesenclast_si128(tmp1, lastKey); \ - H = _mm_shuffle_epi8(H, BSWAP_MASK); \ - Y = _mm_setzero_si128(); \ - for (i=0; i < (int)(ibytes/16); i++) { \ - tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); \ - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); \ - Y = _mm_xor_si128(Y, tmp1); \ - Y = gfmul_sw(Y, H); \ - } \ - if (ibytes % 16) { \ - tmp1 = last_block; \ - tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); \ - Y = _mm_xor_si128(Y, tmp1); \ - Y = 
gfmul_sw(Y, H); \ - } \ - tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); \ - tmp1 = _mm_insert_epi64(tmp1, 0, 1); \ - Y = _mm_xor_si128(Y, tmp1); \ - Y = gfmul_sw(Y, H); \ - Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ \ - tmp1 = _mm_xor_si128(Y, KEY[0]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ - lastKey = KEY[10]; \ - if (nr > 10) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ - lastKey = KEY[12]; \ - if (nr > 12) { \ - tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ - lastKey = KEY[14]; \ - } \ - } \ - T = _mm_aesenclast_si128(tmp1, lastKey); \ -} \ -while (0) - -#define AES_ENC_8(j) \ - tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); \ - tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); \ - tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); \ - tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); \ - tmp5 = _mm_aesenc_si128(tmp5, KEY[j]); \ - tmp6 = _mm_aesenc_si128(tmp6, KEY[j]); \ - tmp7 = _mm_aesenc_si128(tmp7, KEY[j]); \ - tmp8 = _mm_aesenc_si128(tmp8, KEY[j]); - -#define AES_ENC_LAST_8() \ - tmp1 =_mm_aesenclast_si128(tmp1, lastKey); \ - tmp2 =_mm_aesenclast_si128(tmp2, lastKey); \ - tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); \ - tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); \ - tmp3 =_mm_aesenclast_si128(tmp3, lastKey); \ - tmp4 =_mm_aesenclast_si128(tmp4, lastKey); \ - tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); \ - tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); \ - tmp5 =_mm_aesenclast_si128(tmp5, lastKey); \ - tmp6 =_mm_aesenclast_si128(tmp6, lastKey); \ - tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); \ - tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); \ - tmp7 =_mm_aesenclast_si128(tmp7, lastKey); \ - tmp8 =_mm_aesenclast_si128(tmp8, lastKey); \ - tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); \ - tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); \ - _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); - +#ifndef _MSC_VER #define _VAR(a) ""#a"" #define VAR(a) _VAR(a) @@ -6835,6 +6674,1080 @@ static void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, #endif /* HAVE_INTEL_AVX2 */ #endif /* HAVE_INTEL_AVX1 */ #endif /* HAVE_AES_DECRYPT */ + +#else /* _MSC_VER */ +/* The following are for MSC based builds which do not allow + * inline assembly. Intrinsic functions are used instead. 
*/ + +#define aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T) \ +do \ +{ \ + word32 iv12[4]; \ + iv12[0] = *(word32*)&ivec[0]; \ + iv12[1] = *(word32*)&ivec[4]; \ + iv12[2] = *(word32*)&ivec[8]; \ + iv12[3] = 0x01000000; \ + Y = _mm_loadu_si128((__m128i*)iv12); \ + \ + /* (Compute E[ZERO, KS] and E[Y0, KS] together */ \ + tmp1 = _mm_load_si128(&KEY[0]); \ + tmp2 = _mm_xor_si128(Y, KEY[0]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[9]); \ + lastKey = KEY[10]; \ + if (nr > 10) { \ + tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ + tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); \ + lastKey = KEY[12]; \ + if (nr > 12) { \ + tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ + tmp2 = _mm_aesenc_si128(tmp2, lastKey); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); \ + lastKey = KEY[14]; \ + } \ + } \ + H = _mm_aesenclast_si128(tmp1, lastKey); \ + T = _mm_aesenclast_si128(tmp2, lastKey); \ + H = _mm_shuffle_epi8(H, BSWAP_MASK); \ +} \ +while (0) + +#define aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T) \ +do \ +{ \ + if (ibytes % 16) { \ + i = ibytes / 16; \ + for (j=0; j < (int)(ibytes%16); j++) \ + ((unsigned char*)&last_block)[j] = ivec[i*16+j]; \ + } \ + tmp1 = _mm_load_si128(&KEY[0]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ + lastKey = KEY[10]; \ + if (nr > 10) { \ + tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ + lastKey = KEY[12]; \ + if (nr > 12) { \ + tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ + lastKey = KEY[14]; \ + } \ + } \ + H = _mm_aesenclast_si128(tmp1, lastKey); \ + H = _mm_shuffle_epi8(H, BSWAP_MASK); \ + Y = _mm_setzero_si128(); \ + for (i=0; i < (int)(ibytes/16); i++) { \ + tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); \ + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); \ + Y = _mm_xor_si128(Y, tmp1); \ + Y = gfmul_sw(Y, H); \ + } \ + if (ibytes % 16) { \ + tmp1 = last_block; \ + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); \ + Y = _mm_xor_si128(Y, tmp1); \ + Y = gfmul_sw(Y, H); \ + } \ + tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); \ + tmp1 = _mm_insert_epi64(tmp1, 0, 1); \ + Y = _mm_xor_si128(Y, tmp1); \ + Y = gfmul_sw(Y, H); \ + Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */ \ + tmp1 = _mm_xor_si128(Y, KEY[0]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); \ + tmp1 = _mm_aesenc_si128(tmp1, 
KEY[3]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); \ + lastKey = KEY[10]; \ + if (nr > 10) { \ + tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); \ + lastKey = KEY[12]; \ + if (nr > 12) { \ + tmp1 = _mm_aesenc_si128(tmp1, lastKey); \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); \ + lastKey = KEY[14]; \ + } \ + } \ + T = _mm_aesenclast_si128(tmp1, lastKey); \ +} \ +while (0) + +#define AES_ENC_8(j) \ + tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); \ + tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); \ + tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); \ + tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); \ + tmp5 = _mm_aesenc_si128(tmp5, KEY[j]); \ + tmp6 = _mm_aesenc_si128(tmp6, KEY[j]); \ + tmp7 = _mm_aesenc_si128(tmp7, KEY[j]); \ + tmp8 = _mm_aesenc_si128(tmp8, KEY[j]); + +#define AES_ENC_LAST_8() \ + tmp1 =_mm_aesenclast_si128(tmp1, lastKey); \ + tmp2 =_mm_aesenclast_si128(tmp2, lastKey); \ + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*8+0])); \ + tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*8+1])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2); \ + tmp3 =_mm_aesenclast_si128(tmp3, lastKey); \ + tmp4 =_mm_aesenclast_si128(tmp4, lastKey); \ + tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*8+2])); \ + tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*8+3])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4); \ + tmp5 =_mm_aesenclast_si128(tmp5, lastKey); \ + tmp6 =_mm_aesenclast_si128(tmp6, lastKey); \ + tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)in)[i*8+4])); \ + tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)in)[i*8+5])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6); \ + tmp7 =_mm_aesenclast_si128(tmp7, lastKey); \ + tmp8 =_mm_aesenclast_si128(tmp8, lastKey); \ + tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)in)[i*8+6])); \ + tmp8 = _mm_xor_si128(tmp8, _mm_loadu_si128(&((__m128i*)in)[i*8+7])); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7); \ + _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8); + + +static __m128i gfmul_sw(__m128i a, __m128i b) +{ + __m128i r, t1, t2, t3, t4, t5, t6, t7; + t2 = _mm_shuffle_epi32(b, 78); + t3 = _mm_shuffle_epi32(a, 78); + t2 = _mm_xor_si128(t2, b); + t3 = _mm_xor_si128(t3, a); + t4 = _mm_clmulepi64_si128(b, a, 0x11); + t1 = _mm_clmulepi64_si128(b, a, 0x00); + t2 = _mm_clmulepi64_si128(t2, t3, 0x00); + t2 = _mm_xor_si128(t2, t1); + t2 = _mm_xor_si128(t2, t4); + t3 = _mm_slli_si128(t2, 8); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + t4 = _mm_xor_si128(t4, t2); + + t5 = _mm_srli_epi32(t1, 31); + t6 = _mm_srli_epi32(t4, 31); + t1 = _mm_slli_epi32(t1, 1); + t4 = _mm_slli_epi32(t4, 1); + t7 = _mm_srli_si128(t5, 12); + t5 = _mm_slli_si128(t5, 4); + t6 = _mm_slli_si128(t6, 4); + t4 = _mm_or_si128(t4, t7); + t1 = _mm_or_si128(t1, t5); + t4 = _mm_or_si128(t4, t6); + + t5 = _mm_slli_epi32(t1, 31); + t6 = _mm_slli_epi32(t1, 30); + t7 = _mm_slli_epi32(t1, 25); + t5 = _mm_xor_si128(t5, t6); + t5 = _mm_xor_si128(t5, t7); + + t6 = _mm_srli_si128(t5, 4); + t5 = _mm_slli_si128(t5, 12); + t1 = _mm_xor_si128(t1, t5); + t7 = _mm_srli_epi32(t1, 1); + t3 = _mm_srli_epi32(t1, 2); + t2 = 
_mm_srli_epi32(t1, 7); + + t7 = _mm_xor_si128(t7, t3); + t7 = _mm_xor_si128(t7, t2); + t7 = _mm_xor_si128(t7, t6); + t7 = _mm_xor_si128(t7, t1); + r = _mm_xor_si128(t4, t7); + + return r; +} + +static void gfmul_only(__m128i a, __m128i b, __m128i* r0, __m128i* r1) +{ + __m128i t1, t2, t3, t4; + + /* 128 x 128 Carryless Multiply */ + t2 = _mm_shuffle_epi32(b, 78); + t3 = _mm_shuffle_epi32(a, 78); + t2 = _mm_xor_si128(t2, b); + t3 = _mm_xor_si128(t3, a); + t4 = _mm_clmulepi64_si128(b, a, 0x11); + t1 = _mm_clmulepi64_si128(b, a, 0x00); + t2 = _mm_clmulepi64_si128(t2, t3, 0x00); + t2 = _mm_xor_si128(t2, t1); + t2 = _mm_xor_si128(t2, t4); + t3 = _mm_slli_si128(t2, 8); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + t4 = _mm_xor_si128(t4, t2); + *r0 = _mm_xor_si128(t1, *r0); + *r1 = _mm_xor_si128(t4, *r1); +} + +static __m128i gfmul_shl1(__m128i a) +{ + __m128i t1 = a, t2; + t2 = _mm_srli_epi64(t1, 63); + t1 = _mm_slli_epi64(t1, 1); + t2 = _mm_slli_si128(t2, 8); + t1 = _mm_or_si128(t1, t2); + /* if (a[1] >> 63) t1 = _mm_xor_si128(t1, MOD2_128); */ + a = _mm_shuffle_epi32(a, 0xff); + a = _mm_srai_epi32(a, 31); + a = _mm_and_si128(a, MOD2_128); + t1 = _mm_xor_si128(t1, a); + return t1; +} + +static __m128i ghash_red(__m128i r0, __m128i r1) +{ + __m128i t2, t3; + __m128i t5, t6, t7; + + t5 = _mm_slli_epi32(r0, 31); + t6 = _mm_slli_epi32(r0, 30); + t7 = _mm_slli_epi32(r0, 25); + t5 = _mm_xor_si128(t5, t6); + t5 = _mm_xor_si128(t5, t7); + + t6 = _mm_srli_si128(t5, 4); + t5 = _mm_slli_si128(t5, 12); + r0 = _mm_xor_si128(r0, t5); + t7 = _mm_srli_epi32(r0, 1); + t3 = _mm_srli_epi32(r0, 2); + t2 = _mm_srli_epi32(r0, 7); + + t7 = _mm_xor_si128(t7, t3); + t7 = _mm_xor_si128(t7, t2); + t7 = _mm_xor_si128(t7, t6); + t7 = _mm_xor_si128(t7, r0); + return _mm_xor_si128(r1, t7); +} + +static __m128i gfmul_shifted(__m128i a, __m128i b) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only(a, b, &t0, &t1); + return ghash_red(t0, t1); +} + +#ifndef AES_GCM_AESNI_NO_UNROLL +static __m128i gfmul8(__m128i a1, __m128i a2, __m128i a3, __m128i a4, + __m128i a5, __m128i a6, __m128i a7, __m128i a8, + __m128i b1, __m128i b2, __m128i b3, __m128i b4, + __m128i b5, __m128i b6, __m128i b7, __m128i b8) +{ + __m128i t0 = _mm_setzero_si128(), t1 = _mm_setzero_si128(); + gfmul_only(a1, b8, &t0, &t1); + gfmul_only(a2, b7, &t0, &t1); + gfmul_only(a3, b6, &t0, &t1); + gfmul_only(a4, b5, &t0, &t1); + gfmul_only(a5, b4, &t0, &t1); + gfmul_only(a6, b3, &t0, &t1); + gfmul_only(a7, b2, &t0, &t1); + gfmul_only(a8, b1, &t0, &t1); + return ghash_red(t0, t1); +} +#endif + + +static void AES_GCM_encrypt(const unsigned char *in, + unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + unsigned char *tag, unsigned int nbytes, + unsigned int abytes, unsigned int ibytes, + const unsigned char* key, int nr) +{ + int i, j ,k; + __m128i ctr1; + __m128i H, Y, T; + __m128i X = _mm_setzero_si128(); + __m128i *KEY = (__m128i*)key, lastKey; + __m128i last_block = _mm_setzero_si128(); + __m128i tmp1, tmp2; +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + __m128i r0, r1; + __m128i XV; + __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; +#endif + + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T); + + for (i=0; i < (int)(abytes/16); i++) { + tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw(X, H); + } + if (abytes%16) { + last_block = 
_mm_setzero_si128(); + for (j=0; j < (int)(abytes%16); j++) + ((unsigned char*)&last_block)[j] = addt[i*16+j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X = _mm_xor_si128(X, tmp1); + X = gfmul_sw(X, H); + } + tmp1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); + ctr1 = _mm_add_epi32(tmp1, ONE); + H = gfmul_shl1(H); + +#ifndef AES_GCM_AESNI_NO_UNROLL + i = 0; + if (nbytes >= 16*8) { + HT[0] = H; + HT[1] = gfmul_shifted(H, H); + HT[2] = gfmul_shifted(H, HT[1]); + HT[3] = gfmul_shifted(HT[1], HT[1]); + HT[4] = gfmul_shifted(HT[1], HT[2]); + HT[5] = gfmul_shifted(HT[2], HT[2]); + HT[6] = gfmul_shifted(HT[2], HT[3]); + HT[7] = gfmul_shifted(HT[3], HT[3]); + + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp2 = _mm_add_epi32(ctr1, ONE); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); + tmp3 = _mm_add_epi32(ctr1, TWO); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); + tmp4 = _mm_add_epi32(ctr1, THREE); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); + tmp5 = _mm_add_epi32(ctr1, FOUR); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); + tmp6 = _mm_add_epi32(ctr1, FIVE); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); + tmp7 = _mm_add_epi32(ctr1, SIX); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); + tmp8 = _mm_add_epi32(ctr1, SEVEN); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, EIGHT); + tmp1 =_mm_xor_si128(tmp1, KEY[0]); + tmp2 =_mm_xor_si128(tmp2, KEY[0]); + tmp3 =_mm_xor_si128(tmp3, KEY[0]); + tmp4 =_mm_xor_si128(tmp4, KEY[0]); + tmp5 =_mm_xor_si128(tmp5, KEY[0]); + tmp6 =_mm_xor_si128(tmp6, KEY[0]); + tmp7 =_mm_xor_si128(tmp7, KEY[0]); + tmp8 =_mm_xor_si128(tmp8, KEY[0]); + AES_ENC_8(1); + AES_ENC_8(2); + AES_ENC_8(3); + AES_ENC_8(4); + AES_ENC_8(5); + AES_ENC_8(6); + AES_ENC_8(7); + AES_ENC_8(8); + AES_ENC_8(9); + lastKey = KEY[10]; + if (nr > 10) { + AES_ENC_8(10); + AES_ENC_8(11); + lastKey = KEY[12]; + if (nr > 12) { + AES_ENC_8(12); + AES_ENC_8(13); + lastKey = KEY[14]; + } + } + AES_ENC_LAST_8(); + + for (i=1; i < (int)(nbytes/16/8); i++) { + r0 = _mm_setzero_si128(); + r1 = _mm_setzero_si128(); + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp2 = _mm_add_epi32(ctr1, ONE); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_EPI64); + tmp3 = _mm_add_epi32(ctr1, TWO); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_EPI64); + tmp4 = _mm_add_epi32(ctr1, THREE); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_EPI64); + tmp5 = _mm_add_epi32(ctr1, FOUR); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_EPI64); + tmp6 = _mm_add_epi32(ctr1, FIVE); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_EPI64); + tmp7 = _mm_add_epi32(ctr1, SIX); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_EPI64); + tmp8 = _mm_add_epi32(ctr1, SEVEN); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, EIGHT); + tmp1 =_mm_xor_si128(tmp1, KEY[0]); + tmp2 =_mm_xor_si128(tmp2, KEY[0]); + tmp3 =_mm_xor_si128(tmp3, KEY[0]); + tmp4 =_mm_xor_si128(tmp4, KEY[0]); + tmp5 =_mm_xor_si128(tmp5, KEY[0]); + tmp6 =_mm_xor_si128(tmp6, KEY[0]); + tmp7 =_mm_xor_si128(tmp7, KEY[0]); + tmp8 =_mm_xor_si128(tmp8, KEY[0]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+0]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + gfmul_only(XV, HT[7], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[1]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[1]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[1]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[1]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[1]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[1]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[1]); + /* 128 x 
128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+1]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[6], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[2]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[2]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[2]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[2]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[2]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[2]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[2]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+2]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[5], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[3]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[3]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[3]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[3]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[3]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[3]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[3]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+3]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[4], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[4]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[4]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[4]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[4]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[4]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[4]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[4]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+4]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[3], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[5]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[5]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[5]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[5]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[5]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[5]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[5]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+5]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[2], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[6]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[6]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[6]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[6]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[6]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[6]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[6]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+6]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[1], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[7]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[7]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[7]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[7]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[7]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[7]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[7]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)out)[(i-1)*8+7]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + gfmul_only(XV, HT[0], &r0, &r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[8]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[8]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[8]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[8]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[8]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[8]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[8]); + /* Reduction */ + X = ghash_red(r0, r1); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + tmp2 = 
_mm_aesenc_si128(tmp2, KEY[9]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[9]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[9]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[9]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[9]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[9]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + lastKey = KEY[14]; + } + } + AES_ENC_LAST_8(); + } + + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); + tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); + tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); + tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK); + tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK); + tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK); + tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK); + tmp1 = _mm_xor_si128(X, tmp1); + X = gfmul8(tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, + HT[0], HT[1], HT[2], HT[3], HT[4], HT[5], HT[6], HT[7]); + } + for (k = i*8; k < (int)(nbytes/16); k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + } +#else /* AES_GCM_AESNI_NO_UNROLL */ + for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + 
tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + } + for (; k < (int)(nbytes/16); k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + X = gfmul_shifted(X, H); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + } + if (k > 0) { + X = gfmul_shifted(X, H); + } +#endif /* AES_GCM_AESNI_NO_UNROLL */ + + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < (int)(nbytes%16); j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + tmp1 = last_block; + tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); + X =_mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + } + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + _mm_storeu_si128((__m128i*)tag, T); +} + +#ifdef HAVE_AES_DECRYPT + +static void 
AES_GCM_decrypt(const unsigned char *in, + unsigned char *out, + const unsigned char* addt, + const unsigned char* ivec, + const unsigned char *tag, int nbytes, int abytes, + int ibytes, const unsigned char* key, int nr, + int* res) +{ + int i, j ,k; + __m128i H, Y, T; + __m128i *KEY = (__m128i*)key, lastKey; + __m128i ctr1; + __m128i last_block = _mm_setzero_si128(); + __m128i X = _mm_setzero_si128(); + __m128i tmp1, tmp2, XV; +#ifndef AES_GCM_AESNI_NO_UNROLL + __m128i HT[8]; + __m128i r0, r1; + __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; +#endif /* AES_GCM_AESNI_NO_UNROLL */ + + if (ibytes == 12) + aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T); + else + aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T); + + for (i=0; i 10) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[10]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[10]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[10]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[10]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[10]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[10]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[10]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[10]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[11]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[11]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[11]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[11]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[11]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[11]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, KEY[12]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[12]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[12]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[12]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[12]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[12]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[12]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[12]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + tmp2 = _mm_aesenc_si128(tmp2, KEY[13]); + tmp3 = _mm_aesenc_si128(tmp3, KEY[13]); + tmp4 = _mm_aesenc_si128(tmp4, KEY[13]); + tmp5 = _mm_aesenc_si128(tmp5, KEY[13]); + tmp6 = _mm_aesenc_si128(tmp6, KEY[13]); + tmp7 = _mm_aesenc_si128(tmp7, KEY[13]); + tmp8 = _mm_aesenc_si128(tmp8, KEY[13]); + lastKey = KEY[14]; + } + } + AES_ENC_LAST_8(); + } + } + +#endif /* AES_GCM_AESNI_NO_UNROLL */ + + for (k = i*8; k < nbytes/16; k++) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + ctr1 = _mm_add_epi32(ctr1, ONE); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + /* 128 x 128 Carryless Multiply */ + XV = _mm_loadu_si128(&((__m128i*)in)[k]); + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + tmp2 = _mm_loadu_si128(&((__m128i*)in)[k]); + tmp1 = _mm_xor_si128(tmp1, tmp2); + _mm_storeu_si128(&((__m128i*)out)[k], tmp1); + } + + /* If one partial block remains */ + if (nbytes % 16) { + tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); + tmp1 = _mm_xor_si128(tmp1, KEY[0]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[1]); 
+ tmp1 = _mm_aesenc_si128(tmp1, KEY[2]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[3]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[4]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[5]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[6]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[7]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[8]); + tmp1 = _mm_aesenc_si128(tmp1, KEY[9]); + lastKey = KEY[10]; + if (nr > 10) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[11]); + lastKey = KEY[12]; + if (nr > 12) { + tmp1 = _mm_aesenc_si128(tmp1, lastKey); + tmp1 = _mm_aesenc_si128(tmp1, KEY[13]); + lastKey = KEY[14]; + } + } + tmp1 = _mm_aesenclast_si128(tmp1, lastKey); + last_block = _mm_setzero_si128(); + for (j=0; j < nbytes%16; j++) + ((unsigned char*)&last_block)[j] = in[k*16+j]; + XV = last_block; + tmp1 = _mm_xor_si128(tmp1, last_block); + last_block = tmp1; + for (j=0; j < nbytes%16; j++) + out[k*16+j] = ((unsigned char*)&last_block)[j]; + XV = _mm_shuffle_epi8(XV, BSWAP_MASK); + XV = _mm_xor_si128(XV, X); + X = gfmul_shifted(XV, H); + } + + tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0); + tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1); + /* 128 x 128 Carryless Multiply */ + X = _mm_xor_si128(X, tmp1); + X = gfmul_shifted(X, H); + X = _mm_shuffle_epi8(X, BSWAP_MASK); + T = _mm_xor_si128(X, T); + + if (0xffff != + _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) + *res = 0; /* in case the authentication failed */ + else + *res = 1; /* when successful returns 1 */ +} + +#endif /* HAVE_AES_DECRYPT */ +#endif /* _MSC_VER */ #endif /* WOLFSSL_AESNI */
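
For reference, the repeated round-key sequences in the intrinsic code above (whiten with KEY[0], nine _mm_aesenc_si128 rounds, then the lastKey = KEY[10] / nr > 10 / nr > 12 branches ending in _mm_aesenclast_si128) are an unrolled form of the generic AES-NI block-encrypt pattern. The sketch below is illustrative only and is not part of the patch; aesni_encrypt_block and rks are hypothetical names, with rks assumed to hold the expanded round keys (nr is 10/12/14 for AES-128/192/256, matching the branches above).

#include <wmmintrin.h>  /* AES-NI intrinsics: _mm_aesenc_si128, _mm_aesenclast_si128 */

static __m128i aesni_encrypt_block(__m128i block, const __m128i* rks, int nr)
{
    int i;
    /* AddRoundKey with round key 0 (the initial whitening step). */
    block = _mm_xor_si128(block, rks[0]);
    /* nr - 1 full rounds; the patch unrolls these and branches on nr
     * instead of looping, to keep everything in registers. */
    for (i = 1; i < nr; i++)
        block = _mm_aesenc_si128(block, rks[i]);
    /* Final round (no MixColumns). */
    return _mm_aesenclast_si128(block, rks[nr]);
}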