1. Add C AES-NI intrinsic AES-GCM encrypt and decrypt.
2. Fix error string for wolfcrypt test of GMAC.
3. Add AES-GCM Decrypt to benchmark.
parent 5d2d249673
commit f8aeac608c
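As context for reviewing the diff, here is a minimal round-trip sketch of the wolfCrypt AES-GCM API that this commit accelerates when AES-NI is detected. It is not part of the commit, and the function name aesgcm_roundtrip_sketch is made up for illustration; the buffer sizes, 96-bit IV, 16-byte tag, and 13-byte AAD mirror the benchmark code added below, and error handling is reduced to returning the first failure.

#include <wolfssl/wolfcrypt/aes.h>

static int aesgcm_roundtrip_sketch(void)
{
    Aes  enc;
    byte key[16]        = {0};   /* 128-bit key (192/256-bit also supported) */
    byte iv[12]         = {0};   /* 96-bit IV takes the single-encryption fast path */
    byte additional[13] = {0};   /* AAD: authenticated but not encrypted */
    byte plain[64]      = {0};
    byte cipher[64];
    byte decrypted[64];
    byte tag[16];
    int  ret;

    ret = wc_AesGcmSetKey(&enc, key, sizeof(key));
    if (ret != 0)
        return ret;

    ret = wc_AesGcmEncrypt(&enc, cipher, plain, sizeof(plain), iv, sizeof(iv),
                           tag, sizeof(tag), additional, sizeof(additional));
    if (ret != 0)
        return ret;

    /* wc_AesGcmDecrypt reports a tag mismatch as AES_GCM_AUTH_E */
    return wc_AesGcmDecrypt(&enc, decrypted, cipher, sizeof(cipher),
                            iv, sizeof(iv), tag, sizeof(tag),
                            additional, sizeof(additional));
}

When the library is built with AES-NI support and the CPU reports it, these wc_AesGcm* calls dispatch to the new AES_GCM_encrypt/AES_GCM_decrypt intrinsic routines added in aes.c below; otherwise the existing C GCM path is used.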
@@ -452,6 +452,7 @@ then
AM_CFLAGS="$AM_CFLAGS -maes -msse4"
fi
fi
AS_IF([test "x$ENABLED_AESGCM" != "xno"],[AM_CCASFLAGS="$AM_CCASFLAGS -DHAVE_AESGCM"])
fi

if test "$ENABLED_INTELASM" = "yes"
@@ -2500,6 +2501,7 @@ CREATE_HEX_VERSION
AC_SUBST([AM_CPPFLAGS])
AC_SUBST([AM_CFLAGS])
AC_SUBST([AM_LDFLAGS])
AC_SUBST([AM_CCASFLAGS])
AC_SUBST([LIB_ADD])
AC_SUBST([LIB_STATIC_ADD])

@@ -2619,6 +2621,7 @@ echo " * C Flags: $CFLAGS"
echo " * C++ Compiler: $CXX"
echo " * C++ Flags: $CXXFLAGS"
echo " * CPP Flags: $CPPFLAGS"
echo " * CCAS Flags: $CCASFLAGS"
echo " * LIB Flags: $LIB"
echo " * Debug enabled: $ax_enable_debug"
echo " * Warnings as failure: $ac_cv_warnings_as_errors"

@@ -483,7 +483,28 @@ void bench_aesgcm(void)
    persec = persec / 1024;
#endif

    printf("AES-GCM %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
    printf("AES-GCM Encrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                              blockType, total, persec);
    SHOW_INTEL_CYCLES
    printf("\n");

    start = current_time(1);
    BEGIN_INTEL_CYCLES

    for(i = 0; i < numBlocks; i++)
        wc_AesGcmDecrypt(&enc, plain, cipher, sizeof(cipher), iv, 12,
                         tag, 16, additional, 13);

    END_INTEL_CYCLES
    total = current_time(0) - start;

    persec = 1 / total * numBlocks;
#ifdef BENCH_EMBEDDED
    /* since using kB, convert to MB/s */
    persec = persec / 1024;
#endif

    printf("AES-GCM Decrypt %d %s took %5.3f seconds, %8.3f MB/s", numBlocks,
                                              blockType, total, persec);
    SHOW_INTEL_CYCLES
    printf("\n");

@@ -2763,6 +2763,426 @@ int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
}


#ifdef WOLFSSL_AESNI

void gfmul(__m128i a, __m128i b, __m128i* out) XASM_LINK("gfmul");


/* See Intel® Carry-Less Multiplication Instruction
 * and its Usage for Computing the GCM Mode White Paper
 * by Shay Gueron, Intel Mobility Group, Israel Development Center;
 * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research */


/* Figure 9. AES-GCM – Encrypt With Single Block Ghash at a Time */

static void AES_GCM_encrypt(const unsigned char *in,
                            unsigned char *out,
                            const unsigned char* addt,
                            const unsigned char* ivec,
                            unsigned char *tag,
                            int nbytes, int abytes, int ibytes,
                            const unsigned char* key, int nr)
{
    int i, j ,k;
    __m128i tmp1, tmp2, tmp3, tmp4;
    __m128i H, Y, T;
    __m128i *KEY = (__m128i*)key;
    __m128i ctr1, ctr2, ctr3, ctr4;
    __m128i last_block = _mm_setzero_si128();
    __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
    __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
    __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
    __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m128i X = _mm_setzero_si128();

    if(ibytes == 96/8) {
        Y = _mm_loadu_si128((__m128i*)ivec);
        Y = _mm_insert_epi32(Y, 0x1000000, 3);
        /* (Compute E[ZERO, KS] and E[Y0, KS] together */
        tmp1 = _mm_xor_si128(X, KEY[0]);
        tmp2 = _mm_xor_si128(Y, KEY[0]);
        for(j=1; j < nr-1; j+=2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        T = _mm_aesenclast_si128(tmp2, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
    }
    else {
        tmp1 = _mm_xor_si128(X, KEY[0]);
        for(j=1; j <nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
        Y = _mm_setzero_si128();
        for(i=0; i < ibytes/16; i++) {
            tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        if(ibytes%16) {
            for(j=0; j < ibytes%16; j++)
                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
            tmp1 = last_block;
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, 0, 1);
        Y = _mm_xor_si128(Y, tmp1);
        gfmul(Y, H, &Y);
        Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
        tmp1 = _mm_xor_si128(Y, KEY[0]);
        for(j=1; j < nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        T = _mm_aesenclast_si128(tmp1, KEY[nr]);
    }

    for(i=0; i<abytes/16; i++){
        tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    if(abytes%16){
        last_block = _mm_setzero_si128();
        for(j=0; j<abytes%16; j++)
            ((unsigned char*)&last_block)[j] = addt[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }

    ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
    ctr1 = _mm_add_epi32(ctr1, ONE);
    ctr2 = _mm_add_epi32(ctr1, ONE);
    ctr3 = _mm_add_epi32(ctr2, ONE);
    ctr4 = _mm_add_epi32(ctr3, ONE);

    for(i=0; i < nbytes/16/4; i++){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
        tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
        tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, FOUR);
        ctr2 = _mm_add_epi32(ctr2, FOUR);
        ctr3 = _mm_add_epi32(ctr3, FOUR);
        ctr4 = _mm_add_epi32(ctr4, FOUR);
        tmp1 =_mm_xor_si128(tmp1, KEY[0]);
        tmp2 =_mm_xor_si128(tmp2, KEY[0]);
        tmp3 =_mm_xor_si128(tmp3, KEY[0]);
        tmp4 =_mm_xor_si128(tmp4, KEY[0]);
        for(j=1; j < nr-1; j+=2){
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]);
        tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]);
        tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
        tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
        tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0]));
        tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1]));
        tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2]));
        tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3]));
        _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1);
        _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2);
        _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3);
        _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
        tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
        tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
        X = _mm_xor_si128(X, tmp2);
        gfmul(X, H, &X);
        X = _mm_xor_si128(X, tmp3);
        gfmul(X, H, &X);
        X = _mm_xor_si128(X, tmp4);
        gfmul(X, H, &X);
    }
    for(k = i*4; k < nbytes/16; k++){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, ONE);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for(j=1; j<nr-1; j+=2){
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X =_mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    /* If one partial block remains */
    if(nbytes%16){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for(j=1; j<nr-1; j+=2){
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        last_block = tmp1;
        for(j=0; j < nbytes%16; j++)
            out[k*16+j]=((unsigned char*)&last_block)[j];
        for(; j<16; j++)
            ((unsigned char*)&last_block)[j]=0;
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X =_mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0);
    tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1);
    X = _mm_xor_si128(X, tmp1);
    gfmul(X, H, &X);
    X = _mm_shuffle_epi8(X, BSWAP_MASK);
    T = _mm_xor_si128(X, T);
    _mm_storeu_si128((__m128i*)tag, T);
}
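For reference while reading these two functions (this note is editorial, not part of the commit): the value stored to tag above is the standard GCM tag from NIST SP 800-38D and the Intel white paper. With H = E_K(0^128) and Y_0 the initial counter block derived from the IV, both computed at the top of the function, the ghash chain kept in X is

\[
X_0 = 0,\qquad X_i = (X_{i-1} \oplus B_i)\cdot H \ \ \text{in } GF(2^{128}) \bmod (x^{128}+x^7+x^2+x+1),\qquad T = X_n \oplus E_K(Y_0),
\]

where the blocks B_i run over the zero-padded AAD, the zero-padded ciphertext, and a final block holding the AAD and ciphertext bit lengths. The gfmul() calls perform the multiplication by H; the decrypt routine below recomputes the same chain over the received ciphertext and compares T against the supplied tag before doing any counter-mode decryption.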

/* Figure 10. AES-GCM – Decrypt With Single Block Ghash at a Time */

static int AES_GCM_decrypt(const unsigned char *in,
                           unsigned char *out,
                           const unsigned char* addt,
                           const unsigned char* ivec,
                           const unsigned char *tag, int nbytes, int abytes,
                           int ibytes, const unsigned char* key, int nr)
{
    int i, j ,k;
    __m128i tmp1, tmp2, tmp3, tmp4;
    __m128i H, Y, T;
    __m128i *KEY = (__m128i*)key;
    __m128i ctr1, ctr2, ctr3, ctr4;
    __m128i last_block = _mm_setzero_si128();
    __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
    __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
    __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
    __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m128i X = _mm_setzero_si128();

    if (ibytes == 96/8) {
        Y = _mm_loadu_si128((__m128i*)ivec);
        Y = _mm_insert_epi32(Y, 0x1000000, 3);
        /* (Compute E[ZERO, KS] and E[Y0, KS] together */
        tmp1 = _mm_xor_si128(X, KEY[0]);
        tmp2 = _mm_xor_si128(Y, KEY[0]);
        for (j = 1; j < nr - 1; j += 2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        T = _mm_aesenclast_si128(tmp2, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
    }
    else {
        tmp1 = _mm_xor_si128(X, KEY[0]);
        for (j = 1; j < nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
        Y = _mm_setzero_si128();

        for (i = 0; i < ibytes / 16; i++) {
            tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }

        if (ibytes % 16) {
            for(j = 0; j < ibytes % 16; j++)
                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
            tmp1 = last_block;
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }

        tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, 0, 1);
        Y = _mm_xor_si128(Y, tmp1);
        gfmul(Y, H, &Y);
        Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
        /* Compute E(K, Y0) */
        tmp1 = _mm_xor_si128(Y, KEY[0]);
        for(j=1; j < nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        T = _mm_aesenclast_si128(tmp1, KEY[nr]);
    }

    for (i = 0; i < abytes / 16; i++) {
        tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }

    if (abytes % 16) {
        last_block = _mm_setzero_si128();
        for (j = 0;j < abytes % 16; j++)
            ((unsigned char*)&last_block)[j] = addt[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X =_mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }

    for (i = 0; i < nbytes / 16; i++) {
        tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }

    if (nbytes % 16) {
        last_block = _mm_setzero_si128();
        for(j = 0; j < nbytes % 16; j++)
            ((unsigned char*)&last_block)[j] = in[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }

    tmp1 = _mm_insert_epi64(tmp1, nbytes * 8, 0);
    tmp1 = _mm_insert_epi64(tmp1, abytes * 8, 1);
    X = _mm_xor_si128(X, tmp1);
    gfmul(X, H, &X);
    X = _mm_shuffle_epi8(X, BSWAP_MASK);
    T = _mm_xor_si128(X, T);

    if (0xffff !=
        _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag))))
        return 0; /* in case the authentication failed */

    ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
    ctr1 = _mm_add_epi32(ctr1, ONE);
    ctr2 = _mm_add_epi32(ctr1, ONE);
    ctr3 = _mm_add_epi32(ctr2, ONE);
    ctr4 = _mm_add_epi32(ctr3, ONE);

    for (i=0; i < nbytes/16/4; i++) {
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
        tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
        tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);

        ctr1 = _mm_add_epi32(ctr1, FOUR);
        ctr2 = _mm_add_epi32(ctr2, FOUR);
        ctr3 = _mm_add_epi32(ctr3, FOUR);
        ctr4 = _mm_add_epi32(ctr4, FOUR);

        tmp1 =_mm_xor_si128(tmp1, KEY[0]);
        tmp2 =_mm_xor_si128(tmp2, KEY[0]);
        tmp3 =_mm_xor_si128(tmp3, KEY[0]);
        tmp4 =_mm_xor_si128(tmp4, KEY[0]);

        for (j = 1; j < nr - 1; j += 2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);

            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]);
        }

        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]);
        tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]);

        tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
        tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
        tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);

        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0]));
        tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1]));
        tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2]));
        tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3]));

        _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1);
        _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2);
        _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3);
        _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4);

        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
        tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
        tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
    }

    for (k = i*4; k < nbytes/16; k++) {
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, ONE);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for (j = 1; j < nr-1; j += 2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
    }

    /* If one partial block remains */
    if (nbytes % 16) {
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for (j = 1; j < nr-1; j += 2) {
            tmp1 =_mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 =_mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        last_block = tmp1;
        for (j = 0; j < nbytes % 16; j++)
            out[k*16+j]=((unsigned char*)&last_block)[j];
    }

    return 1; /* when successful returns 1 */
}

#endif /* WOLFSSL_AESNI */
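The assembly gfmul() that this code links against appears further down in the aes_asm diffs. For readers who prefer intrinsics, the following is a rough C equivalent of the same carry-less multiply, bit-reflection shift, and reduction (Algorithm 1 in the Intel white paper). It is only an illustrative sketch, not part of the commit: the name gfmul_c_sketch is made up, and it assumes the compiler is also given -mpclmul so that _mm_clmulepi64_si128 from <wmmintrin.h> is available.

#include <wmmintrin.h>   /* _mm_clmulepi64_si128 (PCLMULQDQ) */
#include <emmintrin.h>   /* SSE2 shifts, xor, or, store */

static void gfmul_c_sketch(__m128i a, __m128i b, __m128i* out)
{
    __m128i t3, t4, t5, t6, t7, t8, t9;

    /* 128x128 -> 256-bit carry-less multiply built from four 64x64 products */
    t3 = _mm_clmulepi64_si128(a, b, 0x00);           /* a0*b0 */
    t4 = _mm_clmulepi64_si128(a, b, 0x10);           /* a0*b1 */
    t5 = _mm_clmulepi64_si128(a, b, 0x01);           /* a1*b0 */
    t6 = _mm_clmulepi64_si128(a, b, 0x11);           /* a1*b1 */

    t4 = _mm_xor_si128(t4, t5);                      /* combine middle terms */
    t5 = _mm_slli_si128(t4, 8);
    t4 = _mm_srli_si128(t4, 8);
    t3 = _mm_xor_si128(t3, t5);                      /* low 128 bits */
    t6 = _mm_xor_si128(t6, t4);                      /* high 128 bits */

    /* shift the 256-bit product left by one bit (bit-reflected domain) */
    t7 = _mm_srli_epi32(t3, 31);
    t8 = _mm_srli_epi32(t6, 31);
    t3 = _mm_slli_epi32(t3, 1);
    t6 = _mm_slli_epi32(t6, 1);
    t9 = _mm_srli_si128(t7, 12);
    t8 = _mm_slli_si128(t8, 4);
    t7 = _mm_slli_si128(t7, 4);
    t3 = _mm_or_si128(t3, t7);
    t6 = _mm_or_si128(t6, t8);
    t6 = _mm_or_si128(t6, t9);

    /* first phase of the reduction mod x^128 + x^7 + x^2 + x + 1 */
    t7 = _mm_slli_epi32(t3, 31);
    t8 = _mm_slli_epi32(t3, 30);
    t9 = _mm_slli_epi32(t3, 25);
    t7 = _mm_xor_si128(t7, t8);
    t7 = _mm_xor_si128(t7, t9);
    t8 = _mm_srli_si128(t7, 4);
    t7 = _mm_slli_si128(t7, 12);
    t3 = _mm_xor_si128(t3, t7);

    /* second phase of the reduction */
    t4 = _mm_srli_epi32(t3, 1);
    t5 = _mm_srli_epi32(t3, 2);
    t9 = _mm_srli_epi32(t3, 7);
    t4 = _mm_xor_si128(t4, t5);
    t4 = _mm_xor_si128(t4, t9);
    t4 = _mm_xor_si128(t4, t8);
    t3 = _mm_xor_si128(t3, t4);
    t6 = _mm_xor_si128(t6, t3);                      /* reduced result */

    _mm_storeu_si128(out, t6);
}

The four immediates 0x00/0x10/0x01/0x11 select which 64-bit halves of a and b are multiplied, matching the four pclmulqdq immediates ($0, $16, $1, $17) in the assembly gfmul below.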

#if defined(GCM_SMALL)

static void GMULT(byte* X, byte* Y)
@@ -3281,6 +3701,14 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,

    WOLFSSL_ENTER("AesGcmEncrypt");

#ifdef WOLFSSL_AESNI
    if (haveAESNI) {
        AES_GCM_encrypt((void*)in, out, (void*)authIn, (void*)iv, authTag,
                        sz, authInSz, ivSz, (byte*)aes->key, aes->rounds);
        return 0;
    }
#endif

#ifdef WOLFSSL_PIC32MZ_CRYPT
    ctr = (char *)aes->iv_ce ;
#else
@@ -3339,6 +3767,15 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,

    WOLFSSL_ENTER("AesGcmDecrypt");

#ifdef WOLFSSL_AESNI
    if (haveAESNI) {
        if (AES_GCM_decrypt(in, out, authIn, iv, authTag,
                        sz, authInSz, ivSz, (byte*)aes->key, aes->rounds) == 0)
            return AES_GCM_AUTH_E;
        return 0;
    }
#endif

#ifdef WOLFSSL_PIC32MZ_CRYPT
    ctr = (char *)aes->iv_ce ;
#else

@@ -969,4 +969,83 @@ MAKE_RK256_b:
pxor xmm3,xmm2
ret


; See Intel® Carry-Less Multiplication Instruction
; and its Usage for Computing the GCM Mode White Paper
; by Shay Gueron, Intel Mobility Group, Israel Development Center;
; and Michael E. Kounavis, Intel Labs, Circuits and Systems Research

; void gfmul(__m128i a, __m128i b, __m128i* out);

; .globl gfmul
gfmul PROC
; xmm0 holds operand a (128 bits)
; xmm1 holds operand b (128 bits)
; rdi holds the pointer to output (128 bits)
movdqa %xmm0, %xmm3
pclmulqdq $0, %xmm1, %xmm3 ; xmm3 holds a0*b0
movdqa %xmm0, %xmm4
pclmulqdq $16, %xmm1, %xmm4 ; xmm4 holds a0*b1
movdqa %xmm0, %xmm5
pclmulqdq $1, %xmm1, %xmm5 ; xmm5 holds a1*b0
movdqa %xmm0, %xmm6
pclmulqdq $17, %xmm1, %xmm6 ; xmm6 holds a1*b1
pxor %xmm5, %xmm4 ; xmm4 holds a0*b1 + a1*b0
movdqa %xmm4, %xmm5
psrldq $8, %xmm4
pslldq $8, %xmm5
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6 ; <xmm6:xmm3> holds the result of
; the carry-less multiplication of
; xmm0 by xmm1

; shift the result by one bit position to the left to cope with the fact
; that bits are reversed
movdqa %xmm3, %xmm7
movdqa %xmm6, %xmm8
pslld $1, %xmm3
pslld $1, %xmm6
psrld $31, %xmm7
psrld $31, %xmm8
movdqa %xmm7, %xmm9
pslldq $4, %xmm8
pslldq $4, %xmm7
psrldq $12, %xmm9
por %xmm7, %xmm3
por %xmm8, %xmm6
por %xmm9, %xmm6

; first phase of the reduction
movdqa %xmm3, %xmm7
movdqa %xmm3, %xmm8
movdqa %xmm3, %xmm9
pslld $31, %xmm7 ; packed right shifting << 31
pslld $30, %xmm8 ; packed right shifting shift << 30
pslld $25, %xmm9 ; packed right shifting shift << 25
pxor %xmm8, %xmm7 ; xor the shifted versions
pxor %xmm9, %xmm7

movdqa %xmm7, %xmm8
pslldq $12, %xmm7
psrldq $4, %xmm8
pxor %xmm7, %xmm3 ; first phase of the reduction complete
movdqa %xmm3,%xmm2 ; second phase of the reduction
movdqa %xmm3,%xmm4
movdqa %xmm3,%xmm5
psrld $1, %xmm2 ; packed left shifting >> 1
psrld $2, %xmm4 ; packed left shifting >> 2
psrld $7, %xmm5 ; packed left shifting >> 7

pxor %xmm4, %xmm2 ; xor the shifted versions
pxor %xmm5, %xmm2
pxor %xmm8, %xmm2
pxor %xmm2, %xmm3
pxor %xmm3, %xmm6 ; the result is in xmm6
movdqu %xmm6, (%rdi) ; store the result

; restore xmm6 and xmm7

ret
gfmul ENDP

END

@@ -20,12 +20,12 @@
 */


/* This file is in at&t asm syntax, see .asm for intel syntax */

/* See Intel® Advanced Encryption Standard (AES) Instructions Set White Paper
 * by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
 */

/* This file is in at&t asm syntax, see .asm for intel syntax */


/*
AES_CBC_encrypt (const unsigned char *in,
@@ -814,3 +814,87 @@ pxor %xmm4, %xmm3
pxor %xmm2, %xmm3
ret


#ifdef HAVE_AESGCM

/* See Intel® Carry-Less Multiplication Instruction
 * and its Usage for Computing the GCM Mode White Paper
 * by Shay Gueron, Intel Mobility Group, Israel Development Center;
 * and Michael E. Kounavis, Intel Labs, Circuits and Systems Research
 *
 * This is for use with the C code.
 */

/* Figure 6. Code Sample - Performing Ghash Using Algorithms 1 and 5 */

/*
 * void gfmul(__m128i a, __m128i b, __m128i* out);
 */
.globl gfmul
gfmul:
#xmm0 holds operand a (128 bits)
#xmm1 holds operand b (128 bits)
#rdi holds the pointer to output (128 bits)
movdqa %xmm0, %xmm3
pclmulqdq $0, %xmm1, %xmm3 # xmm3 holds a0*b0
movdqa %xmm0, %xmm4
pclmulqdq $16, %xmm1, %xmm4 # xmm4 holds a0*b1
movdqa %xmm0, %xmm5
pclmulqdq $1, %xmm1, %xmm5 # xmm5 holds a1*b0
movdqa %xmm0, %xmm6
pclmulqdq $17, %xmm1, %xmm6 # xmm6 holds a1*b1
pxor %xmm5, %xmm4 # xmm4 holds a0*b1 + a1*b0
movdqa %xmm4, %xmm5
psrldq $8, %xmm4
pslldq $8, %xmm5
pxor %xmm5, %xmm3
pxor %xmm4, %xmm6 # <xmm6:xmm3> holds the result of
# the carry-less multiplication of
# xmm0 by xmm1

# shift the result by one bit position to the left to cope with the fact
# that bits are reversed
movdqa %xmm3, %xmm7
movdqa %xmm6, %xmm8
pslld $1, %xmm3
pslld $1, %xmm6
psrld $31, %xmm7
psrld $31, %xmm8
movdqa %xmm7, %xmm9
pslldq $4, %xmm8
pslldq $4, %xmm7
psrldq $12, %xmm9
por %xmm7, %xmm3
por %xmm8, %xmm6
por %xmm9, %xmm6

# first phase of the reduction
movdqa %xmm3, %xmm7
movdqa %xmm3, %xmm8
movdqa %xmm3, %xmm9
pslld $31, %xmm7 # packed right shifting << 31
pslld $30, %xmm8 # packed right shifting shift << 30
pslld $25, %xmm9 # packed right shifting shift << 25
pxor %xmm8, %xmm7 # xor the shifted versions
pxor %xmm9, %xmm7

movdqa %xmm7, %xmm8
pslldq $12, %xmm7
psrldq $4, %xmm8
pxor %xmm7, %xmm3 # first phase of the reduction complete
movdqa %xmm3,%xmm2 # second phase of the reduction
movdqa %xmm3,%xmm4
movdqa %xmm3,%xmm5
psrld $1, %xmm2 # packed left shifting >> 1
psrld $2, %xmm4 # packed left shifting >> 2
psrld $7, %xmm5 # packed left shifting >> 7

pxor %xmm4, %xmm2 # xor the shifted versions
pxor %xmm5, %xmm2
pxor %xmm8, %xmm2
pxor %xmm2, %xmm3
pxor %xmm3, %xmm6 # the result is in xmm6
movdqu %xmm6, (%rdi) # store the result
ret

#endif /* HAVE_AESGCM */

@@ -390,7 +390,7 @@ int wolfcrypt_test(void* args)

#ifdef HAVE_AESGCM
    if ( (ret = gmac_test()) != 0)
        return err_sys("GMAC test passed!\n", ret);
        return err_sys("GMAC test failed!\n", ret);
    else
        printf( "GMAC test passed!\n");
#endif

@@ -46,6 +46,8 @@
#ifdef WOLFSSL_AESNI

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

#if !defined (ALIGN16)
#if defined (__GNUC__)