From b9d9371aed7ed69f80f1ecf4c7f84cb898457926 Mon Sep 17 00:00:00 2001
From: toddouska
Date: Mon, 19 May 2014 13:55:42 -0700
Subject: [PATCH] add intel asm syntax for aesni, abstract cpuid and asm
 linkage better for msvc

---
 ctaocrypt/src/aes.c       |  41 +-
 ctaocrypt/src/aes_asm.asm | 856 ++++++++++++++++++++++++++++++++++++++
 ctaocrypt/src/aes_asm.s   |   2 +
 ctaocrypt/src/include.am  |   3 +-
 4 files changed, 882 insertions(+), 20 deletions(-)
 create mode 100755 ctaocrypt/src/aes_asm.asm

diff --git a/ctaocrypt/src/aes.c b/ctaocrypt/src/aes.c
index e0ec103d2..719017c7a 100644
--- a/ctaocrypt/src/aes.c
+++ b/ctaocrypt/src/aes.c
@@ -1551,31 +1551,34 @@ static const word32 Td[5][256] = {
 
 #ifdef CYASSL_AESNI
 
+/* Each platform needs to query info type 1 from cpuid to see if aesni is
+ * supported (the aesni flag is bit 25 of ecx). Also, set up a macro for
+ * proper linkage w/o ABI conflicts.
+ */
+
 #ifndef _MSC_VER
-    #define cpuid(func,ax,bx,cx,dx)\
+    #define cpuid(reg, func)\
         __asm__ __volatile__ ("cpuid":\
-            "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (func));
+            "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
+            "a" (func));
+
+    #define XASM_LINK(f) asm(f)
 #else
-    #define cpuid(func,ax,bx,cx,dx)\
-        __asm mov eax, func \
-        __asm cpuid \
-        __asm mov ax, eax \
-        __asm mov bx, ebx \
-        __asm mov cx, ecx \
-        __asm mov dx, edx
+
+    #include <intrin.h>
+    #define cpuid(a,b) __cpuid(a,b)
+
+    #define XASM_LINK(f)
 #endif /* _MSC_VER */
 
+
 static int Check_CPU_support_AES(void)
 {
-    unsigned int a,b,c,d;
-    cpuid(1,a,b,c,d);
+    unsigned int reg[4];  /* put a,b,c,d into 0,1,2,3 */
+    cpuid(reg, 1);        /* query info 1 */
 
-    if (c & 0x2000000)
+    if (reg[2] & 0x2000000)   /* aesni support is ecx bit 25 */
         return 1;
 
     return 0;
@@ -1590,34 +1593,34 @@ static int haveAESNI  = 0;
 void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
                      unsigned char* ivec, unsigned long length,
                      const unsigned char* KS, int nr)
-                     asm ("AES_CBC_encrypt");
+                     XASM_LINK("AES_CBC_encrypt");
 
 void AES_CBC_decrypt(const unsigned char* in, unsigned char* out,
                      unsigned char* ivec, unsigned long length,
                      const unsigned char* KS, int nr)
-                     asm ("AES_CBC_decrypt");
+                     XASM_LINK("AES_CBC_decrypt");
 
 void AES_ECB_encrypt(const unsigned char* in, unsigned char* out,
                      unsigned long length, const unsigned char* KS, int nr)
-                     asm ("AES_ECB_encrypt");
+                     XASM_LINK("AES_ECB_encrypt");
 
 void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
                      unsigned long length, const unsigned char* KS, int nr)
-                     asm ("AES_ECB_decrypt");
+                     XASM_LINK("AES_ECB_decrypt");
 
 void AES_128_Key_Expansion(const unsigned char* userkey,
                            unsigned char* key_schedule)
-                           asm ("AES_128_Key_Expansion");
+                           XASM_LINK("AES_128_Key_Expansion");
 
 void AES_192_Key_Expansion(const unsigned char* userkey,
                            unsigned char* key_schedule)
-                           asm ("AES_192_Key_Expansion");
+                           XASM_LINK("AES_192_Key_Expansion");
 
 void AES_256_Key_Expansion(const unsigned char* userkey,
                            unsigned char* key_schedule)
-                           asm ("AES_256_Key_Expansion");
+                           XASM_LINK("AES_256_Key_Expansion");
 
 
 static int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
diff --git a/ctaocrypt/src/aes_asm.asm b/ctaocrypt/src/aes_asm.asm
new file mode 100755
index 000000000..b1a43ffa1
--- /dev/null
+++ b/ctaocrypt/src/aes_asm.asm
@@ -0,0 +1,856 @@
+; /* aes_asm.asm
+; *
+; * Copyright (C) 2006-2014 wolfSSL Inc.
+; *
+; * This file is part of CyaSSL.
+; *
+; * CyaSSL is free software; you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation; either version 2 of the License, or
+; * (at your option) any later version.
+; *
+; * CyaSSL is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with this program; if not, write to the Free Software
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+; */
+;
+;
+; /* See Intel Advanced Encryption Standard (AES) Instructions Set White Paper
+; *  by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
+; */
+;
+; /* This file is in intel asm syntax, see .s for at&t syntax */
+;
+; /*
+; AES_CBC_encrypt (const unsigned char *in,
+;                  unsigned char *out,
+;                  unsigned char ivec[16],
+;                  unsigned long length,
+;                  const unsigned char *KS,
+;                  int nr)
+; */
+_text SEGMENT
+AES_CBC_encrypt PROC
+;# parameter 1: rdi
+;# parameter 2: rsi
+;# parameter 3: rdx
+;# parameter 4: rcx
+;# parameter 5: r8
+;# parameter 6: r9d
+
+; microsoft fastcall passes the arguments in rcx,rdx,r8,r9 plus the stack;
+; move them into the unix registers named in the parameter comments above
+    mov rdi,rcx
+    mov rsi,rdx
+    mov rdx,r8
+    mov rcx,r9
+    mov r8,[rsp+40]
+    mov r9d,[rsp+48]
+
+    mov r10,rcx
+    shr rcx,4
+    shl r10,60
+    je NO_PARTS
+    add rcx,1
+NO_PARTS:
+    sub rsi,16
+    movdqa xmm1,[rdx]
+LOOP_1:
+    pxor xmm1,[rdi]
+    pxor xmm1,[r8]
+    add rsi,16
+    add rdi,16
+    cmp r9d,12
+    aesenc xmm1,16[r8]
+    aesenc xmm1,32[r8]
+    aesenc xmm1,48[r8]
+    aesenc xmm1,64[r8]
+    aesenc xmm1,80[r8]
+    aesenc xmm1,96[r8]
+    aesenc xmm1,112[r8]
+    aesenc xmm1,128[r8]
+    aesenc xmm1,144[r8]
+    movdqa xmm2,160[r8]
+    jb LAST
+    cmp r9d,14
+
+    aesenc xmm1,160[r8]
+    aesenc xmm1,176[r8]
+    movdqa xmm2,192[r8]
+    jb LAST
+    aesenc xmm1,192[r8]
+    aesenc xmm1,208[r8]
+    movdqa xmm2,224[r8]
+LAST:
+    dec rcx
+    aesenclast xmm1,xmm2
+    movdqu [rsi],xmm1
+    jne LOOP_1
+    ret
+AES_CBC_encrypt ENDP
+
+
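+; /* CBC decryption, unlike CBC encryption above, has no chaining dependency
+;  * between blocks: each plaintext block needs only two ciphertext blocks.
+;  * The routine below exploits that by pipelining four aesdec streams per
+;  * iteration, then finishing any remainder one block at a time. */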
+; /*
+; AES_CBC_decrypt (const unsigned char *in,
+;                  unsigned char *out,
+;                  unsigned char ivec[16],
+;                  unsigned long length,
+;                  const unsigned char *KS,
+;                  int nr)
+; */
+; .globl AES_CBC_decrypt
+AES_CBC_decrypt PROC
+;# parameter 1: rdi
+;# parameter 2: rsi
+;# parameter 3: rdx
+;# parameter 4: rcx
+;# parameter 5: r8
+;# parameter 6: r9d
+
+    mov rdi,rcx
+    mov rsi,rdx
+    mov rdx,r8
+    mov rcx,r9
+    mov r8,[rsp+40]
+    mov r9d,[rsp+48]
+
+    mov r10,rcx
+    shr rcx,4
+    shl r10,60
+    je DNO_PARTS_4
+    add rcx,1
+DNO_PARTS_4:
+    mov r10,rcx
+    shl r10,62
+    shr r10,62
+    shr rcx,2
+    movdqu xmm5,[rdx]
+    je DREMAINDER_4
+    sub rsi,64
+DLOOP_4:
+    movdqu xmm1,[rdi]
+    movdqu xmm2,16[rdi]
+    movdqu xmm3,32[rdi]
+    movdqu xmm4,48[rdi]
+    movdqa xmm6,xmm1
+    movdqa xmm7,xmm2
+    movdqa xmm8,xmm3
+    movdqa xmm15,xmm4
+    movdqa xmm9,[r8]
+    movdqa xmm10,16[r8]
+    movdqa xmm11,32[r8]
+    movdqa xmm12,48[r8]
+    pxor xmm1,xmm9
+    pxor xmm2,xmm9
+    pxor xmm3,xmm9
+
+    pxor xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    aesdec xmm1,xmm11
+    aesdec xmm2,xmm11
+    aesdec xmm3,xmm11
+    aesdec xmm4,xmm11
+    aesdec xmm1,xmm12
+    aesdec xmm2,xmm12
+    aesdec xmm3,xmm12
+    aesdec xmm4,xmm12
+    movdqa xmm9,64[r8]
+    movdqa xmm10,80[r8]
+    movdqa xmm11,96[r8]
+    movdqa xmm12,112[r8]
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    aesdec xmm1,xmm11
+    aesdec xmm2,xmm11
+    aesdec xmm3,xmm11
+    aesdec xmm4,xmm11
+    aesdec xmm1,xmm12
+    aesdec xmm2,xmm12
+    aesdec xmm3,xmm12
+    aesdec xmm4,xmm12
+    movdqa xmm9,128[r8]
+    movdqa xmm10,144[r8]
+    movdqa xmm11,160[r8]
+    cmp r9d,12
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    jb DLAST_4
+    movdqa xmm9,160[r8]
+    movdqa xmm10,176[r8]
+    movdqa xmm11,192[r8]
+    cmp r9d,14
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    jb DLAST_4
+
+    movdqa xmm9,192[r8]
+    movdqa xmm10,208[r8]
+    movdqa xmm11,224[r8]
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+DLAST_4:
+    add rdi,64
+    add rsi,64
+    dec rcx
+    aesdeclast xmm1,xmm11
+    aesdeclast xmm2,xmm11
+    aesdeclast xmm3,xmm11
+    aesdeclast xmm4,xmm11
+    pxor xmm1,xmm5
+    pxor xmm2,xmm6
+    pxor xmm3,xmm7
+    pxor xmm4,xmm8
+    movdqu [rsi],xmm1
+    movdqu 16[rsi],xmm2
+    movdqu 32[rsi],xmm3
+    movdqu 48[rsi],xmm4
+    movdqa xmm5,xmm15
+    jne DLOOP_4
+    add rsi,64
+DREMAINDER_4:
+    cmp r10,0
+    je DEND_4
+DLOOP_4_2:
+    movdqu xmm1,[rdi]
+    movdqa xmm15,xmm1
+    add rdi,16
+    pxor xmm1,[r8]
+    movdqu xmm2,160[r8]
+    cmp r9d,12
+    aesdec xmm1,16[r8]
+    aesdec xmm1,32[r8]
+    aesdec xmm1,48[r8]
+    aesdec xmm1,64[r8]
+    aesdec xmm1,80[r8]
+    aesdec xmm1,96[r8]
+    aesdec xmm1,112[r8]
+    aesdec xmm1,128[r8]
+    aesdec xmm1,144[r8]
+    jb DLAST_4_2
+    movdqu xmm2,192[r8]
+    cmp r9d,14
+    aesdec xmm1,160[r8]
+    aesdec xmm1,176[r8]
+    jb DLAST_4_2
+    movdqu xmm2,224[r8]
+    aesdec xmm1,192[r8]
+    aesdec xmm1,208[r8]
+DLAST_4_2:
+    aesdeclast xmm1,xmm2
+    pxor xmm1,xmm5
+    movdqa xmm5,xmm15
+    movdqu [rsi],xmm1
+
+    add rsi,16
+    dec r10
+    jne DLOOP_4_2
+DEND_4:
+    ret
+AES_CBC_decrypt ENDP
+
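+; /* ECB mode has no IV and no chaining, so encrypt and decrypt share the
+;  * same four-blocks-per-iteration layout. Note the key schedule lands in
+;  * rcx here (not r8 as in the CBC routines) because ECB takes one fewer
+;  * argument, so every round key below is addressed relative to rcx. */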
+; /*
+; AES_ECB_encrypt (const unsigned char *in,
+;                  unsigned char *out,
+;                  unsigned long length,
+;                  const unsigned char *KS,
+;                  int nr)
+; */
+; .globl AES_ECB_encrypt
+AES_ECB_encrypt PROC
+;# parameter 1: rdi
+;# parameter 2: rsi
+;# parameter 3: rdx
+;# parameter 4: rcx
+;# parameter 5: r8d
+    mov rdi,rcx
+    mov rsi,rdx
+    mov rdx,r8
+    mov rcx,r9
+    mov r8d,[rsp+40]
+
+    mov r10,rdx
+    shr rdx,4
+    shl r10,60
+    je EECB_NO_PARTS_4
+    add rdx,1
+EECB_NO_PARTS_4:
+    mov r10,rdx
+    shl r10,62
+    shr r10,62
+    shr rdx,2
+    je EECB_REMAINDER_4
+    sub rsi,64
+EECB_LOOP_4:
+    movdqu xmm1,[rdi]
+    movdqu xmm2,16[rdi]
+    movdqu xmm3,32[rdi]
+    movdqu xmm4,48[rdi]
+    movdqa xmm9,[rcx]
+    movdqa xmm10,16[rcx]
+    movdqa xmm11,32[rcx]
+    movdqa xmm12,48[rcx]
+    pxor xmm1,xmm9
+    pxor xmm2,xmm9
+    pxor xmm3,xmm9
+    pxor xmm4,xmm9
+    aesenc xmm1,xmm10
+    aesenc xmm2,xmm10
+    aesenc xmm3,xmm10
+    aesenc xmm4,xmm10
+    aesenc xmm1,xmm11
+    aesenc xmm2,xmm11
+    aesenc xmm3,xmm11
+    aesenc xmm4,xmm11
+    aesenc xmm1,xmm12
+    aesenc xmm2,xmm12
+    aesenc xmm3,xmm12
+    aesenc xmm4,xmm12
+    movdqa xmm9,64[rcx]
+    movdqa xmm10,80[rcx]
+    movdqa xmm11,96[rcx]
+    movdqa xmm12,112[rcx]
+    aesenc xmm1,xmm9
+    aesenc xmm2,xmm9
+    aesenc xmm3,xmm9
+    aesenc xmm4,xmm9
+    aesenc xmm1,xmm10
+    aesenc xmm2,xmm10
+    aesenc xmm3,xmm10
+    aesenc xmm4,xmm10
+    aesenc xmm1,xmm11
+    aesenc xmm2,xmm11
+    aesenc xmm3,xmm11
+    aesenc xmm4,xmm11
+    aesenc xmm1,xmm12
+    aesenc xmm2,xmm12
+    aesenc xmm3,xmm12
+    aesenc xmm4,xmm12
+    movdqa xmm9,128[rcx]
+    movdqa xmm10,144[rcx]
+    movdqa xmm11,160[rcx]
+    cmp r8d,12
+    aesenc xmm1,xmm9
+    aesenc xmm2,xmm9
+    aesenc xmm3,xmm9
+    aesenc xmm4,xmm9
+    aesenc xmm1,xmm10
+    aesenc xmm2,xmm10
+    aesenc xmm3,xmm10
+    aesenc xmm4,xmm10
+    jb EECB_LAST_4
+    movdqa xmm9,160[rcx]
+    movdqa xmm10,176[rcx]
+    movdqa xmm11,192[rcx]
+    cmp r8d,14
+    aesenc xmm1,xmm9
+    aesenc xmm2,xmm9
+    aesenc xmm3,xmm9
+    aesenc xmm4,xmm9
+    aesenc xmm1,xmm10
+    aesenc xmm2,xmm10
+    aesenc xmm3,xmm10
+    aesenc xmm4,xmm10
+    jb EECB_LAST_4
+    movdqa xmm9,192[rcx]
+    movdqa xmm10,208[rcx]
+    movdqa xmm11,224[rcx]
+    aesenc xmm1,xmm9
+    aesenc xmm2,xmm9
+    aesenc xmm3,xmm9
+    aesenc xmm4,xmm9
+    aesenc xmm1,xmm10
+    aesenc xmm2,xmm10
+    aesenc xmm3,xmm10
+    aesenc xmm4,xmm10
+EECB_LAST_4:
+    add rdi,64
+    add rsi,64
+    dec rdx
+    aesenclast xmm1,xmm11
+    aesenclast xmm2,xmm11
+    aesenclast xmm3,xmm11
+    aesenclast xmm4,xmm11
+    movdqu [rsi],xmm1
+    movdqu 16[rsi],xmm2
+    movdqu 32[rsi],xmm3
+    movdqu 48[rsi],xmm4
+    jne EECB_LOOP_4
+    add rsi,64
+EECB_REMAINDER_4:
+    cmp r10,0
+    je EECB_END_4
+EECB_LOOP_4_2:
+    movdqu xmm1,[rdi]
+    add rdi,16
+    pxor xmm1,[rcx]
+    movdqu xmm2,160[rcx]
+    aesenc xmm1,16[rcx]
+    aesenc xmm1,32[rcx]
+    aesenc xmm1,48[rcx]
+    aesenc xmm1,64[rcx]
+    aesenc xmm1,80[rcx]
+    aesenc xmm1,96[rcx]
+    aesenc xmm1,112[rcx]
+    aesenc xmm1,128[rcx]
+    aesenc xmm1,144[rcx]
+    cmp r8d,12
+    jb EECB_LAST_4_2
+    movdqu xmm2,192[rcx]
+    aesenc xmm1,160[rcx]
+    aesenc xmm1,176[rcx]
+    cmp r8d,14
+    jb EECB_LAST_4_2
+    movdqu xmm2,224[rcx]
+    aesenc xmm1,192[rcx]
+    aesenc xmm1,208[rcx]
+EECB_LAST_4_2:
+    aesenclast xmm1,xmm2
+    movdqu [rsi],xmm1
+    add rsi,16
+    dec r10
+    jne EECB_LOOP_4_2
+EECB_END_4:
+    ret
+AES_ECB_encrypt ENDP
+
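+; /* ECB decrypt mirrors ECB encrypt above: identical block and round-key
+;  * offsets, with aesdec/aesdeclast applied against the decryption key
+;  * schedule instead of aesenc/aesenclast. */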
+; /*
+; AES_ECB_decrypt (const unsigned char *in,
+;                  unsigned char *out,
+;                  unsigned long length,
+;                  const unsigned char *KS,
+;                  int nr)
+; */
+; .globl AES_ECB_decrypt
+AES_ECB_decrypt PROC
+;# parameter 1: rdi
+;# parameter 2: rsi
+;# parameter 3: rdx
+;# parameter 4: rcx
+;# parameter 5: r8d
+    mov rdi,rcx
+    mov rsi,rdx
+    mov rdx,r8
+    mov rcx,r9
+    mov r8d,[rsp+40]
+
+    mov r10,rdx
+    shr rdx,4
+    shl r10,60
+    je DECB_NO_PARTS_4
+    add rdx,1
+DECB_NO_PARTS_4:
+    mov r10,rdx
+    shl r10,62
+    shr r10,62
+    shr rdx,2
+    je DECB_REMAINDER_4
+    sub rsi,64
+DECB_LOOP_4:
+    movdqu xmm1,[rdi]
+    movdqu xmm2,16[rdi]
+    movdqu xmm3,32[rdi]
+    movdqu xmm4,48[rdi]
+    movdqa xmm9,[rcx]
+    movdqa xmm10,16[rcx]
+    movdqa xmm11,32[rcx]
+    movdqa xmm12,48[rcx]
+    pxor xmm1,xmm9
+    pxor xmm2,xmm9
+    pxor xmm3,xmm9
+    pxor xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    aesdec xmm1,xmm11
+    aesdec xmm2,xmm11
+    aesdec xmm3,xmm11
+    aesdec xmm4,xmm11
+    aesdec xmm1,xmm12
+    aesdec xmm2,xmm12
+    aesdec xmm3,xmm12
+    aesdec xmm4,xmm12
+    movdqa xmm9,64[rcx]
+    movdqa xmm10,80[rcx]
+    movdqa xmm11,96[rcx]
+    movdqa xmm12,112[rcx]
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    aesdec xmm1,xmm11
+    aesdec xmm2,xmm11
+    aesdec xmm3,xmm11
+    aesdec xmm4,xmm11
+    aesdec xmm1,xmm12
+    aesdec xmm2,xmm12
+    aesdec xmm3,xmm12
+    aesdec xmm4,xmm12
+    movdqa xmm9,128[rcx]
+    movdqa xmm10,144[rcx]
+    movdqa xmm11,160[rcx]
+    cmp r8d,12
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    jb DECB_LAST_4
+    movdqa xmm9,160[rcx]
+    movdqa xmm10,176[rcx]
+    movdqa xmm11,192[rcx]
+    cmp r8d,14
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+    jb DECB_LAST_4
+    movdqa xmm9,192[rcx]
+    movdqa xmm10,208[rcx]
+    movdqa xmm11,224[rcx]
+    aesdec xmm1,xmm9
+    aesdec xmm2,xmm9
+    aesdec xmm3,xmm9
+    aesdec xmm4,xmm9
+    aesdec xmm1,xmm10
+    aesdec xmm2,xmm10
+    aesdec xmm3,xmm10
+    aesdec xmm4,xmm10
+DECB_LAST_4:
+    add rdi,64
+    add rsi,64
+    dec rdx
+    aesdeclast xmm1,xmm11
+    aesdeclast xmm2,xmm11
+    aesdeclast xmm3,xmm11
+    aesdeclast xmm4,xmm11
+    movdqu [rsi],xmm1
+    movdqu 16[rsi],xmm2
+    movdqu 32[rsi],xmm3
+    movdqu 48[rsi],xmm4
+    jne DECB_LOOP_4
+    add rsi,64
+DECB_REMAINDER_4:
+    cmp r10,0
+    je DECB_END_4
+DECB_LOOP_4_2:
+    movdqu xmm1,[rdi]
+    add rdi,16
+    pxor xmm1,[rcx]
+    movdqu xmm2,160[rcx]
+    cmp r8d,12
+    aesdec xmm1,16[rcx]
+    aesdec xmm1,32[rcx]
+    aesdec xmm1,48[rcx]
+    aesdec xmm1,64[rcx]
+    aesdec xmm1,80[rcx]
+    aesdec xmm1,96[rcx]
+    aesdec xmm1,112[rcx]
+    aesdec xmm1,128[rcx]
+    aesdec xmm1,144[rcx]
+    jb DECB_LAST_4_2
+    cmp r8d,14
+    movdqu xmm2,192[rcx]
+    aesdec xmm1,160[rcx]
+    aesdec xmm1,176[rcx]
+    jb DECB_LAST_4_2
+    movdqu xmm2,224[rcx]
+    aesdec xmm1,192[rcx]
+    aesdec xmm1,208[rcx]
+DECB_LAST_4_2:
+    aesdeclast xmm1,xmm2
+    movdqu [rsi],xmm1
+    add rsi,16
+    dec r10
+    jne DECB_LOOP_4_2
+DECB_END_4:
+    ret
+AES_ECB_decrypt ENDP
+
+
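+; /* The key expansion routines below follow the Intel white paper pattern:
+;  * aeskeygenassist produces the RotWord/SubWord/Rcon word, and the helper
+;  * after each call broadcasts it (pshufd) and folds it into the previous
+;  * round key with the pslldq/pxor chain that implements the key schedule. */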
+; /*
+; void AES_128_Key_Expansion (const unsigned char *userkey,
+;                             unsigned char *key_schedule)
+; */
+; .align 16,0x90
+; .globl AES_128_Key_Expansion
+AES_128_Key_Expansion PROC
+;# parameter 1: rdi
+;# parameter 2: rsi
+    mov rdi,rcx
+    mov rsi,rdx
+
+    mov dword ptr 240[rsi],10
+
+    movdqu xmm1,[rdi]
+    movdqa [rsi],xmm1
+
+
+ASSISTS:
+    aeskeygenassist xmm2,xmm1,1
+    call PREPARE_ROUNDKEY_128
+    movdqa 16[rsi],xmm1
+
+    aeskeygenassist xmm2,xmm1,2
+    call PREPARE_ROUNDKEY_128
+    movdqa 32[rsi],xmm1
+
+    aeskeygenassist xmm2,xmm1,4
+    call PREPARE_ROUNDKEY_128
+    movdqa 48[rsi],xmm1
+
+    aeskeygenassist xmm2,xmm1,8
+    call PREPARE_ROUNDKEY_128
+    movdqa 64[rsi],xmm1
+
+    aeskeygenassist xmm2,xmm1,16
+    call PREPARE_ROUNDKEY_128
+    movdqa 80[rsi],xmm1
+
+    aeskeygenassist xmm2,xmm1,32
+    call PREPARE_ROUNDKEY_128
+    movdqa 96[rsi],xmm1
+
+    aeskeygenassist xmm2,xmm1,64
+    call PREPARE_ROUNDKEY_128
+    movdqa 112[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,80h
+    call PREPARE_ROUNDKEY_128
+    movdqa 128[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,1bh
+    call PREPARE_ROUNDKEY_128
+    movdqa 144[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,36h
+    call PREPARE_ROUNDKEY_128
+    movdqa 160[rsi],xmm1
+    ret
+
+PREPARE_ROUNDKEY_128:
+    pshufd xmm2,xmm2,255
+    movdqa xmm3,xmm1
+    pslldq xmm3,4
+    pxor xmm1,xmm3
+    pslldq xmm3,4
+    pxor xmm1,xmm3
+    pslldq xmm3,4
+    pxor xmm1,xmm3
+    pxor xmm1,xmm2
+    ret
+AES_128_Key_Expansion ENDP
+
+; /*
+; void AES_192_Key_Expansion (const unsigned char *userkey,
+;                             unsigned char *key)
+; */
+; .globl AES_192_Key_Expansion
+AES_192_Key_Expansion PROC
+;# parameter 1: rdi
+;# parameter 2: rsi
+    mov rdi,rcx
+    mov rsi,rdx
+
+    movdqu xmm1,[rdi]
+    movdqu xmm3,16[rdi]
+    movdqa [rsi],xmm1
+    movdqa xmm5,xmm3
+
+    aeskeygenassist xmm2,xmm3,1h
+    call PREPARE_ROUNDKEY_192
+    shufpd xmm5,xmm1,0
+    movdqa 16[rsi],xmm5
+    movdqa xmm6,xmm1
+    shufpd xmm6,xmm3,1
+    movdqa 32[rsi],xmm6
+
+    aeskeygenassist xmm2,xmm3,2h
+    call PREPARE_ROUNDKEY_192
+    movdqa 48[rsi],xmm1
+    movdqa xmm5,xmm3
+
+    aeskeygenassist xmm2,xmm3,4h
+    call PREPARE_ROUNDKEY_192
+    shufpd xmm5,xmm1,0
+    movdqa 64[rsi],xmm5
+    movdqa xmm6,xmm1
+    shufpd xmm6,xmm3,1
+    movdqa 80[rsi],xmm6
+
+    aeskeygenassist xmm2,xmm3,8h
+    call PREPARE_ROUNDKEY_192
+    movdqa 96[rsi],xmm1
+    movdqa xmm5,xmm3
+
+    aeskeygenassist xmm2,xmm3,10h
+    call PREPARE_ROUNDKEY_192
+    shufpd xmm5,xmm1,0
+    movdqa 112[rsi],xmm5
+    movdqa xmm6,xmm1
+    shufpd xmm6,xmm3,1
+    movdqa 128[rsi],xmm6
+
+    aeskeygenassist xmm2,xmm3,20h
+    call PREPARE_ROUNDKEY_192
+    movdqa 144[rsi],xmm1
+    movdqa xmm5,xmm3
+
+    aeskeygenassist xmm2,xmm3,40h
+    call PREPARE_ROUNDKEY_192
+    shufpd xmm5,xmm1,0
+    movdqa 160[rsi],xmm5
+    movdqa xmm6,xmm1
+    shufpd xmm6,xmm3,1
+    movdqa 176[rsi],xmm6
+
+    aeskeygenassist xmm2,xmm3,80h
+    call PREPARE_ROUNDKEY_192
+    movdqa 192[rsi],xmm1
+    movdqa 208[rsi],xmm3
+    ret
+
+PREPARE_ROUNDKEY_192:
+    pshufd xmm2,xmm2,55h
+    movdqu xmm4,xmm1
+    pslldq xmm4,4
+    pxor xmm1,xmm4
+
+    pslldq xmm4,4
+    pxor xmm1,xmm4
+    pslldq xmm4,4
+    pxor xmm1,xmm4
+    pxor xmm1,xmm2
+    pshufd xmm2,xmm1,0ffh
+    movdqu xmm4,xmm3
+    pslldq xmm4,4
+    pxor xmm3,xmm4
+    pxor xmm3,xmm2
+    ret
+AES_192_Key_Expansion ENDP
+
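+; /* The 256-bit schedule alternates two helpers: MAKE_RK256_a consumes the
+;  * Rcon result of aeskeygenassist (pshufd 0ffh) for the even round keys,
+;  * while MAKE_RK256_b consumes the SubWord-only result produced with
+;  * Rcon 0 (pshufd 0aah) for the odd ones. */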
+; /*
+; void AES_256_Key_Expansion (const unsigned char *userkey,
+;                             unsigned char *key)
+; */
+; .globl AES_256_Key_Expansion
+AES_256_Key_Expansion PROC
+;# parameter 1: rdi
+;# parameter 2: rsi
+    mov rdi,rcx
+    mov rsi,rdx
+
+    movdqu xmm1,[rdi]
+    movdqu xmm3,16[rdi]
+    movdqa [rsi],xmm1
+    movdqa 16[rsi],xmm3
+
+    aeskeygenassist xmm2,xmm3,1h
+    call MAKE_RK256_a
+    movdqa 32[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,0h
+    call MAKE_RK256_b
+    movdqa 48[rsi],xmm3
+    aeskeygenassist xmm2,xmm3,2h
+    call MAKE_RK256_a
+    movdqa 64[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,0h
+    call MAKE_RK256_b
+    movdqa 80[rsi],xmm3
+    aeskeygenassist xmm2,xmm3,4h
+    call MAKE_RK256_a
+    movdqa 96[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,0h
+    call MAKE_RK256_b
+    movdqa 112[rsi],xmm3
+    aeskeygenassist xmm2,xmm3,8h
+    call MAKE_RK256_a
+    movdqa 128[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,0h
+    call MAKE_RK256_b
+    movdqa 144[rsi],xmm3
+    aeskeygenassist xmm2,xmm3,10h
+    call MAKE_RK256_a
+    movdqa 160[rsi],xmm1
+    aeskeygenassist xmm2,xmm1,0h
+    call MAKE_RK256_b
+    movdqa 176[rsi],xmm3
+    aeskeygenassist xmm2,xmm3,20h
+    call MAKE_RK256_a
+    movdqa 192[rsi],xmm1
+
+    aeskeygenassist xmm2,xmm1,0h
+    call MAKE_RK256_b
+    movdqa 208[rsi],xmm3
+    aeskeygenassist xmm2,xmm3,40h
+    call MAKE_RK256_a
+    movdqa 224[rsi],xmm1
+
+    ret
+AES_256_Key_Expansion ENDP
+
+MAKE_RK256_a:
+    pshufd xmm2,xmm2,0ffh
+    movdqa xmm4,xmm1
+    pslldq xmm4,4
+    pxor xmm1,xmm4
+    pslldq xmm4,4
+    pxor xmm1,xmm4
+    pslldq xmm4,4
+    pxor xmm1,xmm4
+    pxor xmm1,xmm2
+    ret
+
+MAKE_RK256_b:
+    pshufd xmm2,xmm2,0aah
+    movdqa xmm4,xmm3
+    pslldq xmm4,4
+    pxor xmm3,xmm4
+    pslldq xmm4,4
+    pxor xmm3,xmm4
+    pslldq xmm4,4
+    pxor xmm3,xmm4
+    pxor xmm3,xmm2
+    ret
+
+END
diff --git a/ctaocrypt/src/aes_asm.s b/ctaocrypt/src/aes_asm.s
index 382d9b313..b5f5bc9c1 100755
--- a/ctaocrypt/src/aes_asm.s
+++ b/ctaocrypt/src/aes_asm.s
@@ -24,6 +24,8 @@
  * by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
  */
 
+/* This file is in at&t asm syntax, see .asm for intel syntax */
+
 
 /*
 AES_CBC_encrypt (const unsigned char *in,
diff --git a/ctaocrypt/src/include.am b/ctaocrypt/src/include.am
index 580d3f553..6664dab22 100644
--- a/ctaocrypt/src/include.am
+++ b/ctaocrypt/src/include.am
@@ -2,7 +2,8 @@
 # All paths should be given relative to the root
 
 EXTRA_DIST += ctaocrypt/src/misc.c
-EXTRA_DIST += ctaocrypt/src/asm.c
+EXTRA_DIST += ctaocrypt/src/asm.c
+EXTRA_DIST += ctaocrypt/src/aes_asm.asm
 
 EXTRA_DIST += \
 ctaocrypt/src/ecc_fp.c \