diff --git a/configure.ac b/configure.ac index 1958e62ca..ee2ebb9fc 100644 --- a/configure.ac +++ b/configure.ac @@ -4664,6 +4664,10 @@ AC_ARG_ENABLE([xts], AS_IF([test "x$ENABLED_XTS" = "xyes"], [AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_AES_XTS -DWOLFSSL_AES_DIRECT"]) +AS_IF([test "x$ENABLED_XTS" = "xyes" && test "x$ENABLED_INTELASM" = "xyes"], + [AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_AES_XTS"]) +AS_IF([test "x$ENABLED_XTS" = "xyes" && test "x$ENABLED_AESNI" = "xyes"], + [AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_AES_XTS"]) # Web Server Build AC_ARG_ENABLE([webserver], @@ -8922,6 +8926,7 @@ AM_CONDITIONAL([BUILD_SNIFFER], [ test "x$ENABLED_SNIFFER" = "xyes" || test " AM_CONDITIONAL([BUILD_SNIFFTEST],[ test "x$ENABLED_SNIFFTEST" = "xyes"]) AM_CONDITIONAL([BUILD_AESGCM],[test "x$ENABLED_AESGCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"]) AM_CONDITIONAL([BUILD_AESCCM],[test "x$ENABLED_AESCCM" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"]) +AM_CONDITIONAL([BUILD_XTS],[test "x$ENABLED_XTS" = "xyes" || test "x$ENABLED_USERSETTINGS" = "xyes"]) AM_CONDITIONAL([BUILD_ARMASM],[test "x$ENABLED_ARMASM" = "xyes"]) AM_CONDITIONAL([BUILD_ARMASM_INLINE],[test "x$ENABLED_ARMASM_INLINE" = "xyes"]) AM_CONDITIONAL([BUILD_ARMASM_CRYPTO],[test "x$ENABLED_ARMASM_CRYPTO" = "xyes"]) diff --git a/linuxkm/Kbuild b/linuxkm/Kbuild index 3133ea8fc..093a7a112 100644 --- a/linuxkm/Kbuild +++ b/linuxkm/Kbuild @@ -103,6 +103,8 @@ $(obj)/wolfcrypt/src/aes_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DIS $(obj)/wolfcrypt/src/aes_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/aes_gcm_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) $(obj)/wolfcrypt/src/aes_gcm_asm.o: OBJECT_FILES_NON_STANDARD := y +$(obj)/wolfcrypt/src/aes_xts_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) +$(obj)/wolfcrypt/src/aes_xts_asm.o: OBJECT_FILES_NON_STANDARD := y $(obj)/wolfcrypt/src/sp_x86_64_asm.o: asflags-y = $(WOLFSSL_ASFLAGS) $(ASFLAGS_FPU_DISABLE_SIMD_ENABLE) $(obj)/wolfcrypt/src/sp_x86_64_asm.o: OBJECT_FILES_NON_STANDARD := y diff --git a/src/include.am b/src/include.am index c222d5c67..c42b8495e 100644 --- a/src/include.am +++ b/src/include.am @@ -80,6 +80,7 @@ if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S endif endif @@ -187,6 +188,7 @@ if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S endif endif @@ -623,6 +625,7 @@ if BUILD_X86_ASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/aes_xts_asm.S endif endif endif diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index 1f46057fd..3836ca265 100644 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -11006,6 +11006,41 @@ int wc_AesXtsDecryptSector(XtsAes* aes, byte* out, const byte* in, word32 sz, return wc_AesXtsDecrypt(aes, out, in, sz, (const byte*)i, AES_BLOCK_SIZE); } +#ifdef WOLFSSL_AESNI + +#if defined(USE_INTEL_SPEEDUP) + #define HAVE_INTEL_AVX1 + #define HAVE_INTEL_AVX2 +#endif /* USE_INTEL_SPEEDUP */ + +void AES_XTS_encrypt(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned 
char* i, const unsigned char* key, + const unsigned char* key2, int nr) + XASM_LINK("AES_XTS_encrypt"); +#ifdef HAVE_INTEL_AVX1 +void AES_XTS_encrypt_avx1(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_encrypt_avx1"); +#endif /* HAVE_INTEL_AVX1 */ + +#ifdef HAVE_AES_DECRYPT +void AES_XTS_decrypt(const unsigned char *in, unsigned char *out, word32 sz, + const unsigned char* i, const unsigned char* key, + const unsigned char* key2, int nr) + XASM_LINK("AES_XTS_decrypt"); +#ifdef HAVE_INTEL_AVX1 +void AES_XTS_decrypt_avx1(const unsigned char *in, unsigned char *out, + word32 sz, const unsigned char* i, + const unsigned char* key, const unsigned char* key2, + int nr) + XASM_LINK("AES_XTS_decrypt_avx1"); +#endif /* HAVE_INTEL_AVX1 */ +#endif /* HAVE_AES_DECRYPT */ + +#endif /* WOLFSSL_AESNI */ + #ifdef HAVE_AES_ECB /* helper function for encrypting / decrypting full buffer at once */ static WARN_UNUSED_RESULT int _AesXtsHelper( @@ -11047,6 +11082,118 @@ static WARN_UNUSED_RESULT int _AesXtsHelper( #endif /* HAVE_AES_ECB */ +/* AES with XTS mode. (XTS) XEX encryption with Tweak and cipher text Stealing. + * + * xaes AES keys to use for block encrypt/decrypt + * out output buffer to hold cipher text + * in input plain text buffer to encrypt + * sz size of both out and in buffers + * i value to use for tweak + * + * returns 0 on success + */ +/* Software AES - XTS Encrypt */ +static int AesXtsEncrypt_sw(XtsAes* xaes, byte* out, const byte* in, word32 sz, + const byte* i) +{ + int ret = 0; + word32 blocks = (sz / AES_BLOCK_SIZE); + Aes *aes = &xaes->aes; + Aes *tweak = &xaes->tweak; + byte tmp[AES_BLOCK_SIZE]; + + XMEMSET(tmp, 0, AES_BLOCK_SIZE); /* set to 0's in case of improper AES + * key setup passed to encrypt direct*/ + + SAVE_VECTOR_REGISTERS(return _svr_ret;); + + ret = wc_AesEncryptDirect(tweak, tmp, i); + + if (ret != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + +#ifdef HAVE_AES_ECB + /* encrypt all of buffer at once when possible */ + if (in != out) { /* can not handle inline */ + XMEMCPY(out, tmp, AES_BLOCK_SIZE); + if ((ret = _AesXtsHelper(aes, out, in, sz, AES_ENCRYPTION)) != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + } +#endif + + while (blocks > 0) { + word32 j; + byte carry = 0; + +#ifdef HAVE_AES_ECB + if (in == out) +#endif + { /* check for if inline */ + byte buf[AES_BLOCK_SIZE]; + + XMEMCPY(buf, in, AES_BLOCK_SIZE); + xorbuf(buf, tmp, AES_BLOCK_SIZE); + ret = wc_AesEncryptDirect(aes, out, buf); + if (ret != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + } + xorbuf(out, tmp, AES_BLOCK_SIZE); + + /* multiply by shift left and propagate carry */ + for (j = 0; j < AES_BLOCK_SIZE; j++) { + byte tmpC; + + tmpC = (tmp[j] >> 7) & 0x01; + tmp[j] = (byte)((tmp[j] << 1) + carry); + carry = tmpC; + } + if (carry) { + tmp[0] ^= GF_XTS; + } + + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + sz -= AES_BLOCK_SIZE; + blocks--; + } + + /* stealing operation of XTS to handle left overs */ + if (sz > 0) { + byte buf[AES_BLOCK_SIZE]; + + XMEMCPY(buf, out - AES_BLOCK_SIZE, AES_BLOCK_SIZE); + if (sz >= AES_BLOCK_SIZE) { /* extra sanity check before copy */ + RESTORE_VECTOR_REGISTERS(); + return BUFFER_E; + } + if (in != out) { + XMEMCPY(out, buf, sz); + XMEMCPY(buf, in, sz); + } + else { + byte buf2[AES_BLOCK_SIZE]; + + XMEMCPY(buf2, buf, sz); + XMEMCPY(buf, in, sz); + XMEMCPY(out, buf2, sz); + } + + xorbuf(buf, tmp, AES_BLOCK_SIZE); + ret = 
wc_AesEncryptDirect(aes, out - AES_BLOCK_SIZE, buf); + if (ret == 0) + xorbuf(out - AES_BLOCK_SIZE, tmp, AES_BLOCK_SIZE); + } + RESTORE_VECTOR_REGISTERS(); + + return ret; +} + /* AES with XTS mode. (XTS) XEX encryption with Tweak and cipher text Stealing. * * xaes AES keys to use for block encrypt/decrypt @@ -11059,126 +11206,188 @@ static WARN_UNUSED_RESULT int _AesXtsHelper( * * returns 0 on success */ -/* Software AES - XTS Encrypt */ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, const byte* i, word32 iSz) { - int ret = 0; - word32 blocks = (sz / AES_BLOCK_SIZE); - Aes *aes, *tweak; - if (xaes == NULL || out == NULL || in == NULL) { return BAD_FUNC_ARG; } - aes = &xaes->aes; - tweak = &xaes->tweak; - if (iSz < AES_BLOCK_SIZE) { return BAD_FUNC_ARG; } - if (blocks > 0) { - byte tmp[AES_BLOCK_SIZE]; - - XMEMSET(tmp, 0, AES_BLOCK_SIZE); /* set to 0's in case of improper AES - * key setup passed to encrypt direct*/ - - SAVE_VECTOR_REGISTERS(return _svr_ret;); - - ret = wc_AesEncryptDirect(tweak, tmp, i); - - if (ret != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - - #ifdef HAVE_AES_ECB - /* encrypt all of buffer at once when possible */ - if (in != out) { /* can not handle inline */ - XMEMCPY(out, tmp, AES_BLOCK_SIZE); - if ((ret = _AesXtsHelper(aes, out, in, sz, AES_ENCRYPTION)) != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - } - #endif - - while (blocks > 0) { - word32 j; - byte carry = 0; - - #ifdef HAVE_AES_ECB - if (in == out) - #endif - { /* check for if inline */ - byte buf[AES_BLOCK_SIZE]; - - XMEMCPY(buf, in, AES_BLOCK_SIZE); - xorbuf(buf, tmp, AES_BLOCK_SIZE); - ret = wc_AesEncryptDirect(aes, out, buf); - if (ret != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - } - xorbuf(out, tmp, AES_BLOCK_SIZE); - - /* multiply by shift left and propagate carry */ - for (j = 0; j < AES_BLOCK_SIZE; j++) { - byte tmpC; - - tmpC = (tmp[j] >> 7) & 0x01; - tmp[j] = (byte)((tmp[j] << 1) + carry); - carry = tmpC; - } - if (carry) { - tmp[0] ^= GF_XTS; - } - - in += AES_BLOCK_SIZE; - out += AES_BLOCK_SIZE; - sz -= AES_BLOCK_SIZE; - blocks--; - } - - /* stealing operation of XTS to handle left overs */ - if (sz > 0) { - byte buf[AES_BLOCK_SIZE]; - - XMEMCPY(buf, out - AES_BLOCK_SIZE, AES_BLOCK_SIZE); - if (sz >= AES_BLOCK_SIZE) { /* extra sanity check before copy */ - RESTORE_VECTOR_REGISTERS(); - return BUFFER_E; - } - if (in != out) { - XMEMCPY(out, buf, sz); - XMEMCPY(buf, in, sz); - } - else { - byte buf2[AES_BLOCK_SIZE]; - - XMEMCPY(buf2, buf, sz); - XMEMCPY(buf, in, sz); - XMEMCPY(out, buf2, sz); - } - - xorbuf(buf, tmp, AES_BLOCK_SIZE); - ret = wc_AesEncryptDirect(aes, out - AES_BLOCK_SIZE, buf); - if (ret == 0) - xorbuf(out - AES_BLOCK_SIZE, tmp, AES_BLOCK_SIZE); - } - RESTORE_VECTOR_REGISTERS(); - } - else { + if (sz < AES_BLOCK_SIZE) { WOLFSSL_MSG("Plain text input too small for encryption"); return BAD_FUNC_ARG; } - return ret; +#ifdef WOLFSSL_AESNI + #if defined(HAVE_INTEL_AVX1) + if (IS_INTEL_AVX1(intel_flags)) { + SAVE_VECTOR_REGISTERS(return _svr_ret;); + AES_XTS_encrypt_avx1(in, out, sz, i, (const byte*)xaes->aes.key, + (const byte*)xaes->tweak.key, (int)xaes->aes.rounds); + RESTORE_VECTOR_REGISTERS(); + return 0; + } + else + #endif + if (haveAESNI) { + AES_XTS_encrypt(in, out, sz, i, (const byte*)xaes->aes.key, + (const byte*)xaes->tweak.key, (int)xaes->aes.rounds); + return 0; + } + else +#endif + { + return AesXtsEncrypt_sw(xaes, out, in, sz, i); + } } +/* Same process as encryption but Aes key is AES_DECRYPTION type. 
+ * + * xaes AES keys to use for block encrypt/decrypt + * out output buffer to hold plain text + * in input cipher text buffer to decrypt + * sz size of both out and in buffers + * i value to use for tweak + * + * returns 0 on success + */ +/* Software AES - XTS Decrypt */ +static int AesXtsDecrypt_sw(XtsAes* xaes, byte* out, const byte* in, word32 sz, + const byte* i) +{ + int ret = 0; + word32 blocks = (sz / AES_BLOCK_SIZE); + Aes *aes = &xaes->aes; + Aes *tweak = &xaes->tweak; + word32 j; + byte carry = 0; + byte tmp[AES_BLOCK_SIZE]; + byte stl = (sz % AES_BLOCK_SIZE); + + XMEMSET(tmp, 0, AES_BLOCK_SIZE); /* set to 0's in case of improper AES + * key setup passed to decrypt direct*/ + + SAVE_VECTOR_REGISTERS(return _svr_ret;); + + ret = wc_AesEncryptDirect(tweak, tmp, i); + if (ret != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + + /* if Stealing then break out of loop one block early to handle special + * case */ + if (stl > 0) { + blocks--; + } + +#ifdef HAVE_AES_ECB + /* decrypt all of buffer at once when possible */ + if (in != out) { /* can not handle inline */ + XMEMCPY(out, tmp, AES_BLOCK_SIZE); + if ((ret = _AesXtsHelper(aes, out, in, sz, AES_DECRYPTION)) != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + } +#endif + + while (blocks > 0) { +#ifdef HAVE_AES_ECB + if (in == out) +#endif + { /* check for if inline */ + byte buf[AES_BLOCK_SIZE]; + + XMEMCPY(buf, in, AES_BLOCK_SIZE); + xorbuf(buf, tmp, AES_BLOCK_SIZE); + ret = wc_AesDecryptDirect(aes, out, buf); + if (ret != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + } + xorbuf(out, tmp, AES_BLOCK_SIZE); + + /* multiply by shift left and propagate carry */ + for (j = 0; j < AES_BLOCK_SIZE; j++) { + byte tmpC; + + tmpC = (tmp[j] >> 7) & 0x01; + tmp[j] = (byte)((tmp[j] << 1) + carry); + carry = tmpC; + } + if (carry) { + tmp[0] ^= GF_XTS; + } + carry = 0; + + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + sz -= AES_BLOCK_SIZE; + blocks--; + } + + /* stealing operation of XTS to handle left overs */ + if (sz >= AES_BLOCK_SIZE) { + byte buf[AES_BLOCK_SIZE]; + byte tmp2[AES_BLOCK_SIZE]; + + /* multiply by shift left and propagate carry */ + for (j = 0; j < AES_BLOCK_SIZE; j++) { + byte tmpC; + + tmpC = (tmp[j] >> 7) & 0x01; + tmp2[j] = (byte)((tmp[j] << 1) + carry); + carry = tmpC; + } + if (carry) { + tmp2[0] ^= GF_XTS; + } + + XMEMCPY(buf, in, AES_BLOCK_SIZE); + xorbuf(buf, tmp2, AES_BLOCK_SIZE); + ret = wc_AesDecryptDirect(aes, out, buf); + if (ret != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + xorbuf(out, tmp2, AES_BLOCK_SIZE); + + /* tmp2 holds partial | last */ + XMEMCPY(tmp2, out, AES_BLOCK_SIZE); + in += AES_BLOCK_SIZE; + out += AES_BLOCK_SIZE; + sz -= AES_BLOCK_SIZE; + + /* Make buffer with end of cipher text | last */ + XMEMCPY(buf, tmp2, AES_BLOCK_SIZE); + if (sz >= AES_BLOCK_SIZE) { /* extra sanity check before copy */ + RESTORE_VECTOR_REGISTERS(); + return BUFFER_E; + } + XMEMCPY(buf, in, sz); + XMEMCPY(out, tmp2, sz); + + xorbuf(buf, tmp, AES_BLOCK_SIZE); + ret = wc_AesDecryptDirect(aes, tmp2, buf); + if (ret != 0) { + RESTORE_VECTOR_REGISTERS(); + return ret; + } + xorbuf(tmp2, tmp, AES_BLOCK_SIZE); + XMEMCPY(out - AES_BLOCK_SIZE, tmp2, AES_BLOCK_SIZE); + } + RESTORE_VECTOR_REGISTERS(); + + return ret; +} + /* Same process as encryption but Aes key is AES_DECRYPTION type. 
* * xaes AES keys to use for block encrypt/decrypt @@ -11191,155 +11400,44 @@ int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, * * returns 0 on success */ -/* Software AES - XTS Decrypt */ int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz, const byte* i, word32 iSz) { - int ret = 0; - word32 blocks = (sz / AES_BLOCK_SIZE); - Aes *aes, *tweak; - if (xaes == NULL || out == NULL || in == NULL) { return BAD_FUNC_ARG; } - aes = &xaes->aes; - tweak = &xaes->tweak; - if (iSz < AES_BLOCK_SIZE) { return BAD_FUNC_ARG; } - if (blocks > 0) { - word32 j; - byte carry = 0; - byte tmp[AES_BLOCK_SIZE]; - byte stl = (sz % AES_BLOCK_SIZE); - - XMEMSET(tmp, 0, AES_BLOCK_SIZE); /* set to 0's in case of improper AES - * key setup passed to decrypt direct*/ - - SAVE_VECTOR_REGISTERS(return _svr_ret;); - - ret = wc_AesEncryptDirect(tweak, tmp, i); - if (ret != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - - /* if Stealing then break out of loop one block early to handle special - * case */ - if (stl > 0) { - blocks--; - } - - #ifdef HAVE_AES_ECB - /* decrypt all of buffer at once when possible */ - if (in != out) { /* can not handle inline */ - XMEMCPY(out, tmp, AES_BLOCK_SIZE); - if ((ret = _AesXtsHelper(aes, out, in, sz, AES_DECRYPTION)) != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - } - #endif - - while (blocks > 0) { - #ifdef HAVE_AES_ECB - if (in == out) - #endif - { /* check for if inline */ - byte buf[AES_BLOCK_SIZE]; - - XMEMCPY(buf, in, AES_BLOCK_SIZE); - xorbuf(buf, tmp, AES_BLOCK_SIZE); - ret = wc_AesDecryptDirect(aes, out, buf); - if (ret != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - } - xorbuf(out, tmp, AES_BLOCK_SIZE); - - /* multiply by shift left and propagate carry */ - for (j = 0; j < AES_BLOCK_SIZE; j++) { - byte tmpC; - - tmpC = (tmp[j] >> 7) & 0x01; - tmp[j] = (byte)((tmp[j] << 1) + carry); - carry = tmpC; - } - if (carry) { - tmp[0] ^= GF_XTS; - } - carry = 0; - - in += AES_BLOCK_SIZE; - out += AES_BLOCK_SIZE; - sz -= AES_BLOCK_SIZE; - blocks--; - } - - /* stealing operation of XTS to handle left overs */ - if (sz >= AES_BLOCK_SIZE) { - byte buf[AES_BLOCK_SIZE]; - byte tmp2[AES_BLOCK_SIZE]; - - /* multiply by shift left and propagate carry */ - for (j = 0; j < AES_BLOCK_SIZE; j++) { - byte tmpC; - - tmpC = (tmp[j] >> 7) & 0x01; - tmp2[j] = (byte)((tmp[j] << 1) + carry); - carry = tmpC; - } - if (carry) { - tmp2[0] ^= GF_XTS; - } - - XMEMCPY(buf, in, AES_BLOCK_SIZE); - xorbuf(buf, tmp2, AES_BLOCK_SIZE); - ret = wc_AesDecryptDirect(aes, out, buf); - if (ret != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - xorbuf(out, tmp2, AES_BLOCK_SIZE); - - /* tmp2 holds partial | last */ - XMEMCPY(tmp2, out, AES_BLOCK_SIZE); - in += AES_BLOCK_SIZE; - out += AES_BLOCK_SIZE; - sz -= AES_BLOCK_SIZE; - - /* Make buffer with end of cipher text | last */ - XMEMCPY(buf, tmp2, AES_BLOCK_SIZE); - if (sz >= AES_BLOCK_SIZE) { /* extra sanity check before copy */ - RESTORE_VECTOR_REGISTERS(); - return BUFFER_E; - } - XMEMCPY(buf, in, sz); - XMEMCPY(out, tmp2, sz); - - xorbuf(buf, tmp, AES_BLOCK_SIZE); - ret = wc_AesDecryptDirect(aes, tmp2, buf); - if (ret != 0) { - RESTORE_VECTOR_REGISTERS(); - return ret; - } - xorbuf(tmp2, tmp, AES_BLOCK_SIZE); - XMEMCPY(out - AES_BLOCK_SIZE, tmp2, AES_BLOCK_SIZE); - } - RESTORE_VECTOR_REGISTERS(); - } - else { - WOLFSSL_MSG("Plain text input too small for encryption"); + if (sz < AES_BLOCK_SIZE) { + WOLFSSL_MSG("Cipher text input too small for decryption"); return BAD_FUNC_ARG; } - return ret; 
+#ifdef WOLFSSL_AESNI + #if defined(HAVE_INTEL_AVX1) + if (IS_INTEL_AVX1(intel_flags)) { + SAVE_VECTOR_REGISTERS(return _svr_ret;); + AES_XTS_decrypt_avx1(in, out, sz, i, (const byte*)xaes->aes.key, + (const byte*)xaes->tweak.key, (int)xaes->aes.rounds); + RESTORE_VECTOR_REGISTERS(); + return 0; + } + else + #endif + if (haveAESNI) { + AES_XTS_decrypt(in, out, sz, i, (const byte*)xaes->aes.key, + (const byte*)xaes->tweak.key, (int)xaes->aes.rounds); + return 0; + } + else +#endif + { + return AesXtsDecrypt_sw(xaes, out, in, sz, i); + } } - #endif /* WOLFSSL_AES_XTS */ #ifdef WOLFSSL_AES_SIV diff --git a/wolfcrypt/src/aes_gcm_asm.S b/wolfcrypt/src/aes_gcm_asm.S index 629c72b8c..b17afcf3e 100644 --- a/wolfcrypt/src/aes_gcm_asm.S +++ b/wolfcrypt/src/aes_gcm_asm.S @@ -1,5 +1,5 @@ -/* aes_gcm_asm - * +/* aes_gcm_asm.S */ +/* * Copyright (C) 2006-2023 wolfSSL Inc. * * This file is part of wolfSSL. diff --git a/wolfcrypt/src/aes_gcm_asm.asm b/wolfcrypt/src/aes_gcm_asm.asm index ff52a93db..348f2846d 100644 --- a/wolfcrypt/src/aes_gcm_asm.asm +++ b/wolfcrypt/src/aes_gcm_asm.asm @@ -1,5 +1,5 @@ -; /* aes_gcm_asm -; * +; /* aes_gcm_asm.asm */ +; /* ; * Copyright (C) 2006-2023 wolfSSL Inc. ; * ; * This file is part of wolfSSL. diff --git a/wolfcrypt/src/aes_xts_asm.S b/wolfcrypt/src/aes_xts_asm.S new file mode 100644 index 000000000..bba34a2e8 --- /dev/null +++ b/wolfcrypt/src/aes_xts_asm.S @@ -0,0 +1,1449 @@ +/* aes_xts_asm.S */ +/* + * Copyright (C) 2006-2023 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +#ifdef WOLFSSL_USER_SETTINGS +#ifdef WOLFSSL_USER_SETTINGS_ASM +/* + * user_settings_asm.h is a file generated by the script user_settings_asm.sh. + * The script takes in a user_settings.h and produces user_settings_asm.h, which + * is a stripped down version of user_settings.h containing only preprocessor + * directives. This makes the header safe to include in assembly (.S) files. + */ +#include "user_settings_asm.h" +#else +/* + * Note: if user_settings.h contains any C code (e.g. a typedef or function + * prototype), including it here in an assembly (.S) file will cause an + * assembler failure. See user_settings_asm.h above. 
+ */ +#include "user_settings.h" +#endif /* WOLFSSL_USER_SETTINGS_ASM */ +#endif /* WOLFSSL_USER_SETTINGS */ + +#ifndef HAVE_INTEL_AVX1 +#define HAVE_INTEL_AVX1 +#endif /* HAVE_INTEL_AVX1 */ +#ifndef NO_AVX2_SUPPORT +#define HAVE_INTEL_AVX2 +#endif /* NO_AVX2_SUPPORT */ + +#ifdef WOLFSSL_AES_XTS +#ifdef WOLFSSL_X86_64_BUILD +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_aes_xts_gc_xts: +.long 0x87,0x1,0x1,0x1 +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt +.type AES_XTS_encrypt,@function +.align 16 +AES_XTS_encrypt: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt +.p2align 4 +_AES_XTS_encrypt: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + movdqu L_aes_xts_gc_xts(%rip), %xmm12 + movdqu (%r12), %xmm0 + # aes_enc_block + pxor (%r9), %xmm0 + movdqu 16(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 32(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 48(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 64(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 80(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 96(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 112(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 128(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 144(%r9), %xmm5 + aesenc %xmm5, %xmm0 + cmpl $11, %r10d + movdqu 160(%r9), %xmm5 + jl L_AES_XTS_encrypt_tweak_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 176(%r9), %xmm6 + aesenc %xmm6, %xmm0 + cmpl $13, %r10d + movdqu 192(%r9), %xmm5 + jl L_AES_XTS_encrypt_tweak_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 208(%r9), %xmm6 + aesenc %xmm6, %xmm0 + movdqu 224(%r9), %xmm5 +L_AES_XTS_encrypt_tweak_aes_enc_block_last: + aesenclast %xmm5, %xmm0 + xorl %r13d, %r13d + cmpl $0x40, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_encrypt_enc_64: + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + movdqu (%rcx), %xmm8 + movdqu 16(%rcx), %xmm9 + movdqu 32(%rcx), %xmm10 + movdqu 48(%rcx), %xmm11 + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm1 + psrad $31, %xmm4 + pslld $0x01, %xmm1 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm4 + movdqa %xmm1, %xmm2 + psrad $31, %xmm4 + pslld $0x01, %xmm2 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm2 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm3 + psrad $31, %xmm4 + pslld $0x01, %xmm3 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm3 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + # aes_enc_block + movdqu (%r8), %xmm4 + pxor %xmm4, %xmm8 + pxor %xmm4, %xmm9 + pxor %xmm4, %xmm10 + pxor %xmm4, %xmm11 + movdqu 16(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 32(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 48(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 64(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 80(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 96(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 112(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 128(%r8), %xmm4 + aesenc %xmm4, 
%xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 144(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + cmpl $11, %r10d + movdqu 160(%r8), %xmm4 + jl L_AES_XTS_encrypt_aes_enc_64_aes_enc_block_last + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 176(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + cmpl $13, %r10d + movdqu 192(%r8), %xmm4 + jl L_AES_XTS_encrypt_aes_enc_64_aes_enc_block_last + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 208(%r8), %xmm4 + aesenc %xmm4, %xmm8 + aesenc %xmm4, %xmm9 + aesenc %xmm4, %xmm10 + aesenc %xmm4, %xmm11 + movdqu 224(%r8), %xmm4 +L_AES_XTS_encrypt_aes_enc_64_aes_enc_block_last: + aesenclast %xmm4, %xmm8 + aesenclast %xmm4, %xmm9 + aesenclast %xmm4, %xmm10 + aesenclast %xmm4, %xmm11 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + movdqu %xmm8, (%rdx) + movdqu %xmm9, 16(%rdx) + movdqu %xmm10, 32(%rdx) + movdqu %xmm11, 48(%rdx) + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm0 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $0x40, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_enc_64 +L_AES_XTS_encrypt_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_encrypt_done_enc + subl %r13d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl L_AES_XTS_encrypt_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_enc_16: + leaq (%rdi,%r13,1), %rcx + movdqu (%rcx), %xmm8 + pxor %xmm0, %xmm8 + # aes_enc_block + pxor (%r8), %xmm8 + movdqu 16(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 32(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 48(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 64(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 80(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 96(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 112(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 128(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 144(%r8), %xmm5 + aesenc %xmm5, %xmm8 + cmpl $11, %r10d + movdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 176(%r8), %xmm6 + aesenc %xmm6, %xmm8 + cmpl $13, %r10d + movdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 208(%r8), %xmm6 + aesenc %xmm6, %xmm8 + movdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_aes_enc_block_last: + aesenclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + leaq (%rsi,%r13,1), %rcx + movdqu %xmm8, (%rcx) + movdqa %xmm0, %xmm4 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_enc_16 + cmpl %eax, %r13d + je L_AES_XTS_encrypt_done_enc +L_AES_XTS_encrypt_last_15: + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + movdqu (%rcx), %xmm8 + addq $16, %r13 + movdqu %xmm8, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_encrypt_last_15_byte_loop + subq %rdx, %r13 + movdqu (%rsp), %xmm8 + subq $16, %r13 + pxor %xmm0, %xmm8 + # aes_enc_block + pxor (%r8), %xmm8 + movdqu 16(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 32(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 48(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 64(%r8), %xmm5 + aesenc 
%xmm5, %xmm8 + movdqu 80(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 96(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 112(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 128(%r8), %xmm5 + aesenc %xmm5, %xmm8 + movdqu 144(%r8), %xmm5 + aesenc %xmm5, %xmm8 + cmpl $11, %r10d + movdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_last_15_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 176(%r8), %xmm6 + aesenc %xmm6, %xmm8 + cmpl $13, %r10d + movdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_last_15_aes_enc_block_last + aesenc %xmm5, %xmm8 + movdqu 208(%r8), %xmm6 + aesenc %xmm6, %xmm8 + movdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_last_15_aes_enc_block_last: + aesenclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + leaq (%rsi,%r13,1), %rcx + movdqu %xmm8, (%rcx) +L_AES_XTS_encrypt_done_enc: + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt,.-AES_XTS_encrypt +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt +.type AES_XTS_decrypt,@function +.align 16 +AES_XTS_decrypt: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt +.p2align 4 +_AES_XTS_decrypt: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $16, %rsp + movdqu L_aes_xts_gc_xts(%rip), %xmm12 + movdqu (%r12), %xmm0 + # aes_enc_block + pxor (%r9), %xmm0 + movdqu 16(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 32(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 48(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 64(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 80(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 96(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 112(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 128(%r9), %xmm5 + aesenc %xmm5, %xmm0 + movdqu 144(%r9), %xmm5 + aesenc %xmm5, %xmm0 + cmpl $11, %r10d + movdqu 160(%r9), %xmm5 + jl L_AES_XTS_decrypt_tweak_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 176(%r9), %xmm6 + aesenc %xmm6, %xmm0 + cmpl $13, %r10d + movdqu 192(%r9), %xmm5 + jl L_AES_XTS_decrypt_tweak_aes_enc_block_last + aesenc %xmm5, %xmm0 + movdqu 208(%r9), %xmm6 + aesenc %xmm6, %xmm0 + movdqu 224(%r9), %xmm5 +L_AES_XTS_decrypt_tweak_aes_enc_block_last: + aesenclast %xmm5, %xmm0 + xorl %r13d, %r13d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_mul16_64 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_last_31_start +L_AES_XTS_decrypt_mul16_64: + cmpl $0x40, %r11d + jl L_AES_XTS_decrypt_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_decrypt_dec_64: + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + movdqu (%rcx), %xmm8 + movdqu 16(%rcx), %xmm9 + movdqu 32(%rcx), %xmm10 + movdqu 48(%rcx), %xmm11 + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm1 + psrad $31, %xmm4 + pslld $0x01, %xmm1 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm1, %xmm4 + movdqa %xmm1, %xmm2 + psrad $31, %xmm4 + pslld $0x01, %xmm2 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm2 + movdqa %xmm2, %xmm4 + movdqa %xmm2, %xmm3 + psrad $31, %xmm4 + pslld $0x01, %xmm3 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm3 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + # aes_dec_block + movdqu (%r8), %xmm4 + pxor %xmm4, %xmm8 + pxor %xmm4, %xmm9 + pxor %xmm4, %xmm10 + pxor %xmm4, %xmm11 + movdqu 16(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 32(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + 
aesdec %xmm4, %xmm11 + movdqu 48(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 64(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 80(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 96(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 112(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 128(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 144(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + cmpl $11, %r10d + movdqu 160(%r8), %xmm4 + jl L_AES_XTS_decrypt_aes_dec_64_aes_dec_block_last + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 176(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + cmpl $13, %r10d + movdqu 192(%r8), %xmm4 + jl L_AES_XTS_decrypt_aes_dec_64_aes_dec_block_last + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 208(%r8), %xmm4 + aesdec %xmm4, %xmm8 + aesdec %xmm4, %xmm9 + aesdec %xmm4, %xmm10 + aesdec %xmm4, %xmm11 + movdqu 224(%r8), %xmm4 +L_AES_XTS_decrypt_aes_dec_64_aes_dec_block_last: + aesdeclast %xmm4, %xmm8 + aesdeclast %xmm4, %xmm9 + aesdeclast %xmm4, %xmm10 + aesdeclast %xmm4, %xmm11 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + movdqu %xmm8, (%rdx) + movdqu %xmm9, 16(%rdx) + movdqu %xmm10, 32(%rdx) + movdqu %xmm11, 48(%rdx) + movdqa %xmm3, %xmm4 + movdqa %xmm3, %xmm0 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $0x40, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_dec_64 +L_AES_XTS_decrypt_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_mul16 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_mul16: +L_AES_XTS_decrypt_dec_16: + # 16 bytes of input + leaq (%rdi,%r13,1), %rcx + movdqu (%rcx), %xmm8 + pxor %xmm0, %xmm8 + # aes_dec_block + pxor (%r8), %xmm8 + movdqu 16(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 32(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 48(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 64(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 80(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 96(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 112(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 128(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 144(%r8), %xmm5 + aesdec %xmm5, %xmm8 + cmpl $11, %r10d + movdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 176(%r8), %xmm6 + aesdec %xmm6, %xmm8 + cmpl $13, %r10d + movdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 208(%r8), %xmm6 + aesdec %xmm6, %xmm8 + movdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_aes_dec_block_last: + aesdeclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + leaq (%rsi,%r13,1), %rcx + movdqu %xmm8, (%rcx) + movdqa %xmm0, %xmm4 + psrad $31, %xmm4 + pslld $0x01, %xmm0 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm0 + addl $16, %r13d + cmpl %r11d, %r13d + jl 
L_AES_XTS_decrypt_dec_16 + cmpl %eax, %r13d + je L_AES_XTS_decrypt_done_dec +L_AES_XTS_decrypt_last_31_start: + movdqa %xmm0, %xmm4 + movdqa %xmm0, %xmm7 + psrad $31, %xmm4 + pslld $0x01, %xmm7 + pshufd $0x93, %xmm4, %xmm4 + pand %xmm12, %xmm4 + pxor %xmm4, %xmm7 + leaq (%rdi,%r13,1), %rcx + movdqu (%rcx), %xmm8 + pxor %xmm7, %xmm8 + # aes_dec_block + pxor (%r8), %xmm8 + movdqu 16(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 32(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 48(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 64(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 80(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 96(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 112(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 128(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 144(%r8), %xmm5 + aesdec %xmm5, %xmm8 + cmpl $11, %r10d + movdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_last_31_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 176(%r8), %xmm6 + aesdec %xmm6, %xmm8 + cmpl $13, %r10d + movdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_last_31_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 208(%r8), %xmm6 + aesdec %xmm6, %xmm8 + movdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_last_31_aes_dec_block_last: + aesdeclast %xmm5, %xmm8 + pxor %xmm7, %xmm8 + movdqu %xmm8, (%rsp) + addq $16, %r13 + xorq %rdx, %rdx +L_AES_XTS_decrypt_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_decrypt_last_31_byte_loop + subq %rdx, %r13 + movdqu (%rsp), %xmm8 + pxor %xmm0, %xmm8 + # aes_dec_block + pxor (%r8), %xmm8 + movdqu 16(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 32(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 48(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 64(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 80(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 96(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 112(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 128(%r8), %xmm5 + aesdec %xmm5, %xmm8 + movdqu 144(%r8), %xmm5 + aesdec %xmm5, %xmm8 + cmpl $11, %r10d + movdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_last_31_2_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 176(%r8), %xmm6 + aesdec %xmm6, %xmm8 + cmpl $13, %r10d + movdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_last_31_2_aes_dec_block_last + aesdec %xmm5, %xmm8 + movdqu 208(%r8), %xmm6 + aesdec %xmm6, %xmm8 + movdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_last_31_2_aes_dec_block_last: + aesdeclast %xmm5, %xmm8 + pxor %xmm0, %xmm8 + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + movdqu %xmm8, (%rcx) +L_AES_XTS_decrypt_done_dec: + addq $16, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt,.-AES_XTS_decrypt +#endif /* __APPLE__ */ +#ifdef HAVE_INTEL_AVX1 +#ifndef __APPLE__ +.data +#else +.section __DATA,__data +#endif /* __APPLE__ */ +L_avx1_aes_xts_gc_xts: +.long 0x87,0x1,0x1,0x1 +#ifndef __APPLE__ +.text +.globl AES_XTS_encrypt_avx1 +.type AES_XTS_encrypt_avx1,@function +.align 16 +AES_XTS_encrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_XTS_encrypt_avx1 +.p2align 4 +_AES_XTS_encrypt_avx1: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $0x40, %rsp + vmovdqu L_avx1_aes_xts_gc_xts(%rip), %xmm12 + vmovdqu (%r12), %xmm0 + # aes_enc_block + vpxor (%r9), %xmm0, %xmm0 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm0, 
%xmm0 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_encrypt_avx1_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_encrypt_avx1_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_encrypt_avx1_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + xorl %r13d, %r13d + cmpl $0x40, %eax + movl %eax, %r11d + jl L_AES_XTS_encrypt_avx1_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_encrypt_avx1_enc_64: + # 64 bytes of input + # aes_enc_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %xmm8 + vmovdqu 16(%rcx), %xmm9 + vmovdqu 32(%rcx), %xmm10 + vmovdqu 48(%rcx), %xmm11 + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm1 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm1, %xmm1 + vpsrad $31, %xmm1, %xmm4 + vpslld $0x01, %xmm1, %xmm2 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vpsrad $31, %xmm2, %xmm4 + vpslld $0x01, %xmm2, %xmm3 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + # aes_enc_block + vmovdqu (%r8), %xmm4 + vpxor %xmm4, %xmm8, %xmm8 + vpxor %xmm4, %xmm9, %xmm9 + vpxor %xmm4, %xmm10, %xmm10 + vpxor %xmm4, %xmm11, %xmm11 + vmovdqu 16(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 32(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 48(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 64(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 80(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 96(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 112(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 128(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 144(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm4 + jl L_AES_XTS_encrypt_avx1_aes_enc_64_aes_enc_block_last + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 176(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + cmpl $13, %r10d + 
vmovdqu 192(%r8), %xmm4 + jl L_AES_XTS_encrypt_avx1_aes_enc_64_aes_enc_block_last + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 208(%r8), %xmm4 + vaesenc %xmm4, %xmm8, %xmm8 + vaesenc %xmm4, %xmm9, %xmm9 + vaesenc %xmm4, %xmm10, %xmm10 + vaesenc %xmm4, %xmm11, %xmm11 + vmovdqu 224(%r8), %xmm4 +L_AES_XTS_encrypt_avx1_aes_enc_64_aes_enc_block_last: + vaesenclast %xmm4, %xmm8, %xmm8 + vaesenclast %xmm4, %xmm9, %xmm9 + vaesenclast %xmm4, %xmm10, %xmm10 + vaesenclast %xmm4, %xmm11, %xmm11 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + vmovdqu %xmm8, (%rdx) + vmovdqu %xmm9, 16(%rdx) + vmovdqu %xmm10, 32(%rdx) + vmovdqu %xmm11, 48(%rdx) + vpsrad $31, %xmm3, %xmm4 + vpslld $0x01, %xmm3, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $0x40, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_avx1_enc_64 +L_AES_XTS_encrypt_avx1_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_encrypt_avx1_done_enc + subl %r13d, %r11d + cmpl $16, %r11d + movl %eax, %r11d + jl L_AES_XTS_encrypt_avx1_last_15 + andl $0xfffffff0, %r11d + # 16 bytes of input +L_AES_XTS_encrypt_avx1_enc_16: + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + # aes_enc_block + vpxor (%r8), %xmm8, %xmm8 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx1_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx1_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_avx1_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm8, (%rcx) + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_encrypt_avx1_enc_16 + cmpl %eax, %r13d + je L_AES_XTS_encrypt_avx1_done_enc +L_AES_XTS_encrypt_avx1_last_15: + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu (%rcx), %xmm8 + addq $16, %r13 + vmovdqu %xmm8, (%rsp) + xorq %rdx, %rdx +L_AES_XTS_encrypt_avx1_last_15_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_encrypt_avx1_last_15_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm8 + subq $16, %r13 + vpxor %xmm0, %xmm8, %xmm8 + # aes_enc_block + vpxor (%r8), %xmm8, %xmm8 + vmovdqu 16(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 
96(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r8), %xmm5 + vaesenc %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx1_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r8), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_encrypt_avx1_last_15_aes_enc_block_last + vaesenc %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r8), %xmm6 + vaesenc %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_encrypt_avx1_last_15_aes_enc_block_last: + vaesenclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm8, (%rcx) +L_AES_XTS_encrypt_avx1_done_enc: + vzeroupper + addq $0x40, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_encrypt_avx1,.-AES_XTS_encrypt_avx1 +#endif /* __APPLE__ */ +#ifndef __APPLE__ +.text +.globl AES_XTS_decrypt_avx1 +.type AES_XTS_decrypt_avx1,@function +.align 16 +AES_XTS_decrypt_avx1: +#else +.section __TEXT,__text +.globl _AES_XTS_decrypt_avx1 +.p2align 4 +_AES_XTS_decrypt_avx1: +#endif /* __APPLE__ */ + pushq %r12 + pushq %r13 + movq %rdx, %rax + movq %rcx, %r12 + movl 24(%rsp), %r10d + subq $16, %rsp + vmovdqu L_avx1_aes_xts_gc_xts(%rip), %xmm12 + vmovdqu (%r12), %xmm0 + # aes_enc_block + vpxor (%r9), %xmm0, %xmm0 + vmovdqu 16(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 32(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 48(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 64(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 80(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 96(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 112(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 128(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 144(%r9), %xmm5 + vaesenc %xmm5, %xmm0, %xmm0 + cmpl $11, %r10d + vmovdqu 160(%r9), %xmm5 + jl L_AES_XTS_decrypt_avx1_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 176(%r9), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + cmpl $13, %r10d + vmovdqu 192(%r9), %xmm5 + jl L_AES_XTS_decrypt_avx1_tweak_aes_enc_block_last + vaesenc %xmm5, %xmm0, %xmm0 + vmovdqu 208(%r9), %xmm6 + vaesenc %xmm6, %xmm0, %xmm0 + vmovdqu 224(%r9), %xmm5 +L_AES_XTS_decrypt_avx1_tweak_aes_enc_block_last: + vaesenclast %xmm5, %xmm0, %xmm0 + xorl %r13d, %r13d + movl %eax, %r11d + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx1_mul16_64 + subl $16, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx1_last_31_start +L_AES_XTS_decrypt_avx1_mul16_64: + cmpl $0x40, %r11d + jl L_AES_XTS_decrypt_avx1_done_64 + andl $0xffffffc0, %r11d +L_AES_XTS_decrypt_avx1_dec_64: + # 64 bytes of input + # aes_dec_64 + leaq (%rdi,%r13,1), %rcx + leaq (%rsi,%r13,1), %rdx + vmovdqu (%rcx), %xmm8 + vmovdqu 16(%rcx), %xmm9 + vmovdqu 32(%rcx), %xmm10 + vmovdqu 48(%rcx), %xmm11 + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm1 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm1, %xmm1 + vpsrad $31, %xmm1, %xmm4 + vpslld $0x01, %xmm1, %xmm2 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm2, %xmm2 + vpsrad $31, %xmm2, %xmm4 + vpslld $0x01, %xmm2, %xmm3 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + # aes_dec_block + vmovdqu (%r8), %xmm4 + vpxor %xmm4, 
%xmm8, %xmm8 + vpxor %xmm4, %xmm9, %xmm9 + vpxor %xmm4, %xmm10, %xmm10 + vpxor %xmm4, %xmm11, %xmm11 + vmovdqu 16(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 32(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 48(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 64(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 80(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 96(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 112(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 128(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 144(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm4 + jl L_AES_XTS_decrypt_avx1_aes_dec_64_aes_dec_block_last + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 176(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm4 + jl L_AES_XTS_decrypt_avx1_aes_dec_64_aes_dec_block_last + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 208(%r8), %xmm4 + vaesdec %xmm4, %xmm8, %xmm8 + vaesdec %xmm4, %xmm9, %xmm9 + vaesdec %xmm4, %xmm10, %xmm10 + vaesdec %xmm4, %xmm11, %xmm11 + vmovdqu 224(%r8), %xmm4 +L_AES_XTS_decrypt_avx1_aes_dec_64_aes_dec_block_last: + vaesdeclast %xmm4, %xmm8, %xmm8 + vaesdeclast %xmm4, %xmm9, %xmm9 + vaesdeclast %xmm4, %xmm10, %xmm10 + vaesdeclast %xmm4, %xmm11, %xmm11 + vpxor %xmm0, %xmm8, %xmm8 + vpxor %xmm1, %xmm9, %xmm9 + vpxor %xmm2, %xmm10, %xmm10 + vpxor %xmm3, %xmm11, %xmm11 + vmovdqu %xmm8, (%rdx) + vmovdqu %xmm9, 16(%rdx) + vmovdqu %xmm10, 32(%rdx) + vmovdqu %xmm11, 48(%rdx) + vpsrad $31, %xmm3, %xmm4 + vpslld $0x01, %xmm3, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $0x40, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_avx1_dec_64 +L_AES_XTS_decrypt_avx1_done_64: + cmpl %eax, %r13d + movl %eax, %r11d + je L_AES_XTS_decrypt_avx1_done_dec + andl $0xfffffff0, %r11d + cmpl %eax, %r11d + je L_AES_XTS_decrypt_avx1_mul16 + subl $16, %r11d + subl %r13d, %r11d + cmpl $16, %r11d + jl L_AES_XTS_decrypt_avx1_last_31_start + addl %r13d, %r11d +L_AES_XTS_decrypt_avx1_mul16: +L_AES_XTS_decrypt_avx1_dec_16: + # 16 bytes of input + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + # aes_dec_block + vpxor (%r8), %xmm8, %xmm8 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r8), 
%xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx1_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx1_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx1_aes_dec_block_last: + vaesdeclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm8, (%rcx) + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm0 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm0, %xmm0 + addl $16, %r13d + cmpl %r11d, %r13d + jl L_AES_XTS_decrypt_avx1_dec_16 + cmpl %eax, %r13d + je L_AES_XTS_decrypt_avx1_done_dec +L_AES_XTS_decrypt_avx1_last_31_start: + vpsrad $31, %xmm0, %xmm4 + vpslld $0x01, %xmm0, %xmm7 + vpshufd $0x93, %xmm4, %xmm4 + vpand %xmm12, %xmm4, %xmm4 + vpxor %xmm4, %xmm7, %xmm7 + leaq (%rdi,%r13,1), %rcx + vmovdqu (%rcx), %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + # aes_dec_block + vpxor (%r8), %xmm8, %xmm8 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx1_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + cmpl $13, %r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx1_last_31_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx1_last_31_aes_dec_block_last: + vaesdeclast %xmm5, %xmm8, %xmm8 + vpxor %xmm7, %xmm8, %xmm8 + vmovdqu %xmm8, (%rsp) + addq $16, %r13 + xorq %rdx, %rdx +L_AES_XTS_decrypt_avx1_last_31_byte_loop: + movb (%rsp,%rdx,1), %r11b + movb (%rdi,%r13,1), %cl + movb %r11b, (%rsi,%r13,1) + movb %cl, (%rsp,%rdx,1) + incl %r13d + incl %edx + cmpl %eax, %r13d + jl L_AES_XTS_decrypt_avx1_last_31_byte_loop + subq %rdx, %r13 + vmovdqu (%rsp), %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + # aes_dec_block + vpxor (%r8), %xmm8, %xmm8 + vmovdqu 16(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 32(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 48(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 64(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 80(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 96(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 112(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 128(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 144(%r8), %xmm5 + vaesdec %xmm5, %xmm8, %xmm8 + cmpl $11, %r10d + vmovdqu 160(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx1_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 176(%r8), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + cmpl $13, 
%r10d + vmovdqu 192(%r8), %xmm5 + jl L_AES_XTS_decrypt_avx1_last_31_2_aes_dec_block_last + vaesdec %xmm5, %xmm8, %xmm8 + vmovdqu 208(%r8), %xmm6 + vaesdec %xmm6, %xmm8, %xmm8 + vmovdqu 224(%r8), %xmm5 +L_AES_XTS_decrypt_avx1_last_31_2_aes_dec_block_last: + vaesdeclast %xmm5, %xmm8, %xmm8 + vpxor %xmm0, %xmm8, %xmm8 + subq $16, %r13 + leaq (%rsi,%r13,1), %rcx + vmovdqu %xmm8, (%rcx) +L_AES_XTS_decrypt_avx1_done_dec: + vzeroupper + addq $16, %rsp + popq %r13 + popq %r12 + repz retq +#ifndef __APPLE__ +.size AES_XTS_decrypt_avx1,.-AES_XTS_decrypt_avx1 +#endif /* __APPLE__ */ +#endif /* HAVE_INTEL_AVX1 */ +#endif /* WOLFSSL_X86_64_BUILD */ +#endif /* WOLFSSL_AES_XTS */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/wolfcrypt/src/chacha_asm.S b/wolfcrypt/src/chacha_asm.S index 6fafafab9..9ca854cb4 100644 --- a/wolfcrypt/src/chacha_asm.S +++ b/wolfcrypt/src/chacha_asm.S @@ -1,5 +1,5 @@ -/* chacha_asm - * +/* chacha_asm.S */ +/* * Copyright (C) 2006-2023 wolfSSL Inc. * * This file is part of wolfSSL. diff --git a/wolfcrypt/src/fe_x25519_asm.S b/wolfcrypt/src/fe_x25519_asm.S index 2f07e3a11..7f6192acd 100644 --- a/wolfcrypt/src/fe_x25519_asm.S +++ b/wolfcrypt/src/fe_x25519_asm.S @@ -1,5 +1,5 @@ -/* fe_x25519_asm - * +/* fe_x25519_asm.S */ +/* * Copyright (C) 2006-2023 wolfSSL Inc. * * This file is part of wolfSSL. diff --git a/wolfcrypt/src/poly1305_asm.S b/wolfcrypt/src/poly1305_asm.S index 7eb5e02ba..ba1d29449 100644 --- a/wolfcrypt/src/poly1305_asm.S +++ b/wolfcrypt/src/poly1305_asm.S @@ -1,5 +1,5 @@ -/* poly1305_asm - * +/* poly1305_asm.S */ +/* * Copyright (C) 2006-2023 wolfSSL Inc. * * This file is part of wolfSSL. diff --git a/wolfcrypt/src/sha256_asm.S b/wolfcrypt/src/sha256_asm.S index 3adac1e79..6d1c8ea79 100644 --- a/wolfcrypt/src/sha256_asm.S +++ b/wolfcrypt/src/sha256_asm.S @@ -1,5 +1,5 @@ -/* sha256_asm - * +/* sha256_asm.S */ +/* * Copyright (C) 2006-2023 wolfSSL Inc. * * This file is part of wolfSSL. diff --git a/wolfcrypt/src/sha3_asm.S b/wolfcrypt/src/sha3_asm.S index 99c90d65a..07a0b140b 100644 --- a/wolfcrypt/src/sha3_asm.S +++ b/wolfcrypt/src/sha3_asm.S @@ -1,5 +1,5 @@ -/* sha3_asm - * +/* sha3_asm.S */ +/* * Copyright (C) 2006-2023 wolfSSL Inc. * * This file is part of wolfSSL. diff --git a/wolfcrypt/src/sha512_asm.S b/wolfcrypt/src/sha512_asm.S index 83f60047c..47789e83a 100644 --- a/wolfcrypt/src/sha512_asm.S +++ b/wolfcrypt/src/sha512_asm.S @@ -1,5 +1,5 @@ -/* sha512_asm - * +/* sha512_asm.S */ +/* * Copyright (C) 2006-2023 wolfSSL Inc. * * This file is part of wolfSSL. 
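Both the portable fallback (AesXtsEncrypt_sw / AesXtsDecrypt_sw in aes.c above) and the new aes_xts_asm.S routines advance the XTS tweak between blocks by multiplying it by x in GF(2^128): the 128-bit tweak is shifted left one bit in little-endian byte order, and a carry out of the top bit wraps around as an xor of 0x87 into the low byte. The assembly performs the same doubling on a whole xmm register at once using psrad/pslld/pshufd/pand against the L_aes_xts_gc_xts constant (0x87,1,1,1). Below is a minimal standalone C sketch of the scalar step, mirroring the "multiply by shift left and propagate carry" loop in the patch; the names xts_tweak_mul_x, XTS_BLOCK_SIZE and XTS_GF_POLY are local to this sketch, not wolfSSL identifiers.

#include <stdint.h>

#define XTS_BLOCK_SIZE 16
#define XTS_GF_POLY    0x87  /* reduction constant; the patch calls it GF_XTS */

/* Multiply a 128-bit XTS tweak (little-endian byte order) by x in GF(2^128).
 * Scalar equivalent of the carry-propagation loop in AesXtsEncrypt_sw(). */
static void xts_tweak_mul_x(uint8_t tweak[XTS_BLOCK_SIZE])
{
    uint8_t carry = 0;
    int j;

    for (j = 0; j < XTS_BLOCK_SIZE; j++) {
        uint8_t next = (uint8_t)((tweak[j] >> 7) & 0x01); /* bit shifted out */
        tweak[j] = (uint8_t)((tweak[j] << 1) + carry);    /* shift in previous carry */
        carry = next;
    }
    if (carry) {
        tweak[0] ^= XTS_GF_POLY;  /* carry out of bit 127 wraps around */
    }
}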
diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 002437f9e..c3dc1e26f 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -9501,6 +9501,61 @@ static wc_test_ret_t aes_xts_128_test(void) #endif /* !HAVE_FIPS || FIPS_VERSION_GE(5,3) */ +#if !defined(BENCH_EMBEDDED) && !defined(HAVE_CAVIUM) + { + #define LARGE_XTS_SZ 1024 + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC) + byte* large_input = (byte *)XMALLOC(LARGE_XTS_SZ, HEAP_HINT, + DYNAMIC_TYPE_TMP_BUFFER); + #else + byte large_input[LARGE_XTS_SZ]; + #endif + int i; + int j; + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC) + if (large_input == NULL) + ERROR_OUT(WC_TEST_RET_ENC_EC(MEMORY_E), out); + #endif + + for (i = 0; i < (int)LARGE_XTS_SZ; i++) + large_input[i] = (byte)i; + + for (j = 16; j < (int)LARGE_XTS_SZ; j++) { + ret = wc_AesXtsSetKey(aes, k1, sizeof(k1), AES_ENCRYPTION, + HEAP_HINT, devId); + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + ret = wc_AesXtsEncrypt(aes, large_input, large_input, j, i1, + sizeof(i1)); + #if defined(WOLFSSL_ASYNC_CRYPT) + ret = wc_AsyncWait(ret, &aes->aes.asyncDev, WC_ASYNC_FLAG_NONE); + #endif + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + + ret = wc_AesXtsSetKey(aes, k1, sizeof(k1), AES_DECRYPTION, + HEAP_HINT, devId); + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + ret = wc_AesXtsDecrypt(aes, large_input, large_input, j, i1, + sizeof(i1)); + #if defined(WOLFSSL_ASYNC_CRYPT) + ret = wc_AsyncWait(ret, &aes->aes.asyncDev, WC_ASYNC_FLAG_NONE); + #endif + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + for (i = 0; i < j; i++) { + if (large_input[i] != (byte)i) { + ERROR_OUT(WC_TEST_RET_ENC_NC, out); + } + } + } + #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC) + XFREE(large_input, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER); + #endif + } +#endif + out: if (aes_inited)
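The new large-buffer loop in test.c exercises the same public XTS API that the AES-NI/AVX1 dispatch now accelerates, encrypting and decrypting in place at every length from 16 bytes up. A reduced usage sketch of that API follows, assuming a build configured with --enable-xts; the key and tweak bytes are placeholders, the heap hint and device id are simply NULL/INVALID_DEVID rather than the test harness's HEAP_HINT/devId, and error handling is trimmed to early returns.

#include <wolfssl/options.h>       /* generated by configure */
#include <wolfssl/wolfcrypt/aes.h>
#include <string.h>

int xts_roundtrip_example(void)
{
    XtsAes xts;
    /* AES-128-XTS takes a double-length key: 16 bytes for the data key
     * followed by 16 bytes for the tweak key (all-zero placeholder here). */
    byte key[32] = { 0 };
    byte tweak[AES_BLOCK_SIZE] = { 0 };  /* placeholder sector tweak */
    byte plain[64], cipher[64], decrypted[64];
    int ret;

    memset(plain, 0xA5, sizeof(plain));

    ret = wc_AesXtsSetKey(&xts, key, sizeof(key), AES_ENCRYPTION,
                          NULL, INVALID_DEVID);
    if (ret != 0) return ret;
    ret = wc_AesXtsEncrypt(&xts, cipher, plain, sizeof(plain),
                           tweak, sizeof(tweak));
    wc_AesXtsFree(&xts);
    if (ret != 0) return ret;

    ret = wc_AesXtsSetKey(&xts, key, sizeof(key), AES_DECRYPTION,
                          NULL, INVALID_DEVID);
    if (ret != 0) return ret;
    ret = wc_AesXtsDecrypt(&xts, decrypted, cipher, sizeof(cipher),
                           tweak, sizeof(tweak));
    wc_AesXtsFree(&xts);
    if (ret != 0) return ret;

    return (memcmp(plain, decrypted, sizeof(plain)) == 0) ? 0 : -1;
}

With --enable-intelasm or --enable-aesni on x86_64, these same calls route through the AES_XTS_encrypt/AES_XTS_decrypt routines (or their _avx1 variants) added in aes_xts_asm.S; otherwise they fall back to AesXtsEncrypt_sw/AesXtsDecrypt_sw.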