Pull out x86_64 ASM into separate files

Sean Parkinson 2019-01-25 12:03:08 +10:00
parent d16c2ca7c6
commit 7822cef1ac
17 changed files with 70054 additions and 34596 deletions


@@ -1063,6 +1063,7 @@ then
fi
AM_CONDITIONAL([BUILD_AESNI], [test "x$ENABLED_AESNI" = "xyes"])
AM_CONDITIONAL([BUILD_INTELASM], [test "x$ENABLED_INTELASM" = "xyes"])
# Linux af_alg
@@ -3554,191 +3555,6 @@ then
fi
fi
# Single Precision maths implementation
AC_ARG_ENABLE([sp],
[AS_HELP_STRING([--enable-sp],[Enable Single Precision maths implementation (default: disabled)])],
[ ENABLED_SP=$enableval ],
[ ENABLED_SP=no ],
)
ENABLED_SP_RSA=no
ENABLED_SP_DH=no
ENABLED_SP_ECC=no
for v in `echo $ENABLED_SP | tr "," " "`
do
case $v in
small)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
ENABLED_SP_ECC=yes
;;
yes)
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
ENABLED_SP_ECC=yes
;;
no)
;;
smallec256 | smallp256 | small256)
ENABLED_SP_ECC=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
;;
ec256 | p256 | 256)
ENABLED_SP_ECC=yes
;;
small2048)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
;;
2048)
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
;;
smallrsa2048)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
;;
rsa2048)
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
;;
small3072)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
;;
3072)
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
;;
smallrsa3072)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
;;
rsa3072)
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
;;
*)
AC_MSG_ERROR([Invalid choice of Single Precision length in bits [256, 2048, 3072]: $ENABLED_SP.])
break;;
esac
done
ENABLED_SP=no
if test "$ENABLED_RSA" = "yes" && test "$ENABLED_SP_RSA" = "yes"; then
ENABLED_SP=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_RSA"
fi
if test "$ENABLED_DH" = "yes" && test "$ENABLED_SP_DH" = "yes"; then
ENABLED_SP=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_DH"
fi
if test "$ENABLED_ECC" = "yes" && test "$ENABLED_SP_ECC" = "yes"; then
ENABLED_SP=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_ECC"
fi
AC_ARG_ENABLE([sp-asm],
[AS_HELP_STRING([--enable-sp-asm],[Enable Single Precision assembly implementation (default: disabled)])],
[ ENABLED_SP_ASM=$enableval ],
[ ENABLED_SP_ASM=no ],
)
if test "$ENABLED_SP_ASM" = "yes"; then
if test "$ENABLED_SP" = "no"; then
AC_MSG_ERROR([Must have SP enabled: --enable-sp])
fi
if test "$ENABLED_ASM" = "no"; then
AC_MSG_ERROR([Assembly code turned off])
fi
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ASM"
case $host_cpu in
*aarch64*)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM64_ASM"
ENABLED_SP_ARM64_ASM=yes
;;
*arm*)
if test $host_alias = "thumb"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM -mthumb -march=armv6"
ENABLED_SP_ARM_THUMB_ASM=yes
else
if test $host_alias = "cortex"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_CORTEX_ASM"
ENABLED_SP_ARM_CORTEX_ASM=yes
else
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
ENABLED_SP_ARM32_ASM=yes
fi
fi
;;
*x86_64*)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64_ASM"
ENABLED_SP_X86_64_ASM=yes
;;
*)
AC_MSG_ERROR([ASM not available for CPU. Supported CPUs: x86_64, aarch64, arm])
;;
esac
fi
AC_ARG_ENABLE([sp-math],
[AS_HELP_STRING([--enable-sp-math],[Enable Single Precision math implementation only (default: disabled)])],
[ ENABLED_SP_MATH=$enableval ],
[ ENABLED_SP_MATH=no ],
)
if test "$ENABLED_SP_MATH" = "yes"; then
if test "$ENABLED_SP" = "no"; then
AC_MSG_ERROR([Must have SP enabled: --enable-sp])
fi
if test "$ENABLED_ECCCUSTCURVES" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and custom curves])
fi
if test "$ENABLED_OPENSSLEXTRA" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and OpenSSL extra])
fi
if test "$ENABLED_DSA" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and DSA])
fi
if test "$ENABLED_SRP" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and SRP])
fi
if test "$ENABLED_SP_RSA" = "no" && test "$ENABLED_RSA" = "yes"; then
AC_MSG_ERROR([Cannot use Single Precision maths without RSA with RSA])
fi
if test "$ENABLED_SP_DH" = "no" && test "$ENABLED_DH" = "yes"; then
AC_MSG_ERROR([Cannot use Single Precision maths without DH with DH])
fi
fi
if test "$ENABLED_SP_MATH" = "yes"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_MATH"
fi
AM_CONDITIONAL([BUILD_SP], [test "x$ENABLED_SP" = "xyes"])
AM_CONDITIONAL([BUILD_SP_C], [test "x$ENABLED_SP" = "xyes" && test "x$ENABLED_SP_ASM" = "xno" ])
AM_CONDITIONAL([BUILD_SP_ARM64], [test "x$ENABLED_SP_ARM64_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_ARM32], [test "x$ENABLED_SP_ARM32_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_ARM_THUMB], [test "x$ENABLED_SP_ARM_THUMB_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_ARM_CORTEX], [test "x$ENABLED_SP_ARM_CORTEX_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_X86_64], [test "x$ENABLED_SP_X86_64_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_INT], [test "x$ENABLED_SP_MATH" = "xyes" ])
# set fastmath default
FASTMATH_DEFAULT=no
@@ -4016,6 +3832,212 @@ AC_ARG_WITH([intelqa],
)
AM_CONDITIONAL([BUILD_INTEL_QA], [test "x$ENABLED_INTEL_QA" = "xyes"])
# Single Precision maths implementation
AC_ARG_ENABLE([sp],
[AS_HELP_STRING([--enable-sp],[Enable Single Precision maths implementation (default: disabled)])],
[ ENABLED_SP=$enableval ],
[ ENABLED_SP=no ],
)
ENABLED_SP_RSA=no
ENABLED_SP_DH=no
ENABLED_SP_ECC=no
for v in `echo $ENABLED_SP | tr "," " "`
do
case $v in
small)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
ENABLED_SP_ECC=yes
;;
yes)
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
ENABLED_SP_ECC=yes
;;
no)
;;
smallec256 | smallp256 | small256)
ENABLED_SP_ECC=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
;;
ec256 | p256 | 256)
ENABLED_SP_ECC=yes
;;
small2048)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
;;
2048)
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
;;
smallrsa2048)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
;;
rsa2048)
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
;;
small3072)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
;;
3072)
ENABLED_SP_RSA=yes
ENABLED_SP_DH=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
;;
smallrsa3072)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
;;
rsa3072)
ENABLED_SP_RSA=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
;;
*)
AC_MSG_ERROR([Invalid choice of Single Precision length in bits [256, 2048, 3072]: $ENABLED_SP.])
break;;
esac
done
ENABLED_SP=no
if test "$ENABLED_RSA" = "yes" && test "$ENABLED_SP_RSA" = "yes"; then
ENABLED_SP=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_RSA"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_HAVE_SP_RSA"
fi
if test "$ENABLED_DH" = "yes" && test "$ENABLED_SP_DH" = "yes"; then
ENABLED_SP=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_DH"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_HAVE_SP_DH"
fi
if test "$ENABLED_ECC" = "yes" && test "$ENABLED_SP_ECC" = "yes"; then
ENABLED_SP=yes
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_ECC"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_HAVE_SP_ECC"
fi
AC_ARG_ENABLE([sp-asm],
[AS_HELP_STRING([--enable-sp-asm],[Enable Single Precision assembly implementation (default: disabled)])],
[ ENABLED_SP_ASM=$enableval ],
[ ENABLED_SP_ASM=no ],
)
if test "$ENABLED_SP_ASM" = "yes"; then
if test "$ENABLED_SP" = "no"; then
AC_MSG_ERROR([Must have SP enabled: --enable-sp])
fi
if test "$ENABLED_ASM" = "no"; then
AC_MSG_ERROR([Assembly code turned off])
fi
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ASM"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ASM"
case $host_cpu in
*aarch64*)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM64_ASM"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM64_ASM"
ENABLED_SP_ARM64_ASM=yes
;;
*arm*)
if test $host_alias = "thumb"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM -mthumb -march=armv6"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM"
ENABLED_SP_ARM_THUMB_ASM=yes
else
if test $host_alias = "cortex"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_CORTEX_ASM"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM_CORTEX_ASM"
ENABLED_SP_ARM_CORTEX_ASM=yes
else
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM32_ASM"
ENABLED_SP_ARM32_ASM=yes
fi
fi
;;
*x86_64*)
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64_ASM"
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_X86_64_ASM"
ENABLED_SP_X86_64_ASM=yes
;;
*)
AC_MSG_ERROR([ASM not available for CPU. Supported CPUs: x86_64, aarch64, arm])
;;
esac
fi
AC_ARG_ENABLE([sp-math],
[AS_HELP_STRING([--enable-sp-math],[Enable Single Precision math implementation only (default: disabled)])],
[ ENABLED_SP_MATH=$enableval ],
[ ENABLED_SP_MATH=no ],
)
if test "$ENABLED_SP_MATH" = "yes"; then
if test "$ENABLED_SP" = "no"; then
AC_MSG_ERROR([Must have SP enabled: --enable-sp])
fi
if test "$ENABLED_ECCCUSTCURVES" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and custom curves])
fi
if test "$ENABLED_OPENSSLEXTRA" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and OpenSSL extra])
fi
if test "$ENABLED_DSA" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and DSA])
fi
if test "$ENABLED_SRP" = "yes"; then
AC_MSG_ERROR([Cannot use single precision math and SRP])
fi
if test "$ENABLED_SP_RSA" = "no" && test "$ENABLED_RSA" = "yes"; then
AC_MSG_ERROR([Cannot use P256 single precision only math and RSA])
fi
if test "$ENABLED_SP_DH" = "no" && test "$ENABLED_DH" = "yes"; then
AC_MSG_ERROR([Cannot use P256 single precision only math and DH])
fi
fi
if test "$ENABLED_SP_MATH" = "yes"; then
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_MATH"
fi
AM_CONDITIONAL([BUILD_SP], [test "x$ENABLED_SP" = "xyes"])
AM_CONDITIONAL([BUILD_SP_C], [test "x$ENABLED_SP" = "xyes" && test "x$ENABLED_SP_ASM" = "xno" ])
AM_CONDITIONAL([BUILD_SP_ARM64], [test "x$ENABLED_SP_ARM64_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_ARM32], [test "x$ENABLED_SP_ARM32_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_ARM_THUMB], [test "x$ENABLED_SP_ARM_THUMB_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_ARM_CORTEX], [test "x$ENABLED_SP_ARM_CORTEX_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_X86_64], [test "x$ENABLED_SP_X86_64_ASM" = "xyes" ])
AM_CONDITIONAL([BUILD_SP_INT], [test "x$ENABLED_SP_MATH" = "xyes" ])
# Fast RSA using Intel IPP
ippdir="${srcdir}/IPP"
ipplib="lib" # if autoconf guesses 32bit system changes lib directory
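Note that the relocated block appends every -DWOLFSSL_SP_* and -DWOLFSSL_HAVE_SP_* define to AM_CCASFLAGS as well as AM_CFLAGS. The new .S sources are run through the C preprocessor, so they only see these options when the assembler command line carries them too; the new files gate whole sections on such macros (poly1305_asm.S further down uses HAVE_INTEL_AVX1/HAVE_INTEL_AVX2 guards the same way). A tiny illustrative guard, with a made-up body, assuming the SP assembly is organised along the same lines:

/* illustrative only: a guard like this in one of the new .S files has no
 * effect unless the define reaches the assembler via AM_CCASFLAGS */
#ifdef WOLFSSL_SP_X86_64_ASM
#ifndef WOLFSSL_SP_NO_2048
/* 2048-bit single-precision routines would be assembled here */
#endif
#endif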


@@ -145,6 +145,9 @@ if BUILD_ARMASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
else
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256_asm.S
endif
endif
endif
@@ -179,6 +182,7 @@ src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_c64.c
endif
if BUILD_SP_X86_64
src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_x86_64.c
src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_x86_64_asm.S
endif
if BUILD_SP_ARM32
src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_arm32.c
@@ -230,6 +234,9 @@ endif
if !BUILD_FIPS_V2
if BUILD_SHA512
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512_asm.S
endif
endif
endif
@@ -267,6 +274,9 @@ endif
if BUILD_POLY1305
src_libwolfssl_la_SOURCES += wolfcrypt/src/poly1305.c
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/poly1305_asm.S
endif
endif
if BUILD_RC4
@@ -293,6 +303,7 @@ endif
if !BUILD_FIPS_V2
if BUILD_AESNI
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
endif
endif
@@ -322,6 +333,9 @@ endif
if BUILD_CHACHA
src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha.c
if BUILD_INTELASM
src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha_asm.S
endif
if BUILD_POLY1305
src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c
endif


@@ -4141,7 +4141,9 @@ exit_rsa_verify:
}
FREE_ARRAY_DYNAMIC(enc, BENCH_MAX_PENDING, HEAP_HINT);
#if !defined(WOLFSSL_RSA_VERIFY_INLINE) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
FREE_ARRAY_DYNAMIC(out, BENCH_MAX_PENDING, HEAP_HINT);
#endif
FREE_VAR(message, HEAP_HINT);
}

Diffs for several large files are suppressed, among them the new wolfcrypt/src/aes_gcm_asm.S (8367 additions) and wolfcrypt/src/chacha_asm.S (1314 additions).


@@ -118,827 +118,55 @@ static word32 cpu_flags_set = 0;
#endif
#ifdef USE_INTEL_SPEEDUP
#ifdef __cplusplus
extern "C" {
#endif
#ifdef HAVE_INTEL_AVX1
/* Process one block (16 bytes) of data.
*
* ctx Poly1305 context.
* m One block of message data.
*/
static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
{
__asm__ __volatile__ (
"movq (%[ctx]), %%r15\n\t"
"movq 24(%[ctx]), %%r8\n\t"
"movq 32(%[ctx]), %%r9\n\t"
"movq 40(%[ctx]), %%r10\n\t"
"xorq %%rbx, %%rbx\n\t"
"movb %[nfin], %%bl\n\t"
"# h += m\n\t"
"movq (%[m]), %%r11\n\t"
"movq 8(%[m]), %%r12\n\t"
"addq %%r11, %%r8\n\t"
"adcq %%r12, %%r9\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"adcq %%rbx, %%r10\n\t"
"# r[1] * h[0] => rdx, rax ==> t2, t1\n\t"
"mulq %%r8\n\t"
"movq %%rax, %%r12\n\t"
"movq %%rdx, %%r13\n\t"
"# r[0] * h[1] => rdx, rax ++> t2, t1\n\t"
"movq %%r15, %%rax\n\t"
"mulq %%r9\n\t"
"addq %%rax, %%r12\n\t"
"movq %%r15, %%rax\n\t"
"adcq %%rdx, %%r13\n\t"
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
"mulq %%r8\n\t"
"movq %%rax, %%r11\n\t"
"movq %%rdx, %%r8\n\t"
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"
"# r[0] * h[2] +> t2\n\t"
"addq 352(%[ctx],%%r10,8), %%r13\n\t"
"movq %%rdx, %%r14\n\t"
"addq %%r8, %%r12\n\t"
"adcq %%rax, %%r13\n\t"
"# r[1] * h[2] +> t3\n\t"
"adcq 408(%[ctx],%%r10,8), %%r14\n\t"
"# r * h in r14, r13, r12, r11 \n\t"
"# h = (r * h) mod 2^130 - 5\n\t"
"movq %%r13, %%r10\n\t"
"andq $-4, %%r13\n\t"
"andq $3, %%r10\n\t"
"addq %%r13, %%r11\n\t"
"movq %%r13, %%r8\n\t"
"adcq %%r14, %%r12\n\t"
"adcq $0, %%r10\n\t"
"shrdq $2, %%r14, %%r8\n\t"
"shrq $2, %%r14\n\t"
"addq %%r11, %%r8\n\t"
"adcq %%r14, %%r12\n\t"
"movq %%r12, %%r9\n\t"
"adcq $0, %%r10\n\t"
"# h in r10, r9, r8 \n\t"
"# Store h to ctx\n\t"
"movq %%r8, 24(%[ctx])\n\t"
"movq %%r9, 32(%[ctx])\n\t"
"movq %%r10, 40(%[ctx])\n\t"
:
: [m] "r" (m), [ctx] "r" (ctx), [nfin] "m" (ctx->finished)
: "rax", "rdx", "r11", "r12", "r13", "r14", "r15", "rbx",
"r8", "r9", "r10", "memory"
);
}
extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);
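The inline body above (now provided by poly1305_asm.S) performs one Poly1305 block step: h = (h + m + pad*2^128) * r mod 2^130 - 5, with h held in three 64-bit words and the clamped r in two. As a reference for what the assembly computes, here is a hedged, portable C sketch of the same arithmetic; the names, the unsigned __int128 products and the memcpy little-endian loads are this sketch's own, not the wolfSSL context layout:

#include <string.h>

typedef unsigned long long u64;
typedef unsigned __int128 u128;

/* One Poly1305 block: h = (h + m [+ 2^128]) * r mod 2^130 - 5.
 * h[] is three 64-bit words (h[2] stays tiny), r[] is the clamped key r.
 * hibit is 1 for a full 16-byte block, 0 for the padded final block. */
static void poly1305_block_ref(u64 h[3], const u64 r[2],
                               const unsigned char m[16], int hibit)
{
    u64 m0, m1, t0, t1, t2, t3, g0, g1;
    u128 d;

    memcpy(&m0, m, 8);                        /* little-endian host assumed */
    memcpy(&m1, m + 8, 8);

    /* h += m (plus the 2^128 padding bit for full blocks) */
    d = (u128)h[0] + m0;                      h[0] = (u64)d;
    d = (u128)h[1] + m1 + (u64)(d >> 64);     h[1] = (u64)d;
    h[2] += (u64)(d >> 64) + (u64)hibit;

    /* 130-bit h times clamped r, schoolbook over 64-bit words */
    d  = (u128)h[0] * r[0];                          t0 = (u64)d; d >>= 64;
    d += (u128)h[0] * r[1] + (u128)h[1] * r[0];      t1 = (u64)d; d >>= 64;
    d += (u128)h[1] * r[1] + (u128)h[2] * r[0];      t2 = (u64)d; d >>= 64;
    d += (u128)h[2] * r[1];                          t3 = (u64)d;

    /* reduce: 2^130 = 5 mod p; (g1:g0) is 4*H where H = product >> 130,
     * so adding (g1:g0) and (g1:g0) >> 2 adds 5*H back into the low part */
    g0 = t2 & ~(u64)3;
    g1 = t3;
    h[2] = t2 & 3;
    d = (u128)t0 + g0;                           h[0] = (u64)d;
    d = (u128)t1 + g1 + (u64)(d >> 64);          h[1] = (u64)d;
    h[2] += (u64)(d >> 64);
    d = (u128)h[0] + ((g0 >> 2) | (g1 << 62));   h[0] = (u64)d;
    d = (u128)h[1] + (g1 >> 2) + (u64)(d >> 64); h[1] = (u64)d;
    h[2] += (u64)(d >> 64);
}

The removed assembly avoids the h[2]*r products entirely: because h[2] is only a few bits it looks the values up in the multiples-of-r tables that setkey precomputed (the 352(%[ctx],%%r10,8) and 408(...) accesses); the sketch simply multiplies.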
/* Process multiple blocks (n * 16 bytes) of data.
*
* ctx Poly1305 context.
* m Blocks of message data.
* bytes The number of bytes to process.
*/
POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
const unsigned char* m, size_t bytes)
{
__asm__ __volatile__ (
"movq (%[ctx]), %%r15\n\t"
"movq 24(%[ctx]), %%r8\n\t"
"movq 32(%[ctx]), %%r9\n\t"
"movq 40(%[ctx]), %%r10\n"
"L_avx_start:\n\t"
"# h += m\n\t"
"movq (%[m]), %%r11\n\t"
"movq 8(%[m]), %%r12\n\t"
"addq %%r11, %%r8\n\t"
"adcq %%r12, %%r9\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"adcq $0, %%r10\n\t"
"# r[1] * h[0] => rdx, rax ==> t2, t1\n\t"
"mulq %%r8\n\t"
"movq %%rax, %%r12\n\t"
"movq %%rdx, %%r13\n\t"
"# r[0] * h[1] => rdx, rax ++> t2, t1\n\t"
"movq %%r15, %%rax\n\t"
"mulq %%r9\n\t"
"addq %%rax, %%r12\n\t"
"movq %%r15, %%rax\n\t"
"adcq %%rdx, %%r13\n\t"
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
"mulq %%r8\n\t"
"movq %%rax, %%r11\n\t"
"movq %%rdx, %%r8\n\t"
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
"movq 8(%[ctx]), %%rax\n\t"
"mulq %%r9\n\t"
"# r[0] * h[2] +> t2\n\t"
"addq 360(%[ctx],%%r10,8), %%r13\n\t"
"movq %%rdx, %%r14\n\t"
"addq %%r8, %%r12\n\t"
"adcq %%rax, %%r13\n\t"
"# r[1] * h[2] +> t3\n\t"
"adcq 416(%[ctx],%%r10,8), %%r14\n\t"
"# r * h in r14, r13, r12, r11 \n\t"
"# h = (r * h) mod 2^130 - 5\n\t"
"movq %%r13, %%r10\n\t"
"andq $-4, %%r13\n\t"
"andq $3, %%r10\n\t"
"addq %%r13, %%r11\n\t"
"movq %%r13, %%r8\n\t"
"adcq %%r14, %%r12\n\t"
"adcq $0, %%r10\n\t"
"shrdq $2, %%r14, %%r8\n\t"
"shrq $2, %%r14\n\t"
"addq %%r11, %%r8\n\t"
"adcq %%r14, %%r12\n\t"
"movq %%r12, %%r9\n\t"
"adcq $0, %%r10\n\t"
"# h in r10, r9, r8 \n\t"
"# Next block from message\n\t"
"addq $16, %[m]\n\t"
"subq $16, %[bytes]\n\t"
"cmp $16, %[bytes]\n\t"
"jge L_avx_start\n\t"
"# Store h to ctx\n\t"
"movq %%r8, 24(%[ctx])\n\t"
"movq %%r9, 32(%[ctx])\n\t"
"movq %%r10, 40(%[ctx])\n\t"
: [m] "+r" (m), [bytes] "+r" (bytes)
: [ctx] "r" (ctx)
: "rax", "rdx", "r11", "r12", "r13", "r14", "r15",
"r8", "r9", "r10", "memory"
);
}
extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
size_t bytes);
/* Set the key to use when processing data.
* Initialize the context.
*
* ctx Poly1305 context.
* key The key data (16 bytes).
*/
static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
{
int i;
ctx->r[0] = *(word64*)(key + 0) & 0x0ffffffc0fffffffL;
ctx->r[1] = *(word64*)(key + 8) & 0x0ffffffc0ffffffcL;
for (i=0; i<7; i++) {
ctx->hm[i + 0] = ctx->r[0] * i;
ctx->hm[i + 7] = ctx->r[1] * i;
}
/* h (accumulator) = 0 */
ctx->h[0] = 0;
ctx->h[1] = 0;
ctx->h[2] = 0;
/* save pad for later */
ctx->pad[0] = *(word64*)(key + 16);
ctx->pad[1] = *(word64*)(key + 24);
ctx->leftover = 0;
ctx->finished = 1;
}
extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
/* Calculate the final result - authentication data.
* Zeros out the private data in the context.
*
* ctx Poly1305 context.
* mac Buffer to hold 16 bytes.
*/
static void poly1305_final_avx(Poly1305* ctx, byte* mac)
{
word64 h0, h1, h2;
/* process the remaining block */
if (ctx->leftover) {
size_t i = ctx->leftover;
ctx->buffer[i] = 1;
for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
ctx->buffer[i] = 0;
ctx->finished = 0;
poly1305_block_avx(ctx, ctx->buffer);
}
h0 = ctx->h[0];
h1 = ctx->h[1];
h2 = ctx->h[2];
/* h %= p */
/* h = (h + pad) */
__asm__ __volatile__ (
"# mod 2^130 - 5\n\t"
"movq %[h2], %%r13\n\t"
"andq $0x3, %[h2]\n\t"
"shrq $0x2, %%r13\n\t"
"leaq (%%r13, %%r13, 4), %%r13\n\t"
"add %%r13, %[h0]\n\t"
"adc $0, %[h1]\n\t"
"adc $0, %[h2]\n\t"
"# Fixup when between (1 << 130) - 1 and (1 << 130) - 5\n\t"
"movq %[h0], %%r13\n\t"
"movq %[h1], %%r14\n\t"
"movq %[h2], %%r15\n\t"
"addq $5, %%r13\n\t"
"adcq $0, %%r14\n\t"
"adcq $0, %%r15\n\t"
"movq %%r15, %%r12\n\t"
"andq $3, %%r15\n\t"
"cmpq $4, %%r12\n\t"
"cmove %%r13, %[h0]\n\t"
"cmove %%r14, %[h1]\n\t"
"cmove %%r15, %[h2]\n\t"
"# h += pad\n\t"
"add %[p0], %[h0]\n\t"
"adc %[p1], %[h1]\n\t"
"movq %[h0], (%[m])\n\t"
"movq %[h1], 8(%[m])\n\t"
: [h0] "+r" (h0), [h1] "+r" (h1), [h2] "+r" (h2),
[p0] "+r" (ctx->pad[0]), [p1] "+r" (ctx->pad[1])
: [m] "r" (mac)
: "memory", "r15", "r14", "r13", "r12"
);
/* zero out the state */
ctx->h[0] = 0;
ctx->h[1] = 0;
ctx->h[2] = 0;
ctx->r[0] = 0;
ctx->r[1] = 0;
ctx->pad[0] = 0;
ctx->pad[1] = 0;
}
extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
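The removed final routine above covers the tail of the MAC: fully reduce h modulo 2^130 - 5 (the conditional "+5" fixup done with cmove) and then add the 128-bit pad from the key, keeping only the low 128 bits. A hedged C sketch of that tail with generic names; it uses a plain branch where the assembly uses cmove, so it is illustrative rather than constant-time:

#include <string.h>

typedef unsigned long long u64;
typedef unsigned __int128 u128;

/* Final step: reduce h (three 64-bit words, h[2] small) mod 2^130 - 5,
 * add the 128-bit pad s from the key, emit the low 128 bits as the tag. */
static void poly1305_tag_ref(unsigned char mac[16], u64 h[3], const u64 s[2])
{
    u64 g0, g1, g2;
    u128 d;

    /* fold any bits at 2^130 and above back in, multiplied by 5 */
    d = (u128)h[0] + (h[2] >> 2) * 5;    h[0] = (u64)d;
    d = (u128)h[1] + (u64)(d >> 64);     h[1] = (u64)d;
    h[2] = (h[2] & 3) + (u64)(d >> 64);

    /* if h >= 2^130 - 5, use h + 5 - 2^130 instead; only the low 128 bits
     * of h matter for the tag, so h[2] need not be updated */
    d = (u128)h[0] + 5;                  g0 = (u64)d;
    d = (u128)h[1] + (u64)(d >> 64);     g1 = (u64)d;
    g2 = h[2] + (u64)(d >> 64);
    if (g2 >= 4) {
        h[0] = g0;
        h[1] = g1;
    }

    /* tag = (h + s) mod 2^128 */
    d = (u128)h[0] + s[0];               h[0] = (u64)d;
    h[1] = h[1] + s[1] + (u64)(d >> 64);

    memcpy(mac, &h[0], 8);               /* little-endian host assumed */
    memcpy(mac + 8, &h[1], 8);
}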
#endif
#ifdef HAVE_INTEL_AVX2
#if defined(_MSC_VER)
#define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
#define POLY1305_NOINLINE __attribute__((noinline))
#else
#define POLY1305_NOINLINE
#endif
/* Load H into five 256-bit registers.
*
* h is the memory location of the data - 26 of 32 bits.
* h0-h4 the 4 H values with 26 bits stored in 64 for multiply.
*/
#define LOAD_H(h, h0, h1, h2, h3, h4) \
"vmovdqu ("#h"), "#h0"\n\t" \
"vmovdqu 32("#h"), "#h1"\n\t" \
"vmovdqu 64("#h"), "#h2"\n\t" \
"vmovdqu 96("#h"), "#h3"\n\t" \
"vmovdqu 128("#h"), "#h4"\n\t"
/* Store H, five 256-bit registers, packed.
*
* h is the memory location of the data - 26 bits in 32.
* h0-h4 the 4 H values with 26 bits stored in 64.
* x4 is the xmm register of h4.
*/
#define STORE_H(h, h0, h1, h2, h3, h4, x4) \
"vmovdqu "#h0", ("#h")\n\t" \
"vmovdqu "#h1", 32("#h")\n\t" \
"vmovdqu "#h2", 64("#h")\n\t" \
"vmovdqu "#h3", 96("#h")\n\t" \
"vmovdqu "#h4", 128("#h")\n\t"
/* Load four powers of r into position to be multiplied by the 4 H values.
*
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
* t0-t3 are temporary registers.
*/
#define LOAD_Rx4(r0, r1, r2, r3, r4, \
t0, t1, t2, t3) \
"vmovdqu 224(%[ctx]), "#r3"\n\t" \
"vmovdqu 256(%[ctx]), "#r2"\n\t" \
"vmovdqu 288(%[ctx]), "#r1"\n\t" \
"vmovdqu 320(%[ctx]), "#r0"\n\t" \
"vpermq $0xd8, "#r0", "#r0"\n\t" \
"vpermq $0xd8, "#r1", "#r1"\n\t" \
"vpermq $0xd8, "#r2", "#r2"\n\t" \
"vpermq $0xd8, "#r3", "#r3"\n\t" \
"vpunpcklqdq "#r1", "#r0", "#t0"\n\t" \
"vpunpckhqdq "#r1", "#r0", "#t1"\n\t" \
"vpunpcklqdq "#r3", "#r2", "#t2"\n\t" \
"vpunpckhqdq "#r3", "#r2", "#t3"\n\t" \
"vperm2i128 $0x20, "#t2", "#t0", "#r0"\n\t" \
"vperm2i128 $0x31, "#t2", "#t0", "#r2"\n\t" \
"vperm2i128 $0x20, "#t3", "#t1", "#r4"\n\t" \
"vpsrlq $32, "#r0", "#r1"\n\t" \
"vpsrlq $32, "#r2", "#r3"\n\t"
/* Load the r^4 value into position to be multiplied by all 4 H values.
*
* r4 holds r^4 as five 26 bits each in 32.
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
* t0-t1 are temporary registers.
*/
#define LOAD_R4(r4, r40, r41, r42, r43, r44, \
t0, t1) \
"vmovdqu "#r4", "#t0"\n\t" \
"vpermq $0x0, "#t0", "#r40"\n\t" \
"vpsrlq $32, "#t0", "#t1"\n\t" \
"vpermq $0x55, "#t0", "#r42"\n\t" \
"vpermq $0xaa, "#t0", "#r44"\n\t" \
"vpermq $0x0, "#t1", "#r41"\n\t" \
"vpermq $0x55, "#t1", "#r43"\n\t"
/* Multiply the top 4 26-bit values in 64 bits of each H by 5 for reduction in
* multiply.
*
* s1-s4 are each 64 bit value in r1-r4 multiplied by 5.
* r1-r4 are the top 4
*/
#define MUL5(s1, s2, s3, s4, r1, r2, r3, r4) \
"vpslld $2, "#r1", "#s1"\n\t" \
"vpslld $2, "#r2", "#s2"\n\t" \
"vpslld $2, "#r3", "#s3"\n\t" \
"vpslld $2, "#r4", "#s4"\n\t" \
"vpaddq "#s1", "#r1", "#s1"\n\t" \
"vpaddq "#s2", "#r2", "#s2"\n\t" \
"vpaddq "#s3", "#r3", "#s3"\n\t" \
"vpaddq "#s4", "#r4", "#s4"\n\t"
/* Add the 4 H values together.
* Each 64 bits in a register is 26 bits of one of the H values.
*
* h0-h4 contains the 4 H values.
* t1-t4 are temporary registers.
*/
#define FINALIZE_H(h0, h1, h2, h3, h4, \
t0, t1, t2, t3, t4) \
"vpsrldq $8, "#h0", "#t0"\n\t" \
"vpsrldq $8, "#h1", "#t1"\n\t" \
"vpsrldq $8, "#h2", "#t2"\n\t" \
"vpsrldq $8, "#h3", "#t3"\n\t" \
"vpsrldq $8, "#h4", "#t4"\n\t" \
"vpaddq "#h0", "#t0", "#h0"\n\t" \
"vpaddq "#h1", "#t1", "#h1"\n\t" \
"vpaddq "#h2", "#t2", "#h2"\n\t" \
"vpaddq "#h3", "#t3", "#h3"\n\t" \
"vpaddq "#h4", "#t4", "#h4"\n\t" \
"vpermq $0x02, "#h0", "#t0"\n\t" \
"vpermq $0x02, "#h1", "#t1"\n\t" \
"vpermq $0x02, "#h2", "#t2"\n\t" \
"vpermq $0x02, "#h3", "#t3"\n\t" \
"vpermq $0x02, "#h4", "#t4"\n\t" \
"vpaddq "#h0", "#t0", "#h0"\n\t" \
"vpaddq "#h1", "#t1", "#h1"\n\t" \
"vpaddq "#h2", "#t2", "#h2"\n\t" \
"vpaddq "#h3", "#t3", "#h3"\n\t" \
"vpaddq "#h4", "#t4", "#h4"\n\t"
/* Move 32 bits from each xmm register to a 32 bit register.
*
* x0-x4 are the xmm version of the ymm registers used.
* t0-t4 are the 32-bit registers to store data in.
*/
#define MOVE_TO_32(x0, x1, x2, x3, x4, \
t0, t1, t2, t3, t4) \
"vmovd "#x0", "#t0"\n\t" \
"vmovd "#x1", "#t1"\n\t" \
"vmovd "#x2", "#t2"\n\t" \
"vmovd "#x3", "#t3"\n\t" \
"vmovd "#x4", "#t4"\n\t"
/* Multiply using AVX2 instructions.
* Each register contains up to 32 bits of data in 64 bits.
* This is a 4 way parallel multiply.
*
* h0-h4 contain 4 H values with the 32 bits of each per register.
* r0-r4 contain the 4 powers of r.
* s1-s4 contain r1-r4 times 5.
* t0-t4 and v0-v3 are temporary registers.
*/
#define MUL_AVX2(h0, h1, h2, h3, h4, \
r0, r1, r2, r3, r4, \
s1, s2, s3, s4, \
t0, t1, t2, t3, t4, \
v0, v1, v2, v3) \
"vpmuludq "#s1", "#h4", "#t0"\n\t" \
"vpmuludq "#s2", "#h3", "#v0"\n\t" \
"vpmuludq "#s2", "#h4", "#t1"\n\t" \
"vpmuludq "#s3", "#h3", "#v1"\n\t" \
"vpmuludq "#s3", "#h4", "#t2"\n\t" \
"vpaddq "#t0", "#v0", "#t0"\n\t" \
"vpmuludq "#s3", "#h2", "#v2"\n\t" \
"vpmuludq "#s4", "#h4", "#t3"\n\t" \
"vpaddq "#t1", "#v1", "#t1"\n\t" \
"vpmuludq "#s4", "#h1", "#v3"\n\t" \
"vpmuludq "#s4", "#h2", "#v0"\n\t" \
"vpaddq "#t0", "#v2", "#t0"\n\t" \
"vpmuludq "#s4", "#h3", "#v1"\n\t" \
"vpmuludq "#r0", "#h3", "#v2"\n\t" \
"vpaddq "#t0", "#v3", "#t0"\n\t" \
"vpmuludq "#r0", "#h4", "#t4"\n\t" \
"vpaddq "#t1", "#v0", "#t1"\n\t" \
"vpmuludq "#r0", "#h0", "#v3"\n\t" \
"vpaddq "#t2", "#v1", "#t2"\n\t" \
"vpmuludq "#r0", "#h1", "#v0"\n\t" \
"vpaddq "#t3", "#v2", "#t3"\n\t" \
"vpmuludq "#r0", "#h2", "#v1"\n\t" \
"vpmuludq "#r1", "#h2", "#v2"\n\t" \
"vpaddq "#t0", "#v3", "#t0"\n\t" \
"vpmuludq "#r1", "#h3", "#v3"\n\t" \
"vpaddq "#t1", "#v0", "#t1"\n\t" \
"vpmuludq "#r1", "#h0", "#v0"\n\t" \
"vpaddq "#t2", "#v1", "#t2"\n\t" \
"vpmuludq "#r1", "#h1", "#v1"\n\t" \
"vpaddq "#t3", "#v2", "#t3"\n\t" \
"vpmuludq "#r2", "#h1", "#v2"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t" \
"vpmuludq "#r2", "#h2", "#v3"\n\t" \
"vpaddq "#t1", "#v0", "#t1"\n\t" \
"vpmuludq "#r2", "#h0", "#v0"\n\t" \
"vpaddq "#t2", "#v1", "#t2"\n\t" \
"vpmuludq "#r3", "#h0", "#v1"\n\t" \
"vpaddq "#t3", "#v2", "#t3"\n\t" \
"vpmuludq "#r3", "#h1", "#v2"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t" \
"vpmuludq "#r4", "#h0", "#v3"\n\t" \
"vpaddq "#t2", "#v0", "#t2"\n\t" \
"vpaddq "#t3", "#v1", "#t3"\n\t" \
"vpaddq "#t4", "#v2", "#t4"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t"
/* Load the 4 blocks of the message.
*
* m the address of the message to load.
* m0-m4 is the loaded message with 32 bits in 64. Loaded so data is parallel.
* hi is the high bits of the 4 m (1 << 128 as not final block).
* z is zero.
*/
#define LOAD_M(m, m0, m1, m2, m3, m4, hi, z) \
"vmovdqu (%[m]), "#m0"\n\t" \
"vmovdqu 32(%[m]), "#m1"\n\t" \
"vperm2i128 $0x20, "#m1", "#m0", "#m2"\n\t" \
"vperm2i128 $0x31, "#m1", "#m0", "#m0"\n\t" \
"vpunpckldq "#m0", "#m2", "#m1"\n\t" \
"vpunpckhdq "#m0", "#m2", "#m3"\n\t" \
"vpunpckldq "#z", "#m1", "#m0"\n\t" \
"vpunpckhdq "#z", "#m1", "#m1"\n\t" \
"vpunpckldq "#z", "#m3", "#m2"\n\t" \
"vpunpckhdq "#z", "#m3", "#m3"\n\t" \
"vmovdqu "#hi", "#m4"\n\t" \
"vpsllq $6, "#m1", "#m1"\n\t" \
"vpsllq $12, "#m2", "#m2"\n\t" \
"vpsllq $18, "#m3", "#m3"\n\t"
/* Multiply using AVX2 instructions - adding with message.
* Each register contains up to 32 bits of data in 64 bits.
* This is a 4 way parallel multiply.
* The message data is loaded first and the multiplication adds into it.
*
* h0-h4 contain 4 H values with the 32 bits of each per register.
* r0-r4 contain the 4 powers of r.
* s1-s4 contain r1-r4 times 5.
* t0-t4 and v0-v3 are temporary registers.
* hi is the high bits of the 4 m (1 << 128 as not final block).
* z is zero.
*/
#define MUL_ADD_AVX2(h0, h1, h2, h3, h4, \
r0, r1, r2, r3, r4, \
s1, s2, s3, s4, \
t0, t1, t2, t3, t4, \
v0, v1, v2, v3, \
hi, z) \
"vmovdqu (%[m]), "#t0"\n\t" \
"vmovdqu 32(%[m]), "#t1"\n\t" \
"vperm2i128 $0x20, "#t1", "#t0", "#t2"\n\t" \
"vperm2i128 $0x31, "#t1", "#t0", "#t0"\n\t" \
"vpunpckldq "#t0", "#t2", "#t1"\n\t" \
"vpunpckhdq "#t0", "#t2", "#t3"\n\t" \
"vpunpckldq "#z", "#t1", "#t0"\n\t" \
"vpunpckhdq "#z", "#t1", "#t1"\n\t" \
"vpunpckldq "#z", "#t3", "#t2"\n\t" \
"vpunpckhdq "#z", "#t3", "#t3"\n\t" \
"vmovdqu "#hi", "#t4"\n\t" \
"vpsllq $6, "#t1", "#t1"\n\t" \
"vpsllq $12, "#t2", "#t2"\n\t" \
"vpsllq $18, "#t3", "#t3"\n\t" \
"vpmuludq "#s1", "#h4", "#v0"\n\t" \
"vpaddq "#t0", "#v0", "#t0"\n\t" \
"vpmuludq "#s2", "#h3", "#v0"\n\t" \
"vpmuludq "#s2", "#h4", "#v1"\n\t" \
"vpaddq "#t1", "#v1", "#t1"\n\t" \
"vpmuludq "#s3", "#h3", "#v1"\n\t" \
"vpmuludq "#s3", "#h4", "#v2"\n\t" \
"vpaddq "#t2", "#v2", "#t2"\n\t" \
"vpaddq "#t0", "#v0", "#t0"\n\t" \
"vpmuludq "#s3", "#h2", "#v2"\n\t" \
"vpmuludq "#s4", "#h4", "#v3"\n\t" \
"vpaddq "#t3", "#v3", "#t3"\n\t" \
"vpaddq "#t1", "#v1", "#t1"\n\t" \
"vpmuludq "#s4", "#h1", "#v3"\n\t" \
"vpmuludq "#s4", "#h2", "#v0"\n\t" \
"vpaddq "#t0", "#v2", "#t0"\n\t" \
"vpmuludq "#s4", "#h3", "#v1"\n\t" \
"vpmuludq "#r0", "#h3", "#v2"\n\t" \
"vpaddq "#t0", "#v3", "#t0"\n\t" \
"vpmuludq "#r0", "#h4", "#v3"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t" \
"vpaddq "#t1", "#v0", "#t1"\n\t" \
"vpmuludq "#r0", "#h0", "#v3"\n\t" \
"vpaddq "#t2", "#v1", "#t2"\n\t" \
"vpmuludq "#r0", "#h1", "#v0"\n\t" \
"vpaddq "#t3", "#v2", "#t3"\n\t" \
"vpmuludq "#r0", "#h2", "#v1"\n\t" \
"vpmuludq "#r1", "#h2", "#v2"\n\t" \
"vpaddq "#t0", "#v3", "#t0"\n\t" \
"vpmuludq "#r1", "#h3", "#v3"\n\t" \
"vpaddq "#t1", "#v0", "#t1"\n\t" \
"vpmuludq "#r1", "#h0", "#v0"\n\t" \
"vpaddq "#t2", "#v1", "#t2"\n\t" \
"vpmuludq "#r1", "#h1", "#v1"\n\t" \
"vpaddq "#t3", "#v2", "#t3"\n\t" \
"vpmuludq "#r2", "#h1", "#v2"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t" \
"vpmuludq "#r2", "#h2", "#v3"\n\t" \
"vpaddq "#t1", "#v0", "#t1"\n\t" \
"vpmuludq "#r2", "#h0", "#v0"\n\t" \
"vpaddq "#t2", "#v1", "#t2"\n\t" \
"vpmuludq "#r3", "#h0", "#v1"\n\t" \
"vpaddq "#t3", "#v2", "#t3"\n\t" \
"vpmuludq "#r3", "#h1", "#v2"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t" \
"vpmuludq "#r4", "#h0", "#v3"\n\t" \
"vpaddq "#t2", "#v0", "#t2"\n\t" \
"vpaddq "#t3", "#v1", "#t3"\n\t" \
"vpaddq "#t4", "#v2", "#t4"\n\t" \
"vpaddq "#t4", "#v3", "#t4"\n\t"
/* Reduce the 64 bits of data to 26 bits.
*
* h0-h4 contain the reduced H values.
* m0-m4 contain the 4 H values to reduce.
* t0-t2 are temporaries.
* mask contains the 26-bit mask for each 64 bit value in the 256 bit register.
*/
#define REDUCE(h0, h1, h2, h3, h4, \
m0, m1, m2, m3, m4, \
t0, t1, t2, mask) \
"vpsrlq $26, "#m0", "#t0"\n\t" \
"vpsrlq $26, "#m3", "#t1"\n\t" \
"vpand "#mask", "#m0", "#m0"\n\t" \
"vpand "#mask", "#m3", "#m3"\n\t" \
"vpaddq "#m1", "#t0", "#m1"\n\t" \
"vpaddq "#m4", "#t1", "#m4"\n\t" \
\
"vpsrlq $26, "#m1", "#t0"\n\t" \
"vpsrlq $26, "#m4", "#t1"\n\t" \
"vpand "#mask", "#m1", "#h1"\n\t" \
"vpand "#mask", "#m4", "#h4"\n\t" \
"vpaddq "#m2", "#t0", "#m2"\n\t" \
"vpslld $2, "#t1", "#t2"\n\t" \
"vpaddd "#t2", "#t1", "#t2"\n\t" \
\
"vpsrlq $26, "#m2", "#t0"\n\t" \
"vpaddq "#m0", "#t2", "#m0"\n\t" \
"vpsrlq $26, "#m0", "#t1"\n\t" \
"vpand "#mask", "#m2", "#h2"\n\t" \
"vpand "#mask", "#m0", "#h0"\n\t" \
"vpaddq "#m3", "#t0", "#m3"\n\t" \
"vpaddq "#h1", "#t1", "#h1"\n\t" \
\
"vpsrlq $26, "#m3", "#t0"\n\t" \
"vpand "#mask", "#m3", "#h3"\n\t" \
"vpaddq "#h4", "#t0", "#h4"\n\t" \
/* Process multiple blocks (n * 16 bytes) of data.
*
* ctx Poly1305 context.
* m Blocks of message data.
* bytes The number of bytes to process.
*/
POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
const unsigned char* m, size_t bytes)
{
ALIGN256 word64 r4[5][4];
ALIGN256 word64 s[4][4];
register word32 t0 asm("r8") = 0;
register word32 t1 asm("r9") = 0;
register word32 t2 asm("r10") = 0;
register word32 t3 asm("r11") = 0;
register word32 t4 asm("r12") = 0;
static const word64 mask[4] = { 0x0000000003ffffff, 0x0000000003ffffff,
0x0000000003ffffff, 0x0000000003ffffff };
static const word64 hibit[4] = { 0x1000000, 0x1000000,
0x1000000, 0x1000000 };
__asm__ __volatile__ (
"vpxor %%ymm15, %%ymm15, %%ymm15\n\t"
"cmpb $1, %[started]\n\t"
"je L_begin\n\t"
"cmpb $1, %[fin]\n\t"
"je L_begin\n\t"
"# Load the message data\n\t"
LOAD_M(m, %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %[hibit], %%ymm15)
"vmovdqu %[mask], %%ymm14\n\t"
"# Reduce, in place, the message data\n\t"
REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm10, %%ymm11, %%ymm12, %%ymm14)
"addq $64, %[m]\n\t"
"subq $64, %[bytes]\n\t"
"jz L_store\n\t"
"jmp L_load_r4\n\t"
"\n"
"L_begin:\n\t"
"# Load the H values.\n\t"
LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4)
"# Check if there is a power of r to load - otherwise use r^4.\n\t"
"cmpb $0, %[fin]\n\t"
"je L_load_r4\n\t"
"\n\t"
"# Load the 4 powers of r - r^4, r^3, r^2, r^1.\n\t"
LOAD_Rx4(%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm10, %%ymm11, %%ymm12, %%ymm13)
"jmp L_mul_5\n\t"
"\n"
"L_load_r4:\n\t"
"# Load r^4 into all four positions.\n\t"
LOAD_R4(320(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm13, %%ymm14)
"\n"
"L_mul_5:\n\t"
"# Multiply top 4 26-bit values of all four H by 5\n\t"
MUL5(%%ymm10, %%ymm11, %%ymm12, %%ymm13, %%ymm6, %%ymm7, %%ymm8, %%ymm9)
"# Store powers of r and multiple of 5 for use in multiply.\n\t"
"vmovdqa %%ymm10, (%[s])\n\t"
"vmovdqa %%ymm11, 32(%[s])\n\t"
"vmovdqa %%ymm12, 64(%[s])\n\t"
"vmovdqa %%ymm13, 96(%[s])\n\t"
"vmovdqa %%ymm5 , (%[r4])\n\t"
"vmovdqa %%ymm6 , 32(%[r4])\n\t"
"vmovdqa %%ymm7 , 64(%[r4])\n\t"
"vmovdqa %%ymm8 , 96(%[r4])\n\t"
"vmovdqa %%ymm9 , 128(%[r4])\n\t"
"vmovdqu %[mask], %%ymm14\n\t"
"\n"
"# If not finished then loop over data\n\t"
"cmpb $0x1, %[fin]\n\t"
"jne L_start\n\t"
"# Do last multiply, reduce, add the four H together and move to\n\t"
"# 32-bit registers\n\t"
MUL_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
(%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
(%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm10, %%ymm11, %%ymm12, %%ymm13)
REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm10, %%ymm11, %%ymm12, %%ymm14)
FINALIZE_H(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9)
MOVE_TO_32(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4,
%[t0], %[t1], %[t2], %[t3], %[t4])
"jmp L_end\n\t"
"\n"
"L_start:\n\t"
MUL_ADD_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
(%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
(%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm10, %%ymm11, %%ymm12, %%ymm13,
%[hibit], %%ymm15)
REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
%%ymm10, %%ymm11, %%ymm12, %%ymm14)
"addq $64, %[m]\n\t"
"subq $64, %[bytes]\n\t"
"jnz L_start\n\t"
"\n"
"L_store:\n\t"
"# Store four H values - state\n\t"
STORE_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %%xmm4)
"\n"
"L_end:\n\t"
: [m] "+r" (m), [bytes] "+r" (bytes),
[t0] "+r" (t0), [t1] "+r" (t1), [t2] "+r" (t2),
[t3] "+r" (t3), [t4] "+r" (t4)
: [ctx] "r" (ctx), [h] "r" (ctx->hh),
[r4] "r" (r4), [s] "r" (s),
[fin] "m" (ctx->finished), [started] "m" (ctx->started),
[mask] "m" (mask), [hibit] "m" (hibit)
: "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
"memory"
);
if (ctx->finished)
{
word64 h0, h1, h2, c;
/* Convert to 64-bit form. */
h0 = (((word64)(t1 & 0x3FFFF)) << 26) + t0;
h1 = (((word64)(t3 & 0x3FF)) << 34) +
(((word64) t2 ) << 8) + (t1 >> 18);
h2 = (((word64) t4 ) << 16) + (t3 >> 10);
/* Perform modular reduction. */
c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff;
h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff;
h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
h1 += c;
/* Convert from 42/44/44 to 2/64/64 bits used and store result. */
ctx->h[0] = h0 | (h1 << 44);
ctx->h[1] = (h1 >> 20) | (h2 << 24);
ctx->h[2] = h2 >> 40;
}
ctx->started = 1;
}
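The REDUCE macro above, used after both MUL_ADD_AVX2 in the main loop and MUL_AVX2 in the final pass, is a lazy carry chain over the five 26-bit limbs of all four H lanes at once; the carry out of limb 4 has weight 2^130, which reduces to 5, so it is folded back into limb 0 times 5. Read down any single 64-bit lane and it is the familiar scalar chain; a sketch with hypothetical names:

typedef unsigned long long u64;

/* One lane of REDUCE: partial carry propagation over five 26-bit limbs. */
static void reduce_26(u64 h[5])
{
    const u64 M = 0x3ffffff;
    u64 c;

    c = h[0] >> 26; h[0] &= M; h[1] += c;
    c = h[3] >> 26; h[3] &= M; h[4] += c;
    c = h[1] >> 26; h[1] &= M; h[2] += c;
    c = h[4] >> 26; h[4] &= M; h[0] += c * 5;  /* 2^130 reduces to 5 */
    c = h[2] >> 26; h[2] &= M; h[3] += c;
    c = h[0] >> 26; h[0] &= M; h[1] += c;
    c = h[3] >> 26; h[3] &= M; h[4] += c;
    /* afterwards every limb is below 2^26, except a small excess
     * possibly left in h[1] and h[4] */
}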
/* Multiply two 130-bit numbers in 64-bit registers and reduce.
* 44 + 44 + 42 = 130 bits
*
* r0-r2 are the first operand and the result.
* a0-a2 are the second operand.
*/
#define MUL_64(r0, r1, r2, a0, a1, a2) \
s1 = a1 * (5 << 2); \
s2 = a2 * (5 << 2); \
MUL(d0, r0, a0); MUL(d, r1, s2); ADD(d0, d); MUL(d, r2, s1); ADD(d0, d); \
MUL(d1, r0, a1); MUL(d, r1, a0); ADD(d1, d); MUL(d, r2, s2); ADD(d1, d); \
MUL(d2, r0, a2); MUL(d, r1, a1); ADD(d2, d); MUL(d, r2, a0); ADD(d2, d); \
\
c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff; \
ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff; \
ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff; \
r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \
r1 += c
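MUL_64 above works on the 44/44/42-bit limb form and leans on the MUL/ADD/SHR/LO/ADDLO helpers (presumably defined in the 64-bit POLY130564 section of this file, outside this hunk). Expanded with unsigned __int128 the same multiply-and-reduce reads as the sketch below; the "* (5 << 2)" terms exist because cross products of the upper limbs land at weight 2^132, and 2^132 reduces to 20 mod 2^130 - 5:

typedef unsigned long long u64;
typedef unsigned __int128 u128;

/* r = r * a mod 2^130 - 5, both operands in 44/44/42-bit limbs
 * (a plain-C rendering of MUL_64; names are this sketch's own) */
static void mul_130_ref(u64 r[3], const u64 a[3])
{
    u64 c, s1, s2;
    u128 d0, d1, d2;

    s1 = a[1] * (5 << 2);    /* a[1], a[2] pre-scaled by 20 = 2^132 mod p */
    s2 = a[2] * (5 << 2);

    d0 = (u128)r[0]*a[0] + (u128)r[1]*s2   + (u128)r[2]*s1;
    d1 = (u128)r[0]*a[1] + (u128)r[1]*a[0] + (u128)r[2]*s2;
    d2 = (u128)r[0]*a[2] + (u128)r[1]*a[1] + (u128)r[2]*a[0];

    c = (u64)(d0 >> 44); r[0] = (u64)d0 & 0xfffffffffff;
    d1 += c; c = (u64)(d1 >> 44); r[1] = (u64)d1 & 0xfffffffffff;
    d2 += c; c = (u64)(d2 >> 42); r[2] = (u64)d2 & 0x3ffffffffff;
    r[0] += c * 5; c = r[0] >> 44; r[0] &= 0xfffffffffff;
    r[1] += c;
}

SQR_64 just below is the same computation with a = r, sharing the symmetric cross products.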
#define SQR_64(r0, r1, r2) \
s2 = r2 * (5 << 2); \
MUL(d0, r1, s2); ADD(d0, d0); MUL(d, r0, r0); ADD(d0, d); \
MUL(d1, r0, r1); ADD(d1, d1); MUL(d, r2, s2); ADD(d1, d); \
MUL(d2, r0, r2); ADD(d2, d2); MUL(d, r1, r1); ADD(d2, d); \
\
c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff; \
ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff; \
ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff; \
r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \
r1 += c
/* Store the 130-bit number in 64-bit registers as 26-bit values in 32 bits.
*
* r0-r2 contains the 130-bit number in 64-bit registers.
* r is the address of where to store the 26 of 32 bits result.
*/
#define CONV_64_TO_32(r0, r1, r2, r) \
r[0] = (word32)( r0 ) & 0x3ffffff; \
r[1] = (word32)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; \
r[2] = (word32)( r1 >> 8 ) & 0x3ffffff; \
r[3] = (word32)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; \
r[4] = (word32)( r2 >> 16 )
extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
size_t bytes);
/* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
*
* ctx Poly1305 context.
*/
static void poly1305_calc_powers(Poly1305* ctx)
{
word64 r0, r1, r2, t0, t1, c;
word64 r20, r21, r22;
word64 r30, r31, r32;
word64 r40, r41, r42;
word64 s1, s2;
word128 d0, d1, d2, d;
t0 = ctx->r[0];
t1 = ctx->r[1];
r0 = ( t0 ) & 0xfffffffffff;
r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
r2 = ((t1 >> 24) ) & 0x00fffffffff;
/* Store r^1 */
CONV_64_TO_32(r0, r1, r2, ctx->r1);
/* Calc and store r^2 */
r20 = r0; r21 = r1; r22 = r2;
SQR_64(r20, r21, r22);
CONV_64_TO_32(r20, r21, r22, ctx->r2);
/* Calc and store r^3 */
r30 = r20; r31 = r21; r32 = r22;
MUL_64(r30, r31, r32, r0, r1, r2);
CONV_64_TO_32(r30, r31, r32, ctx->r3);
/* Calc and store r^4 */
r40 = r20; r41 = r21; r42 = r22;
SQR_64(r40, r41, r42);
CONV_64_TO_32(r40, r41, r42, ctx->r4);
}
extern void poly1305_calc_powers_avx2(Poly1305* ctx);
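The reason the context stores r, r^2, r^3 and r^4 is the 4-way interleaving of the AVX2 path: the four accumulator lanes are each advanced by r^4 per 64-byte chunk, and only at the end are they weighted by r^4, r^3, r^2 and r and summed, which regroups to exactly the serial Horner evaluation (see the comments in poly1305_blocks_avx2 above). A small self-contained check of that identity with toy numbers, unrelated to the real limb arithmetic:

#include <stdio.h>
#include <stdint.h>

/* Toy check (small prime instead of 2^130 - 5): serial Horner over 8
 * "blocks" equals the 4-way interleaved form that advances each lane by
 * r^4 and folds the lanes with r^4, r^3, r^2, r. */
int main(void)
{
    const uint64_t p = 1000003;
    const uint64_t r = 271828 % p;
    const uint64_t m[8] = {11, 22, 33, 44, 55, 66, 77, 88};

    uint64_t a = 0;
    for (int i = 0; i < 8; i++)
        a = (a + m[i]) % p * r % p;                  /* serial */

    const uint64_t r2 = r * r % p, r3 = r2 * r % p, r4 = r2 * r2 % p;
    uint64_t h[4] = {0, 0, 0, 0};
    for (int i = 0; i < 8; i += 4)                   /* four blocks a round */
        for (int j = 0; j < 4; j++)
            h[j] = (h[j] * r4 % p + m[i + j]) % p;
    /* lane 0 holds the oldest block, so it gets the highest power of r */
    uint64_t b = (h[0] * r4 + h[1] * r3 + h[2] * r2 + h[3] * r) % p;

    printf("serial=%llu 4-way=%llu\n",
           (unsigned long long)a, (unsigned long long)b);
    return 0;
}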
/* Set the key to use when processing data.
* Initialize the context.
* Calls AVX set key function as final function calls AVX code.
@@ -946,27 +174,7 @@ static void poly1305_calc_powers(Poly1305* ctx)
* ctx Poly1305 context.
* key The key data (16 bytes).
*/
static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
{
poly1305_setkey_avx(ctx, key);
__asm__ __volatile__ (
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
"vmovdqu %%ymm0, (%[hh])\n\t"
"vmovdqu %%ymm0, 32(%[hh])\n\t"
"vmovdqu %%ymm0, 64(%[hh])\n\t"
"vmovdqu %%ymm0, 96(%[hh])\n\t"
"vmovdqu %%ymm0, 128(%[hh])\n\t"
:
: [hh] "r" (ctx->hh)
: "memory", "ymm0"
);
ctx->leftover = 0;
ctx->finished = 0;
ctx->started = 0;
}
extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
/* Calculate the final result - authentication data.
* Zeros out the private data in the context.
* Calls AVX final function to quickly process last blocks.
@@ -974,46 +182,11 @@ static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
* ctx Poly1305 context.
* mac Buffer to hold 16 bytes - authentication data.
*/
static void poly1305_final_avx2(Poly1305* ctx, byte* mac)
{
int i, j;
int l = (int)ctx->leftover;
extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
#endif
ctx->finished = 1;
if (ctx->started)
poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4);
i = l & ~(POLY1305_BLOCK_SIZE - 1);
if (i > 0)
poly1305_blocks_avx(ctx, ctx->buffer, i);
ctx->leftover -= i;
for (j = 0; i < l; i++, j++)
ctx->buffer[j] = ctx->buffer[i];
poly1305_final_avx(ctx, mac);
/* zero out the state */
__asm__ __volatile__ (
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
"vmovdqu %%ymm0, (%[hh])\n\t"
"vmovdqu %%ymm0, 32(%[hh])\n\t"
"vmovdqu %%ymm0, 64(%[hh])\n\t"
"vmovdqu %%ymm0, 96(%[hh])\n\t"
"vmovdqu %%ymm0, 128(%[hh])\n\t"
"vmovdqu %%ymm0, (%[r1])\n\t"
"vmovdqu %%ymm0, (%[r2])\n\t"
"vmovdqu %%ymm0, (%[r3])\n\t"
"vmovdqu %%ymm0, (%[r4])\n\t"
:
: [hh] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2),
[r3] "r" (ctx->r3), [r4] "r" (ctx->r4)
: "memory", "ymm0"
);
ctx->leftover = 0;
ctx->finished = 0;
ctx->started = 0;
}
#ifdef __cplusplus
} /* extern "C" */
#endif
#elif defined(POLY130564)
@@ -1511,7 +684,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
return 0;
if (!ctx->started)
poly1305_calc_powers(ctx);
poly1305_calc_powers_avx2(ctx);
poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
ctx->leftover = 0;
}
@@ -1521,7 +694,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
if (!ctx->started)
poly1305_calc_powers(ctx);
poly1305_calc_powers_avx2(ctx);
poly1305_blocks_avx2(ctx, m, want);
m += want;
bytes -= (word32)want;


@@ -0,0 +1,986 @@
/* poly1305_asm
*
* Copyright (C) 2006-2018 wolfSSL Inc.
*
* This file is part of wolfSSL.
*
* wolfSSL is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* wolfSSL is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
*/
#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef HAVE_INTEL_AVX2
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX1
.globl poly1305_setkey_avx
.type poly1305_setkey_avx,@function
.align 4
poly1305_setkey_avx:
movabsq $0xffffffc0fffffff, %r10
movabsq $0xffffffc0ffffffc, %r11
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
andq %r10, %rdx
andq %r11, %rax
movq %rdx, %r10
movq %rax, %r11
xorq %r9, %r9
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %r9, 24(%rdi)
movq %r9, 32(%rdi)
movq %r9, 40(%rdi)
movq %rcx, 48(%rdi)
movq %r8, 56(%rdi)
movq %r9, 352(%rdi)
movq %r9, 408(%rdi)
movq %rdx, 360(%rdi)
movq %rax, 416(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 368(%rdi)
movq %r11, 424(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 376(%rdi)
movq %r11, 432(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 384(%rdi)
movq %r11, 440(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 392(%rdi)
movq %r11, 448(%rdi)
addq %rdx, %r10
addq %rax, %r11
movq %r10, 400(%rdi)
movq %r11, 456(%rdi)
movq %r9, 608(%rdi)
movb $0x01, 616(%rdi)
repz retq
.size poly1305_setkey_avx,.-poly1305_setkey_avx
.globl poly1305_block_avx
.type poly1305_block_avx,@function
.align 4
poly1305_block_avx:
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
xorq %r14, %r14
movb 616(%rdi), %r14b
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq %r14, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 352(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 408(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
.size poly1305_block_avx,.-poly1305_block_avx
.globl poly1305_blocks_avx
.type poly1305_blocks_avx,@function
.align 4
poly1305_blocks_avx:
pushq %r15
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
movq %rdx, %rcx
movq (%rdi), %r15
movq 8(%rdi), %rbx
movq 24(%rdi), %r8
movq 32(%rdi), %r9
movq 40(%rdi), %r10
L_poly1305_avx_blocks_start:
# h += m
movq (%rsi), %r11
movq 8(%rsi), %r12
addq %r11, %r8
adcq %r12, %r9
movq %rbx, %rax
adcq $0x00, %r10
# r[1] * h[0] => rdx, rax ==> t2, t1
mulq %r8
movq %rax, %r12
movq %rdx, %r13
# r[0] * h[1] => rdx, rax ++> t2, t1
movq %r15, %rax
mulq %r9
addq %rax, %r12
movq %r15, %rax
adcq %rdx, %r13
# r[0] * h[0] => rdx, rax ==> t4, t0
mulq %r8
movq %rax, %r11
movq %rdx, %r8
# r[1] * h[1] => rdx, rax =+> t3, t2
movq %rbx, %rax
mulq %r9
# r[0] * h[2] +> t2
addq 360(%rdi,%r10,8), %r13
movq %rdx, %r14
addq %r8, %r12
adcq %rax, %r13
# r[1] * h[2] +> t3
adcq 416(%rdi,%r10,8), %r14
# r * h in r14, r13, r12, r11
# h = (r * h) mod 2^130 - 5
movq %r13, %r10
andq $-4, %r13
andq $3, %r10
addq %r13, %r11
movq %r13, %r8
adcq %r14, %r12
adcq $0x00, %r10
shrdq $2, %r14, %r8
shrq $2, %r14
addq %r11, %r8
adcq %r14, %r12
movq %r12, %r9
adcq $0x00, %r10
# h in r10, r9, r8
# Next block from message
addq $16, %rsi
subq $16, %rcx
jg L_poly1305_avx_blocks_start
# Store h to ctx
movq %r8, 24(%rdi)
movq %r9, 32(%rdi)
movq %r10, 40(%rdi)
popq %r14
popq %r13
popq %r12
popq %rbx
popq %r15
repz retq
.size poly1305_blocks_avx,.-poly1305_blocks_avx
.globl poly1305_final_avx
.type poly1305_final_avx,@function
.align 4
poly1305_final_avx:
pushq %rbx
pushq %r12
movq %rsi, %rbx
movq 608(%rdi), %rax
testq %rax, %rax
je L_poly1305_avx_final_no_more
movb $0x01, 480(%rdi,%rax,1)
jmp L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
movb $0x00, 480(%rdi,%rax,1)
L_poly1305_avx_final_cmp_rem:
incb %al
cmpq $16, %rax
jl L_poly1305_avx_final_zero_rem
movb $0x00, 616(%rdi)
leaq 480(%rdi), %rsi
callq poly1305_block_avx@plt
L_poly1305_avx_final_no_more:
movq 24(%rdi), %rax
movq 32(%rdi), %rdx
movq 40(%rdi), %rcx
movq 48(%rdi), %r11
movq 56(%rdi), %r12
# h %= p
# h = (h + pad)
# mod 2^130 - 5
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
# Multiply by 5
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
# Fixup when between (1 << 130) - 1 and (1 << 130) - 5
movq %rax, %r8
movq %rdx, %r9
movq %rcx, %r10
addq $5, %r8
adcq $0x00, %r9
adcq $0x00, %r10
cmpq $4, %r10
cmoveq %r8, %rax
cmoveq %r9, %rdx
# h += pad
addq %r11, %rax
adcq %r12, %rdx
movq %rax, (%rbx)
movq %rdx, 8(%rbx)
# Zero out r
movq $0x00, (%rdi)
movq $0x00, 8(%rdi)
# Zero out h
movq $0x00, 24(%rdi)
movq $0x00, 32(%rdi)
movq $0x00, 40(%rdi)
# Zero out pad
movq $0x00, 48(%rdi)
movq $0x00, 56(%rdi)
popq %r12
popq %rbx
repz retq
.size poly1305_final_avx,.-poly1305_final_avx
#endif /* HAVE_INTEL_AVX1 */
#ifdef HAVE_INTEL_AVX2
.globl poly1305_calc_powers_avx2
.type poly1305_calc_powers_avx2,@function
.align 4
poly1305_calc_powers_avx2:
pushq %r12
pushq %r13
pushq %r14
pushq %r15
pushq %rbx
pushq %rbp
movq (%rdi), %rcx
movq 8(%rdi), %r8
xorq %r9, %r9
# Convert to 26 bits in 32
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 224(%rdi)
movl %edx, 228(%rdi)
movl %esi, 232(%rdi)
movl %ebx, 236(%rdi)
movl %ebp, 240(%rdi)
movl $0x00, 244(%rdi)
# Square 128-bit
movq %r8, %rax
mulq %rcx
xorq %r13, %r13
movq %rax, %r11
movq %rdx, %r12
addq %rax, %r11
adcq %rdx, %r12
adcq $0x00, %r13
movq %rcx, %rax
mulq %rax
movq %rax, %r10
movq %rdx, %r15
movq %r8, %rax
mulq %rax
addq %r15, %r11
adcq %rax, %r12
adcq %rdx, %r13
# Reduce 256-bit to 130-bit
movq %r12, %rax
movq %r13, %rdx
andq $-4, %rax
andq $3, %r12
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
shrdq $2, %rdx, %rax
shrq $2, %rdx
addq %rax, %r10
adcq %rdx, %r11
adcq $0x00, %r12
movq %r12, %rax
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %r10
adcq $0x00, %r11
adcq $0x00, %r12
# Convert to 26 bits in 32
movq %r10, %rax
movq %r10, %rdx
movq %r10, %rsi
movq %r11, %rbx
movq %r11, %rbp
shrq $26, %rdx
shrdq $52, %r11, %rsi
shrq $14, %rbx
shrdq $40, %r12, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 256(%rdi)
movl %edx, 260(%rdi)
movl %esi, 264(%rdi)
movl %ebx, 268(%rdi)
movl %ebp, 272(%rdi)
movl $0x00, 276(%rdi)
# Multiply 128-bit by 130-bit
# r1[0] * r2[0]
movq %rcx, %rax
mulq %r10
movq %rax, %r13
movq %rdx, %r14
# r1[0] * r2[1]
movq %rcx, %rax
mulq %r11
movq $0x00, %r15
addq %rax, %r14
adcq %rdx, %r15
# r1[1] * r2[0]
movq %r8, %rax
mulq %r10
movq $0x00, %rsi
addq %rax, %r14
adcq %rdx, %r15
adcq $0x00, %rsi
# r1[0] * r2[2]
movq %rcx, %rax
mulq %r12
addq %rax, %r15
adcq %rdx, %rsi
# r1[1] * r2[1]
movq %r8, %rax
mulq %r11
movq $0x00, %rbx
addq %rax, %r15
adcq %rdx, %rsi
adcq $0x00, %rbx
# r1[1] * r2[2]
movq %r8, %rax
mulq %r12
addq %rax, %rsi
adcq %rdx, %rbx
# Reduce 260-bit to 130-bit
movq %r15, %rax
movq %rsi, %rdx
movq %rbx, %rbx
andq $-4, %rax
andq $3, %r15
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
shrdq $2, %rdx, %rax
shrdq $2, %rbx, %rdx
shrq $2, %rbx
addq %rax, %r13
adcq %rdx, %r14
adcq %rbx, %r15
movq %r15, %rax
andq $3, %r15
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %r13
adcq $0x00, %r14
adcq $0x00, %r15
# Convert to 26 bits in 32
movq %r13, %rax
movq %r13, %rdx
movq %r13, %rsi
movq %r14, %rbx
movq %r14, %rbp
shrq $26, %rdx
shrdq $52, %r14, %rsi
shrq $14, %rbx
shrdq $40, %r15, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 288(%rdi)
movl %edx, 292(%rdi)
movl %esi, 296(%rdi)
movl %ebx, 300(%rdi)
movl %ebp, 304(%rdi)
movl $0x00, 308(%rdi)
# Square 130-bit
movq %r11, %rax
mulq %r10
xorq %r13, %r13
movq %rax, %r8
movq %rdx, %r9
addq %rax, %r8
adcq %rdx, %r9
adcq $0x00, %r13
movq %r10, %rax
mulq %rax
movq %rax, %rcx
movq %rdx, %r15
movq %r11, %rax
mulq %rax
addq %r15, %r8
adcq %rax, %r9
adcq %rdx, %r13
movq %r12, %rax
mulq %rax
movq %rax, %r14
movq %r12, %rax
mulq %r10
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
addq %rax, %r9
adcq %rdx, %r13
adcq $0x00, %r14
movq %r12, %rax
mulq %r11
addq %rax, %r13
adcq %rdx, %r14
addq %rax, %r13
adcq %rdx, %r14
# Reduce 260-bit to 130-bit
movq %r9, %rax
movq %r13, %rdx
movq %r14, %r15
andq $-4, %rax
andq $3, %r9
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
shrdq $2, %rdx, %rax
shrdq $2, %r15, %rdx
shrq $2, %r15
addq %rax, %rcx
adcq %rdx, %r8
adcq %r15, %r9
movq %r9, %rax
andq $3, %r9
shrq $2, %rax
leaq 0(%rax,%rax,4), %rax
addq %rax, %rcx
adcq $0x00, %r8
adcq $0x00, %r9
# Convert to 26 bits in 32
movq %rcx, %rax
movq %rcx, %rdx
movq %rcx, %rsi
movq %r8, %rbx
movq %r8, %rbp
shrq $26, %rdx
shrdq $52, %r8, %rsi
shrq $14, %rbx
shrdq $40, %r9, %rbp
andq $0x3ffffff, %rax
andq $0x3ffffff, %rdx
andq $0x3ffffff, %rsi
andq $0x3ffffff, %rbx
andq $0x3ffffff, %rbp
movl %eax, 320(%rdi)
movl %edx, 324(%rdi)
movl %esi, 328(%rdi)
movl %ebx, 332(%rdi)
movl %ebp, 336(%rdi)
movl $0x00, 340(%rdi)
popq %rbp
popq %rbx
popq %r15
popq %r14
popq %r13
popq %r12
repz retq
.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
.globl poly1305_setkey_avx2
.type poly1305_setkey_avx2,@function
.align 4
poly1305_setkey_avx2:
callq poly1305_setkey_avx@plt
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
.align 32
L_poly1305_avx2_blocks_mask:
.quad 0x3ffffff, 0x3ffffff
.quad 0x3ffffff, 0x3ffffff
.align 32
L_poly1305_avx2_blocks_hibit:
.quad 0x1000000, 0x1000000
.quad 0x1000000, 0x1000000
.globl poly1305_blocks_avx2
.type poly1305_blocks_avx2,@function
.align 4
poly1305_blocks_avx2:
pushq %r12
pushq %rbx
subq $0x140, %rsp
movq %rsp, %rcx
andq $-32, %rcx
addq $32, %rcx
vpxor %ymm15, %ymm15, %ymm15
movq %rcx, %rbx
leaq 64(%rdi), %rax
addq $0xa0, %rbx
cmpw $0x00, 616(%rdi)
jne L_poly1305_avx2_blocks_begin_h
# Load the message data
vmovdqu (%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vperm2i128 $32, %ymm1, %ymm0, %ymm2
vperm2i128 $49, %ymm1, %ymm0, %ymm0
vpunpckldq %ymm0, %ymm2, %ymm1
vpunpckhdq %ymm0, %ymm2, %ymm3
vpunpckldq %ymm15, %ymm1, %ymm0
vpunpckhdq %ymm15, %ymm1, %ymm1
vpunpckldq %ymm15, %ymm3, %ymm2
vpunpckhdq %ymm15, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
vpsllq $6, %ymm1, %ymm1
vpsllq $12, %ymm2, %ymm2
vpsllq $18, %ymm3, %ymm3
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# Reduce, in place, the message data
vpsrlq $26, %ymm0, %ymm10
vpsrlq $26, %ymm3, %ymm11
vpand %ymm14, %ymm0, %ymm0
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm1, %ymm10, %ymm1
vpaddq %ymm4, %ymm11, %ymm4
vpsrlq $26, %ymm1, %ymm10
vpsrlq $26, %ymm4, %ymm11
vpand %ymm14, %ymm1, %ymm1
vpand %ymm14, %ymm4, %ymm4
vpaddq %ymm2, %ymm10, %ymm2
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm2, %ymm10
vpaddq %ymm0, %ymm12, %ymm0
vpsrlq $26, %ymm0, %ymm11
vpand %ymm14, %ymm2, %ymm2
vpand %ymm14, %ymm0, %ymm0
vpaddq %ymm3, %ymm10, %ymm3
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm3, %ymm10
vpand %ymm14, %ymm3, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jz L_poly1305_avx2_blocks_store
jmp L_poly1305_avx2_blocks_load_r4
L_poly1305_avx2_blocks_begin_h:
# Load the H values.
vmovdqu (%rax), %ymm0
vmovdqu 32(%rax), %ymm1
vmovdqu 64(%rax), %ymm2
vmovdqu 96(%rax), %ymm3
vmovdqu 128(%rax), %ymm4
# Check if there is a power of r to load - otherwise use r^4.
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_load_r4
# Load the 4 powers of r - r^4, r^3, r^2, r^1.
vmovdqu 224(%rdi), %ymm8
vmovdqu 256(%rdi), %ymm7
vmovdqu 288(%rdi), %ymm6
vmovdqu 320(%rdi), %ymm5
vpermq $0xd8, %ymm5, %ymm5
vpermq $0xd8, %ymm6, %ymm6
vpermq $0xd8, %ymm7, %ymm7
vpermq $0xd8, %ymm8, %ymm8
vpunpcklqdq %ymm6, %ymm5, %ymm10
vpunpckhqdq %ymm6, %ymm5, %ymm11
vpunpcklqdq %ymm8, %ymm7, %ymm12
vpunpckhqdq %ymm8, %ymm7, %ymm13
vperm2i128 $32, %ymm12, %ymm10, %ymm5
vperm2i128 $49, %ymm12, %ymm10, %ymm7
vperm2i128 $32, %ymm13, %ymm11, %ymm9
vpsrlq $32, %ymm5, %ymm6
vpsrlq $32, %ymm7, %ymm8
jmp L_poly1305_avx2_blocks_mul_5
L_poly1305_avx2_blocks_load_r4:
# Load r^4 into all four positions.
vmovdqu 320(%rdi), %ymm13
vpermq $0x00, %ymm13, %ymm5
vpsrlq $32, %ymm13, %ymm14
vpermq $0x55, %ymm13, %ymm7
vpermq $0xaa, %ymm13, %ymm9
vpermq $0x00, %ymm14, %ymm6
vpermq $0x55, %ymm14, %ymm8
L_poly1305_avx2_blocks_mul_5:
# Multiply top 4 26-bit values of all four H by 5
vpslld $2, %ymm6, %ymm10
vpslld $2, %ymm7, %ymm11
vpslld $2, %ymm8, %ymm12
vpslld $2, %ymm9, %ymm13
vpaddq %ymm10, %ymm6, %ymm10
vpaddq %ymm11, %ymm7, %ymm11
vpaddq %ymm12, %ymm8, %ymm12
vpaddq %ymm13, %ymm9, %ymm13
# Store powers of r and multiple of 5 for use in multiply.
vmovdqa %ymm10, (%rbx)
vmovdqa %ymm11, 32(%rbx)
vmovdqa %ymm12, 64(%rbx)
vmovdqa %ymm13, 96(%rbx)
vmovdqa %ymm5, (%rcx)
vmovdqa %ymm6, 32(%rcx)
vmovdqa %ymm7, 64(%rcx)
vmovdqa %ymm8, 96(%rcx)
vmovdqa %ymm9, 128(%rcx)
vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
# If not finished then loop over data
cmpb $0x01, 616(%rdi)
jne L_poly1305_avx2_blocks_start
# Do last multiply, reduce, add the four H together and move to
# 32-bit registers
vpmuludq (%rbx), %ymm4, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
vpsrldq $8, %ymm0, %ymm5
vpsrldq $8, %ymm1, %ymm6
vpsrldq $8, %ymm2, %ymm7
vpsrldq $8, %ymm3, %ymm8
vpsrldq $8, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vpermq $2, %ymm0, %ymm5
vpermq $2, %ymm1, %ymm6
vpermq $2, %ymm2, %ymm7
vpermq $2, %ymm3, %ymm8
vpermq $2, %ymm4, %ymm9
vpaddq %ymm0, %ymm5, %ymm0
vpaddq %ymm1, %ymm6, %ymm1
vpaddq %ymm2, %ymm7, %ymm2
vpaddq %ymm3, %ymm8, %ymm3
vpaddq %ymm4, %ymm9, %ymm4
vmovd %xmm0, %r8d
vmovd %xmm1, %r9d
vmovd %xmm2, %r10d
vmovd %xmm3, %r11d
vmovd %xmm4, %r12d
jmp L_poly1305_avx2_blocks_end_calc
L_poly1305_avx2_blocks_start:
vmovdqu (%rsi), %ymm5
vmovdqu 32(%rsi), %ymm6
vperm2i128 $32, %ymm6, %ymm5, %ymm7
vperm2i128 $49, %ymm6, %ymm5, %ymm5
vpunpckldq %ymm5, %ymm7, %ymm6
vpunpckhdq %ymm5, %ymm7, %ymm8
vpunpckldq %ymm15, %ymm6, %ymm5
vpunpckhdq %ymm15, %ymm6, %ymm6
vpunpckldq %ymm15, %ymm8, %ymm7
vpunpckhdq %ymm15, %ymm8, %ymm8
vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
vpsllq $6, %ymm6, %ymm6
vpsllq $12, %ymm7, %ymm7
vpsllq $18, %ymm8, %ymm8
vpmuludq (%rbx), %ymm4, %ymm10
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 32(%rbx), %ymm3, %ymm10
vpmuludq 32(%rbx), %ymm4, %ymm11
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 64(%rbx), %ymm3, %ymm11
vpmuludq 64(%rbx), %ymm4, %ymm12
vpaddq %ymm7, %ymm12, %ymm7
vpaddq %ymm5, %ymm10, %ymm5
vpmuludq 64(%rbx), %ymm2, %ymm12
vpmuludq 96(%rbx), %ymm4, %ymm13
vpaddq %ymm8, %ymm13, %ymm8
vpaddq %ymm6, %ymm11, %ymm6
vpmuludq 96(%rbx), %ymm1, %ymm13
vpmuludq 96(%rbx), %ymm2, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpmuludq 96(%rbx), %ymm3, %ymm11
vpmuludq (%rcx), %ymm3, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq (%rcx), %ymm4, %ymm13
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq (%rcx), %ymm0, %ymm13
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq (%rcx), %ymm1, %ymm10
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq (%rcx), %ymm2, %ymm11
vpmuludq 32(%rcx), %ymm2, %ymm12
vpaddq %ymm5, %ymm13, %ymm5
vpmuludq 32(%rcx), %ymm3, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 32(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 32(%rcx), %ymm1, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 64(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpmuludq 64(%rcx), %ymm2, %ymm13
vpaddq %ymm6, %ymm10, %ymm6
vpmuludq 64(%rcx), %ymm0, %ymm10
vpaddq %ymm7, %ymm11, %ymm7
vpmuludq 96(%rcx), %ymm0, %ymm11
vpaddq %ymm8, %ymm12, %ymm8
vpmuludq 96(%rcx), %ymm1, %ymm12
vpaddq %ymm9, %ymm13, %ymm9
vpaddq %ymm7, %ymm10, %ymm7
vpmuludq 128(%rcx), %ymm0, %ymm13
vpaddq %ymm8, %ymm11, %ymm8
vpaddq %ymm9, %ymm12, %ymm9
vpaddq %ymm9, %ymm13, %ymm9
vpsrlq $26, %ymm5, %ymm10
vpsrlq $26, %ymm8, %ymm11
vpand %ymm14, %ymm5, %ymm5
vpand %ymm14, %ymm8, %ymm8
vpaddq %ymm6, %ymm10, %ymm6
vpaddq %ymm9, %ymm11, %ymm9
vpsrlq $26, %ymm6, %ymm10
vpsrlq $26, %ymm9, %ymm11
vpand %ymm14, %ymm6, %ymm1
vpand %ymm14, %ymm9, %ymm4
vpaddq %ymm7, %ymm10, %ymm7
vpslld $2, %ymm11, %ymm12
vpaddd %ymm12, %ymm11, %ymm12
vpsrlq $26, %ymm7, %ymm10
vpaddq %ymm5, %ymm12, %ymm5
vpsrlq $26, %ymm5, %ymm11
vpand %ymm14, %ymm7, %ymm2
vpand %ymm14, %ymm5, %ymm0
vpaddq %ymm8, %ymm10, %ymm8
vpaddq %ymm1, %ymm11, %ymm1
vpsrlq $26, %ymm8, %ymm10
vpand %ymm14, %ymm8, %ymm3
vpaddq %ymm4, %ymm10, %ymm4
addq $0x40, %rsi
subq $0x40, %rdx
jnz L_poly1305_avx2_blocks_start
L_poly1305_avx2_blocks_store:
# Store four H values - state
vmovdqu %ymm0, (%rax)
vmovdqu %ymm1, 32(%rax)
vmovdqu %ymm2, 64(%rax)
vmovdqu %ymm3, 96(%rax)
vmovdqu %ymm4, 128(%rax)
L_poly1305_avx2_blocks_end_calc:
cmpb $0x00, 616(%rdi)
je L_poly1305_avx2_blocks_complete
movq %r8, %rax
movq %r10, %rdx
movq %r12, %rcx
shrq $12, %rdx
shrq $24, %rcx
shlq $26, %r9
shlq $52, %r10
shlq $14, %r11
shlq $40, %r12
addq %r9, %rax
adcq %r10, %rax
addq %r11, %rdx
adcq %r12, %rdx
adcq $0x00, %rcx
movq %rcx, %r8
andq $3, %rcx
shrq $2, %r8
leaq 0(%r8,%r8,4), %r8
addq %r8, %rax
adcq $0x00, %rdx
adcq $0x00, %rcx
movq %rax, 24(%rdi)
movq %rdx, 32(%rdi)
movq %rcx, 40(%rdi)
L_poly1305_avx2_blocks_complete:
movb $0x01, 617(%rdi)
addq $0x140, %rsp
popq %rbx
popq %r12
repz retq
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
.globl poly1305_final_avx2
.type poly1305_final_avx2,@function
.align 4
poly1305_final_avx2:
movb $0x01, 616(%rdi)
movb 617(%rdi), %cl
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks_X4
pushq %rsi
movq $0x40, %rdx
xorq %rsi, %rsi
callq poly1305_blocks_avx2@plt
popq %rsi
L_poly1305_avx2_final_done_blocks_X4:
movq 608(%rdi), %rax
movq %rax, %rcx
andq $-16, %rcx
cmpb $0x00, %cl
je L_poly1305_avx2_final_done_blocks
pushq %rcx
pushq %rax
pushq %rsi
movq %rcx, %rdx
leaq 480(%rdi), %rsi
callq poly1305_blocks_avx@plt
popq %rsi
popq %rax
popq %rcx
L_poly1305_avx2_final_done_blocks:
subq %rcx, 608(%rdi)
xorq %rdx, %rdx
jmp L_poly1305_avx2_final_cmp_copy
L_poly1305_avx2_final_start_copy:
movb 480(%rdi,%rcx,1), %r8b
movb %r8b, 480(%rdi,%rdx,1)
incb %cl
incb %dl
L_poly1305_avx2_final_cmp_copy:
cmp %rcx, %rax
jne L_poly1305_avx2_final_start_copy
callq poly1305_final_avx@plt
vpxor %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, 64(%rdi)
vmovdqu %ymm0, 96(%rdi)
vmovdqu %ymm0, 128(%rdi)
vmovdqu %ymm0, 160(%rdi)
vmovdqu %ymm0, 192(%rdi)
vmovdqu %ymm0, 224(%rdi)
vmovdqu %ymm0, 256(%rdi)
vmovdqu %ymm0, 288(%rdi)
vmovdqu %ymm0, 320(%rdi)
movq $0x00, 608(%rdi)
movw $0x00, 616(%rdi)
repz retq
.size poly1305_final_avx2,.-poly1305_final_avx2
#endif /* HAVE_INTEL_AVX2 */

Diffs for the remaining large files are suppressed, among them the new wolfcrypt/src/sha256_asm.S (22437 additions), wolfcrypt/src/sha512_asm.S (10545 additions) and wolfcrypt/src/sp_x86_64_asm.S (25934 additions).


@@ -56,8 +56,8 @@
#include <wolfssl/wolfcrypt/sha512.h>
#include <wolfssl/wolfcrypt/arc4.h>
#if defined(WC_NO_RNG) && defined(USE_FAST_MATH)
#include <wolfssl/wolfcrypt/tfm.h>
#if defined(WC_NO_RNG)
#include <wolfssl/wolfcrypt/integer.h>
#else
#include <wolfssl/wolfcrypt/random.h>
#endif


@@ -135,9 +135,6 @@ typedef struct wc_Sha256 {
word32 loLen; /* length in bytes */
word32 hiLen; /* length in bytes */
void* heap;
#ifdef USE_INTEL_SPEEDUP
const byte* data;
#endif
#ifdef WOLFSSL_PIC32MZ_HASH
hashUpdCache cache; /* cache for updates */
#endif