Pull out x86_64 ASM into separate files
This commit is contained in:
parent
d16c2ca7c6
commit
7822cef1ac
392
configure.ac
392
configure.ac
@ -1063,6 +1063,7 @@ then
|
||||
fi
|
||||
|
||||
AM_CONDITIONAL([BUILD_AESNI], [test "x$ENABLED_AESNI" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_INTELASM], [test "x$ENABLED_INTELASM" = "xyes"])
|
||||
|
||||
|
||||
# Linux af_alg
|
||||
@ -3554,191 +3555,6 @@ then
|
||||
fi
|
||||
fi
|
||||
|
||||
# Single Precision maths implementation
|
||||
AC_ARG_ENABLE([sp],
|
||||
[AS_HELP_STRING([--enable-sp],[Enable Single Precision maths implementation (default: disabled)])],
|
||||
[ ENABLED_SP=$enableval ],
|
||||
[ ENABLED_SP=no ],
|
||||
)
|
||||
|
||||
ENABLED_SP_RSA=no
|
||||
ENABLED_SP_DH=no
|
||||
ENABLED_SP_ECC=no
|
||||
for v in `echo $ENABLED_SP | tr "," " "`
|
||||
do
|
||||
case $v in
|
||||
small)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
ENABLED_SP_ECC=yes
|
||||
;;
|
||||
yes)
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
ENABLED_SP_ECC=yes
|
||||
;;
|
||||
no)
|
||||
;;
|
||||
|
||||
smallec256 | smallp256 | small256)
|
||||
ENABLED_SP_ECC=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
;;
|
||||
ec256 | p256 | 256)
|
||||
ENABLED_SP_ECC=yes
|
||||
;;
|
||||
|
||||
small2048)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
2048)
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
|
||||
smallrsa2048)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
rsa2048)
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
|
||||
small3072)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
3072)
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
|
||||
smallrsa3072)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
rsa3072)
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
|
||||
*)
|
||||
AC_MSG_ERROR([Invalid choice of Single Precision length in bits [256, 2048, 3072]: $ENABLED_SP.])
|
||||
break;;
|
||||
esac
|
||||
done
|
||||
|
||||
ENABLED_SP=no
|
||||
if test "$ENABLED_RSA" = "yes" && test "$ENABLED_SP_RSA" = "yes"; then
|
||||
ENABLED_SP=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_RSA"
|
||||
fi
|
||||
if test "$ENABLED_DH" = "yes" && test "$ENABLED_SP_DH" = "yes"; then
|
||||
ENABLED_SP=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_DH"
|
||||
fi
|
||||
if test "$ENABLED_ECC" = "yes" && test "$ENABLED_SP_ECC" = "yes"; then
|
||||
ENABLED_SP=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_ECC"
|
||||
fi
|
||||
|
||||
|
||||
AC_ARG_ENABLE([sp-asm],
|
||||
[AS_HELP_STRING([--enable-sp-asm],[Enable Single Precision assembly implementation (default: disabled)])],
|
||||
[ ENABLED_SP_ASM=$enableval ],
|
||||
[ ENABLED_SP_ASM=no ],
|
||||
)
|
||||
if test "$ENABLED_SP_ASM" = "yes"; then
|
||||
if test "$ENABLED_SP" = "no"; then
|
||||
AC_MSG_ERROR([Must have SP enabled: --enable-sp])
|
||||
fi
|
||||
if test "$ENABLED_ASM" = "no"; then
|
||||
AC_MSG_ERROR([Assembly code turned off])
|
||||
fi
|
||||
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ASM"
|
||||
case $host_cpu in
|
||||
*aarch64*)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM64_ASM"
|
||||
ENABLED_SP_ARM64_ASM=yes
|
||||
;;
|
||||
*arm*)
|
||||
if test $host_alias = "thumb"; then
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM -mthumb -march=armv6"
|
||||
ENABLED_SP_ARM_THUMB_ASM=yes
|
||||
else
|
||||
if test $host_alias = "cortex"; then
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_CORTEX_ASM"
|
||||
ENABLED_SP_ARM_CORTEX_ASM=yes
|
||||
else
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
|
||||
ENABLED_SP_ARM32_ASM=yes
|
||||
fi
|
||||
fi
|
||||
;;
|
||||
*x86_64*)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64_ASM"
|
||||
ENABLED_SP_X86_64_ASM=yes
|
||||
;;
|
||||
*)
|
||||
AC_MSG_ERROR([ASM not available for CPU. Supported CPUs: x86_64, aarch64, arm])
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
AC_ARG_ENABLE([sp-math],
|
||||
[AS_HELP_STRING([--enable-sp-math],[Enable Single Precision math implementation only (default: disabled)])],
|
||||
[ ENABLED_SP_MATH=$enableval ],
|
||||
[ ENABLED_SP_MATH=no ],
|
||||
)
|
||||
if test "$ENABLED_SP_MATH" = "yes"; then
|
||||
if test "$ENABLED_SP" = "no"; then
|
||||
AC_MSG_ERROR([Must have SP enabled: --enable-sp])
|
||||
fi
|
||||
if test "$ENABLED_ECCCUSTCURVES" = "yes"; then
|
||||
AC_MSG_ERROR([Cannot use single precision math and custom curves])
|
||||
fi
|
||||
if test "$ENABLED_OPENSSLEXTRA" = "yes"; then
|
||||
AC_MSG_ERROR([Cannot use single precision math and OpenSSL extra])
|
||||
fi
|
||||
if test "$ENABLED_DSA" = "yes"; then
|
||||
AC_MSG_ERROR([Cannot use single precision math and DSA])
|
||||
fi
|
||||
if test "$ENABLED_SRP" = "yes"; then
|
||||
AC_MSG_ERROR([Cannot use single precision math and SRP])
|
||||
fi
|
||||
if test "$ENABLED_SP_RSA" = "no" && test "$ENABLED_RSA" = "yes"; then
|
||||
AC_MSG_ERROR([Cannot use Single Precision maths without RSA with RSA])
|
||||
fi
|
||||
if test "$ENABLED_SP_DH" = "no" && test "$ENABLED_DH" = "yes"; then
|
||||
AC_MSG_ERROR([Cannot use Single Precision maths without DH with DH])
|
||||
fi
|
||||
fi
|
||||
if test "$ENABLED_SP_MATH" = "yes"; then
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_MATH"
|
||||
fi
|
||||
|
||||
AM_CONDITIONAL([BUILD_SP], [test "x$ENABLED_SP" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_SP_C], [test "x$ENABLED_SP" = "xyes" && test "x$ENABLED_SP_ASM" = "xno" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM64], [test "x$ENABLED_SP_ARM64_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM32], [test "x$ENABLED_SP_ARM32_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM_THUMB], [test "x$ENABLED_SP_ARM_THUMB_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM_CORTEX], [test "x$ENABLED_SP_ARM_CORTEX_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_X86_64], [test "x$ENABLED_SP_X86_64_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_INT], [test "x$ENABLED_SP_MATH" = "xyes" ])
|
||||
|
||||
|
||||
# set fastmath default
|
||||
FASTMATH_DEFAULT=no
|
||||
|
||||
@ -4016,6 +3832,212 @@ AC_ARG_WITH([intelqa],
|
||||
)
|
||||
AM_CONDITIONAL([BUILD_INTEL_QA], [test "x$ENABLED_INTEL_QA" = "xyes"])
|
||||
|
||||
# Single Precision maths implementation
|
||||
AC_ARG_ENABLE([sp],
|
||||
[AS_HELP_STRING([--enable-sp],[Enable Single Precision maths implementation (default: disabled)])],
|
||||
[ ENABLED_SP=$enableval ],
|
||||
[ ENABLED_SP=no ],
|
||||
)
|
||||
|
||||
ENABLED_SP_RSA=no
|
||||
ENABLED_SP_DH=no
|
||||
ENABLED_SP_ECC=no
|
||||
for v in `echo $ENABLED_SP | tr "," " "`
|
||||
do
|
||||
case $v in
|
||||
small)
  # Enable the small-code variant for RSA, DH and ECC.
  # The define is mirrored into AM_CCASFLAGS, as the other small* choices
  # in this loop do, so that preprocessed assembly sources are built with
  # the same configuration as the C sources.
  AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
  AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
  ENABLED_SP_RSA=yes
  ENABLED_SP_DH=yes
  ENABLED_SP_ECC=yes
  ;;
|
||||
yes)
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
ENABLED_SP_ECC=yes
|
||||
;;
|
||||
no)
|
||||
;;
|
||||
|
||||
smallec256 | smallp256 | small256)
|
||||
ENABLED_SP_ECC=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
|
||||
;;
|
||||
ec256 | p256 | 256)
|
||||
ENABLED_SP_ECC=yes
|
||||
;;
|
||||
|
||||
small2048)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
2048)
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
|
||||
smallrsa2048)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
rsa2048)
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_3072"
|
||||
;;
|
||||
|
||||
small3072)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
3072)
|
||||
ENABLED_SP_RSA=yes
|
||||
ENABLED_SP_DH=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
|
||||
smallrsa3072)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_SMALL"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_SMALL"
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
rsa3072)
|
||||
ENABLED_SP_RSA=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_NO_2048"
|
||||
;;
|
||||
|
||||
*)
|
||||
AC_MSG_ERROR([Invalid choice of Single Precision length in bits [256, 2048, 3072]: $ENABLED_SP.])
|
||||
break;;
|
||||
esac
|
||||
done
|
||||
|
||||
ENABLED_SP=no
|
||||
if test "$ENABLED_RSA" = "yes" && test "$ENABLED_SP_RSA" = "yes"; then
|
||||
ENABLED_SP=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_RSA"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_HAVE_SP_RSA"
|
||||
fi
|
||||
if test "$ENABLED_DH" = "yes" && test "$ENABLED_SP_DH" = "yes"; then
|
||||
ENABLED_SP=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_DH"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_HAVE_SP_DH"
|
||||
fi
|
||||
if test "$ENABLED_ECC" = "yes" && test "$ENABLED_SP_ECC" = "yes"; then
|
||||
ENABLED_SP=yes
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_HAVE_SP_ECC"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_HAVE_SP_ECC"
|
||||
fi
|
||||
|
||||
|
||||
AC_ARG_ENABLE([sp-asm],
|
||||
[AS_HELP_STRING([--enable-sp-asm],[Enable Single Precision assembly implementation (default: disabled)])],
|
||||
[ ENABLED_SP_ASM=$enableval ],
|
||||
[ ENABLED_SP_ASM=no ],
|
||||
)
|
||||
if test "$ENABLED_SP_ASM" = "yes"; then
|
||||
if test "$ENABLED_SP" = "no"; then
|
||||
AC_MSG_ERROR([Must have SP enabled: --enable-sp])
|
||||
fi
|
||||
if test "$ENABLED_ASM" = "no"; then
|
||||
AC_MSG_ERROR([Assembly code turned off])
|
||||
fi
|
||||
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ASM"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ASM"
|
||||
case $host_cpu in
|
||||
*aarch64*)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM64_ASM"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM64_ASM"
|
||||
ENABLED_SP_ARM64_ASM=yes
|
||||
;;
|
||||
*arm*)
  # Select the 32-bit ARM flavour from the host alias.
  # host_alias may be empty, for example when no host option is passed to
  # configure, so it is quoted to keep the test expression well formed.
  if test "$host_alias" = "thumb"; then
    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM -mthumb -march=armv6"
    AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM_THUMB_ASM"
    ENABLED_SP_ARM_THUMB_ASM=yes
  elif test "$host_alias" = "cortex"; then
    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM_CORTEX_ASM"
    AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM_CORTEX_ASM"
    ENABLED_SP_ARM_CORTEX_ASM=yes
  else
    # Any other alias falls back to the generic ARM32 implementation.
    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_ARM32_ASM"
    AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_ARM32_ASM"
    ENABLED_SP_ARM32_ASM=yes
  fi
  ;;
|
||||
*x86_64*)
|
||||
AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64_ASM"
|
||||
AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_X86_64_ASM"
|
||||
ENABLED_SP_X86_64_ASM=yes
|
||||
;;
|
||||
*)
|
||||
AC_MSG_ERROR([ASM not available for CPU. Supported CPUs: x86_64, aarch64, arm])
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
AC_ARG_ENABLE([sp-math],
|
||||
[AS_HELP_STRING([--enable-sp-math],[Enable Single Precision math implementation only (default: disabled)])],
|
||||
[ ENABLED_SP_MATH=$enableval ],
|
||||
[ ENABLED_SP_MATH=no ],
|
||||
)
|
||||
# Handling of --enable-sp-math: stop configuration on each combination that
# is rejected below, then define WOLFSSL_SP_MATH for the C sources.
AS_IF([test "$ENABLED_SP_MATH" = "yes"], [
  AS_IF([test "$ENABLED_SP" = "no"],
        [AC_MSG_ERROR([Must have SP enabled: --enable-sp])])
  AS_IF([test "$ENABLED_ECCCUSTCURVES" = "yes"],
        [AC_MSG_ERROR([Cannot use single precision math and custom curves])])
  AS_IF([test "$ENABLED_OPENSSLEXTRA" = "yes"],
        [AC_MSG_ERROR([Cannot use single precision math and OpenSSL extra])])
  AS_IF([test "$ENABLED_DSA" = "yes"],
        [AC_MSG_ERROR([Cannot use single precision math and DSA])])
  AS_IF([test "$ENABLED_SRP" = "yes"],
        [AC_MSG_ERROR([Cannot use single precision math and SRP])])
  AS_IF([test "$ENABLED_SP_RSA" = "no" && test "$ENABLED_RSA" = "yes"],
        [AC_MSG_ERROR([Cannot use P256 single precision only math and RSA])])
  AS_IF([test "$ENABLED_SP_DH" = "no" && test "$ENABLED_DH" = "yes"],
        [AC_MSG_ERROR([Cannot use P256 single precision only math and DH])])

  AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_MATH"
])
|
||||
|
||||
AM_CONDITIONAL([BUILD_SP], [test "x$ENABLED_SP" = "xyes"])
|
||||
AM_CONDITIONAL([BUILD_SP_C], [test "x$ENABLED_SP" = "xyes" && test "x$ENABLED_SP_ASM" = "xno" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM64], [test "x$ENABLED_SP_ARM64_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM32], [test "x$ENABLED_SP_ARM32_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM_THUMB], [test "x$ENABLED_SP_ARM_THUMB_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_ARM_CORTEX], [test "x$ENABLED_SP_ARM_CORTEX_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_X86_64], [test "x$ENABLED_SP_X86_64_ASM" = "xyes" ])
|
||||
AM_CONDITIONAL([BUILD_SP_INT], [test "x$ENABLED_SP_MATH" = "xyes" ])
|
||||
|
||||
# Fast RSA using Intel IPP
|
||||
ippdir="${srcdir}/IPP"
|
||||
ipplib="lib" # if autoconf guesses 32bit system changes lib directory
|
||||
|
@ -145,6 +145,9 @@ if BUILD_ARMASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-sha256.c
|
||||
else
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256.c
|
||||
if BUILD_INTELASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha256_asm.S
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -179,6 +182,7 @@ src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_c64.c
|
||||
endif
|
||||
if BUILD_SP_X86_64
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_x86_64.c
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_x86_64_asm.S
|
||||
endif
|
||||
if BUILD_SP_ARM32
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sp_arm32.c
|
||||
@ -230,6 +234,9 @@ endif
|
||||
if !BUILD_FIPS_V2
|
||||
if BUILD_SHA512
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512.c
|
||||
if BUILD_INTELASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/sha512_asm.S
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -267,6 +274,9 @@ endif
|
||||
|
||||
if BUILD_POLY1305
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/poly1305.c
|
||||
if BUILD_INTELASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/poly1305_asm.S
|
||||
endif
|
||||
endif
|
||||
|
||||
if BUILD_RC4
|
||||
@ -293,6 +303,7 @@ endif
|
||||
if !BUILD_FIPS_V2
|
||||
if BUILD_AESNI
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -322,6 +333,9 @@ endif
|
||||
|
||||
if BUILD_CHACHA
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha.c
|
||||
if BUILD_INTELASM
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha_asm.S
|
||||
endif
|
||||
if BUILD_POLY1305
|
||||
src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c
|
||||
endif
|
||||
|
@ -4141,7 +4141,9 @@ exit_rsa_verify:
|
||||
}
|
||||
|
||||
FREE_ARRAY_DYNAMIC(enc, BENCH_MAX_PENDING, HEAP_HINT);
|
||||
#if !defined(WOLFSSL_RSA_VERIFY_INLINE) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
|
||||
FREE_ARRAY_DYNAMIC(out, BENCH_MAX_PENDING, HEAP_HINT);
|
||||
#endif
|
||||
FREE_VAR(message, HEAP_HINT);
|
||||
}
|
||||
|
||||
|
3154
wolfcrypt/src/aes.c
3154
wolfcrypt/src/aes.c
File diff suppressed because it is too large
Load Diff
8367
wolfcrypt/src/aes_gcm_asm.S
Normal file
8367
wolfcrypt/src/aes_gcm_asm.S
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1314
wolfcrypt/src/chacha_asm.S
Normal file
1314
wolfcrypt/src/chacha_asm.S
Normal file
File diff suppressed because it is too large
Load Diff
@ -118,827 +118,55 @@ static word32 cpu_flags_set = 0;
|
||||
#endif
|
||||
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
/* Process one block (16 bytes) of data.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* m One block of message data.
|
||||
*/
|
||||
static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
|
||||
{
|
||||
__asm__ __volatile__ (
|
||||
"movq (%[ctx]), %%r15\n\t"
|
||||
"movq 24(%[ctx]), %%r8\n\t"
|
||||
"movq 32(%[ctx]), %%r9\n\t"
|
||||
"movq 40(%[ctx]), %%r10\n\t"
|
||||
"xorq %%rbx, %%rbx\n\t"
|
||||
"movb %[nfin], %%bl\n\t"
|
||||
"# h += m\n\t"
|
||||
"movq (%[m]), %%r11\n\t"
|
||||
"movq 8(%[m]), %%r12\n\t"
|
||||
"addq %%r11, %%r8\n\t"
|
||||
"adcq %%r12, %%r9\n\t"
|
||||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"adcq %%rbx, %%r10\n\t"
|
||||
"# r[1] * h[0] => rdx, rax ==> t2, t1\n\t"
|
||||
"mulq %%r8\n\t"
|
||||
"movq %%rax, %%r12\n\t"
|
||||
"movq %%rdx, %%r13\n\t"
|
||||
"# r[0] * h[1] => rdx, rax ++> t2, t1\n\t"
|
||||
"movq %%r15, %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
"addq %%rax, %%r12\n\t"
|
||||
"movq %%r15, %%rax\n\t"
|
||||
"adcq %%rdx, %%r13\n\t"
|
||||
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
|
||||
"mulq %%r8\n\t"
|
||||
"movq %%rax, %%r11\n\t"
|
||||
"movq %%rdx, %%r8\n\t"
|
||||
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
|
||||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
"# r[0] * h[2] +> t2\n\t"
|
||||
"addq 352(%[ctx],%%r10,8), %%r13\n\t"
|
||||
"movq %%rdx, %%r14\n\t"
|
||||
"addq %%r8, %%r12\n\t"
|
||||
"adcq %%rax, %%r13\n\t"
|
||||
"# r[1] * h[2] +> t3\n\t"
|
||||
"adcq 408(%[ctx],%%r10,8), %%r14\n\t"
|
||||
"# r * h in r14, r13, r12, r11 \n\t"
|
||||
"# h = (r * h) mod 2^130 - 5\n\t"
|
||||
"movq %%r13, %%r10\n\t"
|
||||
"andq $-4, %%r13\n\t"
|
||||
"andq $3, %%r10\n\t"
|
||||
"addq %%r13, %%r11\n\t"
|
||||
"movq %%r13, %%r8\n\t"
|
||||
"adcq %%r14, %%r12\n\t"
|
||||
"adcq $0, %%r10\n\t"
|
||||
"shrdq $2, %%r14, %%r8\n\t"
|
||||
"shrq $2, %%r14\n\t"
|
||||
"addq %%r11, %%r8\n\t"
|
||||
"adcq %%r14, %%r12\n\t"
|
||||
"movq %%r12, %%r9\n\t"
|
||||
"adcq $0, %%r10\n\t"
|
||||
"# h in r10, r9, r8 \n\t"
|
||||
"# Store h to ctx\n\t"
|
||||
"movq %%r8, 24(%[ctx])\n\t"
|
||||
"movq %%r9, 32(%[ctx])\n\t"
|
||||
"movq %%r10, 40(%[ctx])\n\t"
|
||||
:
|
||||
: [m] "r" (m), [ctx] "r" (ctx), [nfin] "m" (ctx->finished)
|
||||
: "rax", "rdx", "r11", "r12", "r13", "r14", "r15", "rbx",
|
||||
"r8", "r9", "r10", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);
|
||||
/* Process multiple blocks (n * 16 bytes) of data.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* m Blocks of message data.
|
||||
* bytes The number of bytes to process.
|
||||
*/
|
||||
POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
|
||||
const unsigned char* m, size_t bytes)
|
||||
{
|
||||
__asm__ __volatile__ (
|
||||
"movq (%[ctx]), %%r15\n\t"
|
||||
"movq 24(%[ctx]), %%r8\n\t"
|
||||
"movq 32(%[ctx]), %%r9\n\t"
|
||||
"movq 40(%[ctx]), %%r10\n"
|
||||
"L_avx_start:\n\t"
|
||||
"# h += m\n\t"
|
||||
"movq (%[m]), %%r11\n\t"
|
||||
"movq 8(%[m]), %%r12\n\t"
|
||||
"addq %%r11, %%r8\n\t"
|
||||
"adcq %%r12, %%r9\n\t"
|
||||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"adcq $0, %%r10\n\t"
|
||||
"# r[1] * h[0] => rdx, rax ==> t2, t1\n\t"
|
||||
"mulq %%r8\n\t"
|
||||
"movq %%rax, %%r12\n\t"
|
||||
"movq %%rdx, %%r13\n\t"
|
||||
"# r[0] * h[1] => rdx, rax ++> t2, t1\n\t"
|
||||
"movq %%r15, %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
"addq %%rax, %%r12\n\t"
|
||||
"movq %%r15, %%rax\n\t"
|
||||
"adcq %%rdx, %%r13\n\t"
|
||||
"# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
|
||||
"mulq %%r8\n\t"
|
||||
"movq %%rax, %%r11\n\t"
|
||||
"movq %%rdx, %%r8\n\t"
|
||||
"# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
|
||||
"movq 8(%[ctx]), %%rax\n\t"
|
||||
"mulq %%r9\n\t"
|
||||
"# r[0] * h[2] +> t2\n\t"
|
||||
"addq 360(%[ctx],%%r10,8), %%r13\n\t"
|
||||
"movq %%rdx, %%r14\n\t"
|
||||
"addq %%r8, %%r12\n\t"
|
||||
"adcq %%rax, %%r13\n\t"
|
||||
"# r[1] * h[2] +> t3\n\t"
|
||||
"adcq 416(%[ctx],%%r10,8), %%r14\n\t"
|
||||
"# r * h in r14, r13, r12, r11 \n\t"
|
||||
"# h = (r * h) mod 2^130 - 5\n\t"
|
||||
"movq %%r13, %%r10\n\t"
|
||||
"andq $-4, %%r13\n\t"
|
||||
"andq $3, %%r10\n\t"
|
||||
"addq %%r13, %%r11\n\t"
|
||||
"movq %%r13, %%r8\n\t"
|
||||
"adcq %%r14, %%r12\n\t"
|
||||
"adcq $0, %%r10\n\t"
|
||||
"shrdq $2, %%r14, %%r8\n\t"
|
||||
"shrq $2, %%r14\n\t"
|
||||
"addq %%r11, %%r8\n\t"
|
||||
"adcq %%r14, %%r12\n\t"
|
||||
"movq %%r12, %%r9\n\t"
|
||||
"adcq $0, %%r10\n\t"
|
||||
"# h in r10, r9, r8 \n\t"
|
||||
"# Next block from message\n\t"
|
||||
"addq $16, %[m]\n\t"
|
||||
"subq $16, %[bytes]\n\t"
|
||||
"cmp $16, %[bytes]\n\t"
|
||||
"jge L_avx_start\n\t"
|
||||
"# Store h to ctx\n\t"
|
||||
"movq %%r8, 24(%[ctx])\n\t"
|
||||
"movq %%r9, 32(%[ctx])\n\t"
|
||||
"movq %%r10, 40(%[ctx])\n\t"
|
||||
: [m] "+r" (m), [bytes] "+r" (bytes)
|
||||
: [ctx] "r" (ctx)
|
||||
: "rax", "rdx", "r11", "r12", "r13", "r14", "r15",
|
||||
"r8", "r9", "r10", "memory"
|
||||
);
|
||||
}
|
||||
|
||||
extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
|
||||
size_t bytes);
|
||||
/* Set the key to use when processing data.
|
||||
* Initialize the context.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* key The key data (16 bytes).
|
||||
*/
|
||||
static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
{
    /* View the key as 64-bit words: words 0-1 give r, words 2-3 the pad. */
    const word64* keyWords = (const word64*)key;
    int           mult;

    /* r is the first 16 bytes of the key with the mask bits cleared. */
    ctx->r[0] = keyWords[0] & 0x0ffffffc0fffffffL;
    ctx->r[1] = keyWords[1] & 0x0ffffffc0ffffffcL;

    /* Table of the multiples 0..6 of r[0] (entries 0-6) ... */
    for (mult = 0; mult < 7; mult++) {
        ctx->hm[mult] = ctx->r[0] * mult;
    }
    /* ... and of r[1] (entries 7-13). */
    for (mult = 0; mult < 7; mult++) {
        ctx->hm[7 + mult] = ctx->r[1] * mult;
    }

    /* Start with a zero accumulator. */
    ctx->h[2] = 0;
    ctx->h[1] = 0;
    ctx->h[0] = 0;

    /* Keep the pad, the second 16 bytes of the key, for later use. */
    ctx->pad[0] = keyWords[2];
    ctx->pad[1] = keyWords[3];

    /* No buffered bytes yet; finished starts out as 1. */
    ctx->leftover = 0;
    ctx->finished = 1;
}
|
||||
|
||||
extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
|
||||
/* Calculate the final result - authentication data.
|
||||
* Zeros out the private data in the context.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* mac Buffer to hold 16 bytes.
|
||||
*/
|
||||
static void poly1305_final_avx(Poly1305* ctx, byte* mac)
|
||||
{
|
||||
word64 h0, h1, h2;
|
||||
|
||||
/* process the remaining block */
|
||||
if (ctx->leftover) {
|
||||
size_t i = ctx->leftover;
|
||||
ctx->buffer[i] = 1;
|
||||
for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
|
||||
ctx->buffer[i] = 0;
|
||||
ctx->finished = 0;
|
||||
poly1305_block_avx(ctx, ctx->buffer);
|
||||
}
|
||||
|
||||
h0 = ctx->h[0];
|
||||
h1 = ctx->h[1];
|
||||
h2 = ctx->h[2];
|
||||
|
||||
/* h %= p */
|
||||
/* h = (h + pad) */
|
||||
__asm__ __volatile__ (
|
||||
"# mod 2^130 - 5\n\t"
|
||||
"movq %[h2], %%r13\n\t"
|
||||
"andq $0x3, %[h2]\n\t"
|
||||
"shrq $0x2, %%r13\n\t"
|
||||
"leaq (%%r13, %%r13, 4), %%r13\n\t"
|
||||
"add %%r13, %[h0]\n\t"
|
||||
"adc $0, %[h1]\n\t"
|
||||
"adc $0, %[h2]\n\t"
|
||||
"# Fixup when between (1 << 130) - 1 and (1 << 130) - 5\n\t"
|
||||
"movq %[h0], %%r13\n\t"
|
||||
"movq %[h1], %%r14\n\t"
|
||||
"movq %[h2], %%r15\n\t"
|
||||
"addq $5, %%r13\n\t"
|
||||
"adcq $0, %%r14\n\t"
|
||||
"adcq $0, %%r15\n\t"
|
||||
"movq %%r15, %%r12\n\t"
|
||||
"andq $3, %%r15\n\t"
|
||||
"cmpq $4, %%r12\n\t"
|
||||
"cmove %%r13, %[h0]\n\t"
|
||||
"cmove %%r14, %[h1]\n\t"
|
||||
"cmove %%r15, %[h2]\n\t"
|
||||
"# h += pad\n\t"
|
||||
"add %[p0], %[h0]\n\t"
|
||||
"adc %[p1], %[h1]\n\t"
|
||||
"movq %[h0], (%[m])\n\t"
|
||||
"movq %[h1], 8(%[m])\n\t"
|
||||
: [h0] "+r" (h0), [h1] "+r" (h1), [h2] "+r" (h2),
|
||||
[p0] "+r" (ctx->pad[0]), [p1] "+r" (ctx->pad[1])
|
||||
: [m] "r" (mac)
|
||||
: "memory", "r15", "r14", "r13", "r12"
|
||||
);
|
||||
|
||||
/* zero out the state */
|
||||
ctx->h[0] = 0;
|
||||
ctx->h[1] = 0;
|
||||
ctx->h[2] = 0;
|
||||
ctx->r[0] = 0;
|
||||
ctx->r[1] = 0;
|
||||
ctx->pad[0] = 0;
|
||||
ctx->pad[1] = 0;
|
||||
}
|
||||
extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_INTEL_AVX2
|
||||
#if defined(_MSC_VER)
|
||||
#define POLY1305_NOINLINE __declspec(noinline)
|
||||
#elif defined(__GNUC__)
|
||||
#define POLY1305_NOINLINE __attribute__((noinline))
|
||||
#else
|
||||
#define POLY1305_NOINLINE
|
||||
#endif
|
||||
|
||||
/* Load H into five 256-bit registers.
|
||||
*
|
||||
* h is the memory location of the data - 26 of 32 bits.
|
||||
* h0-h4 the 4 H values with 26 bits stored in 64 for multiply.
|
||||
*/
|
||||
#define LOAD_H(h, h0, h1, h2, h3, h4) \
|
||||
"vmovdqu ("#h"), "#h0"\n\t" \
|
||||
"vmovdqu 32("#h"), "#h1"\n\t" \
|
||||
"vmovdqu 64("#h"), "#h2"\n\t" \
|
||||
"vmovdqu 96("#h"), "#h3"\n\t" \
|
||||
"vmovdqu 128("#h"), "#h4"\n\t"
|
||||
|
||||
/* Store H, five 256-bit registers, packed.
|
||||
*
|
||||
* h is the memory location of the data - 26 bits in 32.
|
||||
* h0-h4 the 4 H values with 26 bits stored in 64.
|
||||
* x4 is the xmm register of h4 (not referenced by this macro).
|
||||
*/
|
||||
#define STORE_H(h, h0, h1, h2, h3, h4, x4) \
|
||||
"vmovdqu "#h0", ("#h")\n\t" \
|
||||
"vmovdqu "#h1", 32("#h")\n\t" \
|
||||
"vmovdqu "#h2", 64("#h")\n\t" \
|
||||
"vmovdqu "#h3", 96("#h")\n\t" \
|
||||
"vmovdqu "#h4", 128("#h")\n\t"
|
||||
|
||||
/* Load four powers of r into position to be multiplied by the 4 H values.
|
||||
*
|
||||
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
|
||||
* t0-t3 are temporary registers.
|
||||
*/
|
||||
#define LOAD_Rx4(r0, r1, r2, r3, r4, \
|
||||
t0, t1, t2, t3) \
|
||||
"vmovdqu 224(%[ctx]), "#r3"\n\t" \
|
||||
"vmovdqu 256(%[ctx]), "#r2"\n\t" \
|
||||
"vmovdqu 288(%[ctx]), "#r1"\n\t" \
|
||||
"vmovdqu 320(%[ctx]), "#r0"\n\t" \
|
||||
"vpermq $0xd8, "#r0", "#r0"\n\t" \
|
||||
"vpermq $0xd8, "#r1", "#r1"\n\t" \
|
||||
"vpermq $0xd8, "#r2", "#r2"\n\t" \
|
||||
"vpermq $0xd8, "#r3", "#r3"\n\t" \
|
||||
"vpunpcklqdq "#r1", "#r0", "#t0"\n\t" \
|
||||
"vpunpckhqdq "#r1", "#r0", "#t1"\n\t" \
|
||||
"vpunpcklqdq "#r3", "#r2", "#t2"\n\t" \
|
||||
"vpunpckhqdq "#r3", "#r2", "#t3"\n\t" \
|
||||
"vperm2i128 $0x20, "#t2", "#t0", "#r0"\n\t" \
|
||||
"vperm2i128 $0x31, "#t2", "#t0", "#r2"\n\t" \
|
||||
"vperm2i128 $0x20, "#t3", "#t1", "#r4"\n\t" \
|
||||
"vpsrlq $32, "#r0", "#r1"\n\t" \
|
||||
"vpsrlq $32, "#r2", "#r3"\n\t"
|
||||
|
||||
/* Load the r^4 value into position to be multiplied by all 4 H values.
|
||||
*
|
||||
* r4 holds r^4 as five 26 bits each in 32.
|
||||
* r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
|
||||
* t0-t1 are temporary registers.
|
||||
*/
|
||||
#define LOAD_R4(r4, r40, r41, r42, r43, r44, \
|
||||
t0, t1) \
|
||||
"vmovdqu "#r4", "#t0"\n\t" \
|
||||
"vpermq $0x0, "#t0", "#r40"\n\t" \
|
||||
"vpsrlq $32, "#t0", "#t1"\n\t" \
|
||||
"vpermq $0x55, "#t0", "#r42"\n\t" \
|
||||
"vpermq $0xaa, "#t0", "#r44"\n\t" \
|
||||
"vpermq $0x0, "#t1", "#r41"\n\t" \
|
||||
"vpermq $0x55, "#t1", "#r43"\n\t"
|
||||
|
||||
/* Multiply the top 4 26-bit values in 64 bits of each H by 5 for reduction in
|
||||
* multiply.
|
||||
*
|
||||
* s1-s4 receive the values in r1-r4 multiplied by 5, computed as (r << 2) + r.
|
||||
* r1-r4 are the inputs (the top 4 values); they are only read.
|
||||
*/
|
||||
#define MUL5(s1, s2, s3, s4, r1, r2, r3, r4) \
|
||||
"vpslld $2, "#r1", "#s1"\n\t" \
|
||||
"vpslld $2, "#r2", "#s2"\n\t" \
|
||||
"vpslld $2, "#r3", "#s3"\n\t" \
|
||||
"vpslld $2, "#r4", "#s4"\n\t" \
|
||||
"vpaddq "#s1", "#r1", "#s1"\n\t" \
|
||||
"vpaddq "#s2", "#r2", "#s2"\n\t" \
|
||||
"vpaddq "#s3", "#r3", "#s3"\n\t" \
|
||||
"vpaddq "#s4", "#r4", "#s4"\n\t"
|
||||
|
||||
/* Add the 4 H values together.
|
||||
* Each 64 bits in a register is 26 bits of one of the H values.
|
||||
*
|
||||
* h0-h4 contains the 4 H values.
|
||||
* t0-t4 are temporary registers.
|
||||
*/
|
||||
#define FINALIZE_H(h0, h1, h2, h3, h4, \
|
||||
t0, t1, t2, t3, t4) \
|
||||
"vpsrldq $8, "#h0", "#t0"\n\t" \
|
||||
"vpsrldq $8, "#h1", "#t1"\n\t" \
|
||||
"vpsrldq $8, "#h2", "#t2"\n\t" \
|
||||
"vpsrldq $8, "#h3", "#t3"\n\t" \
|
||||
"vpsrldq $8, "#h4", "#t4"\n\t" \
|
||||
"vpaddq "#h0", "#t0", "#h0"\n\t" \
|
||||
"vpaddq "#h1", "#t1", "#h1"\n\t" \
|
||||
"vpaddq "#h2", "#t2", "#h2"\n\t" \
|
||||
"vpaddq "#h3", "#t3", "#h3"\n\t" \
|
||||
"vpaddq "#h4", "#t4", "#h4"\n\t" \
|
||||
"vpermq $0x02, "#h0", "#t0"\n\t" \
|
||||
"vpermq $0x02, "#h1", "#t1"\n\t" \
|
||||
"vpermq $0x02, "#h2", "#t2"\n\t" \
|
||||
"vpermq $0x02, "#h3", "#t3"\n\t" \
|
||||
"vpermq $0x02, "#h4", "#t4"\n\t" \
|
||||
"vpaddq "#h0", "#t0", "#h0"\n\t" \
|
||||
"vpaddq "#h1", "#t1", "#h1"\n\t" \
|
||||
"vpaddq "#h2", "#t2", "#h2"\n\t" \
|
||||
"vpaddq "#h3", "#t3", "#h3"\n\t" \
|
||||
"vpaddq "#h4", "#t4", "#h4"\n\t"
|
||||
|
||||
/* Move 32 bits from each xmm register to a 32 bit register.
|
||||
*
|
||||
* x0-x4 are the xmm version of the ymm registers used.
|
||||
* t0-t4 are the 32-bit registers to store data in.
|
||||
*/
|
||||
#define MOVE_TO_32(x0, x1, x2, x3, x4, \
|
||||
t0, t1, t2, t3, t4) \
|
||||
"vmovd "#x0", "#t0"\n\t" \
|
||||
"vmovd "#x1", "#t1"\n\t" \
|
||||
"vmovd "#x2", "#t2"\n\t" \
|
||||
"vmovd "#x3", "#t3"\n\t" \
|
||||
"vmovd "#x4", "#t4"\n\t"
|
||||
|
||||
/* Multiply using AVX2 instructions.
|
||||
* Each register contains up to 32 bits of data in 64 bits.
|
||||
* This is a 4 way parallel multiply.
|
||||
*
|
||||
* h0-h4 contain 4 H values with the 32 bits of each per register.
|
||||
* r0-r4 contain the 4 powers of r.
|
||||
* s1-s4 contain r1-r4 times 5.
|
||||
* t0-t4 and v0-v3 are temporary registers.
|
||||
*/
|
||||
#define MUL_AVX2(h0, h1, h2, h3, h4, \
|
||||
r0, r1, r2, r3, r4, \
|
||||
s1, s2, s3, s4, \
|
||||
t0, t1, t2, t3, t4, \
|
||||
v0, v1, v2, v3) \
|
||||
"vpmuludq "#s1", "#h4", "#t0"\n\t" \
|
||||
"vpmuludq "#s2", "#h3", "#v0"\n\t" \
|
||||
"vpmuludq "#s2", "#h4", "#t1"\n\t" \
|
||||
"vpmuludq "#s3", "#h3", "#v1"\n\t" \
|
||||
"vpmuludq "#s3", "#h4", "#t2"\n\t" \
|
||||
"vpaddq "#t0", "#v0", "#t0"\n\t" \
|
||||
"vpmuludq "#s3", "#h2", "#v2"\n\t" \
|
||||
"vpmuludq "#s4", "#h4", "#t3"\n\t" \
|
||||
"vpaddq "#t1", "#v1", "#t1"\n\t" \
|
||||
"vpmuludq "#s4", "#h1", "#v3"\n\t" \
|
||||
"vpmuludq "#s4", "#h2", "#v0"\n\t" \
|
||||
"vpaddq "#t0", "#v2", "#t0"\n\t" \
|
||||
"vpmuludq "#s4", "#h3", "#v1"\n\t" \
|
||||
"vpmuludq "#r0", "#h3", "#v2"\n\t" \
|
||||
"vpaddq "#t0", "#v3", "#t0"\n\t" \
|
||||
"vpmuludq "#r0", "#h4", "#t4"\n\t" \
|
||||
"vpaddq "#t1", "#v0", "#t1"\n\t" \
|
||||
"vpmuludq "#r0", "#h0", "#v3"\n\t" \
|
||||
"vpaddq "#t2", "#v1", "#t2"\n\t" \
|
||||
"vpmuludq "#r0", "#h1", "#v0"\n\t" \
|
||||
"vpaddq "#t3", "#v2", "#t3"\n\t" \
|
||||
"vpmuludq "#r0", "#h2", "#v1"\n\t" \
|
||||
"vpmuludq "#r1", "#h2", "#v2"\n\t" \
|
||||
"vpaddq "#t0", "#v3", "#t0"\n\t" \
|
||||
"vpmuludq "#r1", "#h3", "#v3"\n\t" \
|
||||
"vpaddq "#t1", "#v0", "#t1"\n\t" \
|
||||
"vpmuludq "#r1", "#h0", "#v0"\n\t" \
|
||||
"vpaddq "#t2", "#v1", "#t2"\n\t" \
|
||||
"vpmuludq "#r1", "#h1", "#v1"\n\t" \
|
||||
"vpaddq "#t3", "#v2", "#t3"\n\t" \
|
||||
"vpmuludq "#r2", "#h1", "#v2"\n\t" \
|
||||
"vpaddq "#t4", "#v3", "#t4"\n\t" \
|
||||
"vpmuludq "#r2", "#h2", "#v3"\n\t" \
|
||||
"vpaddq "#t1", "#v0", "#t1"\n\t" \
|
||||
"vpmuludq "#r2", "#h0", "#v0"\n\t" \
|
||||
"vpaddq "#t2", "#v1", "#t2"\n\t" \
|
||||
"vpmuludq "#r3", "#h0", "#v1"\n\t" \
|
||||
"vpaddq "#t3", "#v2", "#t3"\n\t" \
|
||||
"vpmuludq "#r3", "#h1", "#v2"\n\t" \
|
||||
"vpaddq "#t4", "#v3", "#t4"\n\t" \
|
||||
"vpmuludq "#r4", "#h0", "#v3"\n\t" \
|
||||
"vpaddq "#t2", "#v0", "#t2"\n\t" \
|
||||
"vpaddq "#t3", "#v1", "#t3"\n\t" \
|
||||
"vpaddq "#t4", "#v2", "#t4"\n\t" \
|
||||
"vpaddq "#t4", "#v3", "#t4"\n\t"
|
||||
|
||||
/* Load 64 bytes of message and spread them over five registers.
 *
 * m      Not substituted anywhere: the load address is always the asm operand
 *        named [m], because "%[m]" is written inside the string literals.
 * m0-m4  Destination registers. m0-m3 receive the message as 32-bit words
 *        widened to 64 bits by interleaving with z; m1, m2 and m3 are then
 *        shifted left by 6, 12 and 18 bits. m4 receives a copy of hi.
 * hi     Operand holding the high bits of the 4 blocks (1 << 128 as this is
 *        not the final block).
 * z      Register holding zero.
 *
 * m0-m3 are also used as scratch while the data is being rearranged.
 */
#define LOAD_M(m, m0, m1, m2, m3, m4, hi, z) \
    /* Fetch the two 32-byte halves and regroup their 128-bit lanes. */ \
    "vmovdqu (%[m]), "#m0"\n\t" \
    "vmovdqu 32(%[m]), "#m1"\n\t" \
    "vperm2i128 $0x20, "#m1", "#m0", "#m2"\n\t" \
    "vperm2i128 $0x31, "#m1", "#m0", "#m0"\n\t" \
    /* Interleave 32-bit words, then widen each to 64 bits using zero. */ \
    "vpunpckldq "#m0", "#m2", "#m1"\n\t" \
    "vpunpckhdq "#m0", "#m2", "#m3"\n\t" \
    "vpunpckldq "#z", "#m1", "#m0"\n\t" \
    "vpunpckhdq "#z", "#m1", "#m1"\n\t" \
    "vpunpckldq "#z", "#m3", "#m2"\n\t" \
    "vpunpckhdq "#z", "#m3", "#m3"\n\t" \
    /* High bits, then position words 1-3 (<< 6, 12, 18). */ \
    "vmovdqu "#hi", "#m4"\n\t" \
    "vpsllq $6, "#m1", "#m1"\n\t" \
    "vpsllq $12, "#m2", "#m2"\n\t" \
    "vpsllq $18, "#m3", "#m3"\n\t"
|
||||
|
||||
|
||||
/* 4-way parallel multiply with AVX2 that accumulates onto fresh message data.
 * Every register carries up to 32 bits of data in each 64-bit element.
 *
 * The first 14 instructions load 64 bytes from the asm operand named [m]
 * (the name is fixed inside the string literals) into t0-t4, using the same
 * instruction sequence as LOAD_M; the products are then added into t0-t4.
 *
 * h0-h4  The 4 H values, 32 bits of each per register.
 * r0-r4  The 4 powers of r.
 * s1-s4  r1-r4 times 5.
 * t0-t4  Receive the message and accumulate the result.
 * v0-v3  Temporaries for the individual products.
 * hi     High bits of the 4 m (1 << 128 as not final block).
 * z      Zero.
 */
#define MUL_ADD_AVX2(h0, h1, h2, h3, h4, \
                     r0, r1, r2, r3, r4, \
                     s1, s2, s3, s4, \
                     t0, t1, t2, t3, t4, \
                     v0, v1, v2, v3, \
                     hi, z) \
    /* Message load into t0-t4. */ \
    "vmovdqu (%[m]), "#t0"\n\t" \
    "vmovdqu 32(%[m]), "#t1"\n\t" \
    "vperm2i128 $0x20, "#t1", "#t0", "#t2"\n\t" \
    "vperm2i128 $0x31, "#t1", "#t0", "#t0"\n\t" \
    "vpunpckldq "#t0", "#t2", "#t1"\n\t" \
    "vpunpckhdq "#t0", "#t2", "#t3"\n\t" \
    "vpunpckldq "#z", "#t1", "#t0"\n\t" \
    "vpunpckhdq "#z", "#t1", "#t1"\n\t" \
    "vpunpckldq "#z", "#t3", "#t2"\n\t" \
    "vpunpckhdq "#z", "#t3", "#t3"\n\t" \
    "vmovdqu "#hi", "#t4"\n\t" \
    "vpsllq $6, "#t1", "#t1"\n\t" \
    "vpsllq $12, "#t2", "#t2"\n\t" \
    "vpsllq $18, "#t3", "#t3"\n\t" \
    /* Products with s1-s4, interleaved with their accumulation. */ \
    "vpmuludq "#s1", "#h4", "#v0"\n\t" \
    "vpaddq "#t0", "#v0", "#t0"\n\t" \
    "vpmuludq "#s2", "#h3", "#v0"\n\t" \
    "vpmuludq "#s2", "#h4", "#v1"\n\t" \
    "vpaddq "#t1", "#v1", "#t1"\n\t" \
    "vpmuludq "#s3", "#h3", "#v1"\n\t" \
    "vpmuludq "#s3", "#h4", "#v2"\n\t" \
    "vpaddq "#t2", "#v2", "#t2"\n\t" \
    "vpaddq "#t0", "#v0", "#t0"\n\t" \
    "vpmuludq "#s3", "#h2", "#v2"\n\t" \
    "vpmuludq "#s4", "#h4", "#v3"\n\t" \
    "vpaddq "#t3", "#v3", "#t3"\n\t" \
    "vpaddq "#t1", "#v1", "#t1"\n\t" \
    "vpmuludq "#s4", "#h1", "#v3"\n\t" \
    "vpmuludq "#s4", "#h2", "#v0"\n\t" \
    "vpaddq "#t0", "#v2", "#t0"\n\t" \
    "vpmuludq "#s4", "#h3", "#v1"\n\t" \
    /* Products with r0. */ \
    "vpmuludq "#r0", "#h3", "#v2"\n\t" \
    "vpaddq "#t0", "#v3", "#t0"\n\t" \
    "vpmuludq "#r0", "#h4", "#v3"\n\t" \
    "vpaddq "#t4", "#v3", "#t4"\n\t" \
    "vpaddq "#t1", "#v0", "#t1"\n\t" \
    "vpmuludq "#r0", "#h0", "#v3"\n\t" \
    "vpaddq "#t2", "#v1", "#t2"\n\t" \
    "vpmuludq "#r0", "#h1", "#v0"\n\t" \
    "vpaddq "#t3", "#v2", "#t3"\n\t" \
    "vpmuludq "#r0", "#h2", "#v1"\n\t" \
    /* Products with r1. */ \
    "vpmuludq "#r1", "#h2", "#v2"\n\t" \
    "vpaddq "#t0", "#v3", "#t0"\n\t" \
    "vpmuludq "#r1", "#h3", "#v3"\n\t" \
    "vpaddq "#t1", "#v0", "#t1"\n\t" \
    "vpmuludq "#r1", "#h0", "#v0"\n\t" \
    "vpaddq "#t2", "#v1", "#t2"\n\t" \
    "vpmuludq "#r1", "#h1", "#v1"\n\t" \
    "vpaddq "#t3", "#v2", "#t3"\n\t" \
    /* Products with r2. */ \
    "vpmuludq "#r2", "#h1", "#v2"\n\t" \
    "vpaddq "#t4", "#v3", "#t4"\n\t" \
    "vpmuludq "#r2", "#h2", "#v3"\n\t" \
    "vpaddq "#t1", "#v0", "#t1"\n\t" \
    "vpmuludq "#r2", "#h0", "#v0"\n\t" \
    "vpaddq "#t2", "#v1", "#t2"\n\t" \
    /* Products with r3 and r4, then the final accumulations. */ \
    "vpmuludq "#r3", "#h0", "#v1"\n\t" \
    "vpaddq "#t3", "#v2", "#t3"\n\t" \
    "vpmuludq "#r3", "#h1", "#v2"\n\t" \
    "vpaddq "#t4", "#v3", "#t4"\n\t" \
    "vpmuludq "#r4", "#h0", "#v3"\n\t" \
    "vpaddq "#t2", "#v0", "#t2"\n\t" \
    "vpaddq "#t3", "#v1", "#t3"\n\t" \
    "vpaddq "#t4", "#v2", "#t4"\n\t" \
    "vpaddq "#t4", "#v3", "#t4"\n\t"
|
||||
|
||||
/* Reduce the 64 bits of data in each element to 26 bits by carrying between
 * the five limbs.
 *
 * h0-h4  Receive the reduced H values.
 * m0-m4  The 4 H values to reduce; modified in place during the carry chain.
 *        The same registers may be passed for h0-h4 and m0-m4.
 * t0-t2  Temporaries.
 * mask   The 26-bit mask for each 64-bit value in the 256-bit register.
 *
 * The carry out of m4 is multiplied by 5 (shift left by 2, plus itself)
 * before being added to m0.
 *
 * The final line deliberately has no trailing line continuation, so the macro
 * ends here regardless of what follows it in the file.
 */
#define REDUCE(h0, h1, h2, h3, h4, \
               m0, m1, m2, m3, m4, \
               t0, t1, t2, mask) \
    /* Carry m0 -> m1 and m3 -> m4. */ \
    "vpsrlq $26, "#m0", "#t0"\n\t" \
    "vpsrlq $26, "#m3", "#t1"\n\t" \
    "vpand "#mask", "#m0", "#m0"\n\t" \
    "vpand "#mask", "#m3", "#m3"\n\t" \
    "vpaddq "#m1", "#t0", "#m1"\n\t" \
    "vpaddq "#m4", "#t1", "#m4"\n\t" \
    \
    /* Carry m1 -> m2; carry out of m4 becomes t2 = 5 * carry. */ \
    "vpsrlq $26, "#m1", "#t0"\n\t" \
    "vpsrlq $26, "#m4", "#t1"\n\t" \
    "vpand "#mask", "#m1", "#h1"\n\t" \
    "vpand "#mask", "#m4", "#h4"\n\t" \
    "vpaddq "#m2", "#t0", "#m2"\n\t" \
    "vpslld $2, "#t1", "#t2"\n\t" \
    "vpaddd "#t2", "#t1", "#t2"\n\t" \
    \
    /* Carry m2 -> m3, fold t2 into m0 and carry m0 -> h1. */ \
    "vpsrlq $26, "#m2", "#t0"\n\t" \
    "vpaddq "#m0", "#t2", "#m0"\n\t" \
    "vpsrlq $26, "#m0", "#t1"\n\t" \
    "vpand "#mask", "#m2", "#h2"\n\t" \
    "vpand "#mask", "#m0", "#h0"\n\t" \
    "vpaddq "#m3", "#t0", "#m3"\n\t" \
    "vpaddq "#h1", "#t1", "#h1"\n\t" \
    \
    /* Carry m3 -> h4. */ \
    "vpsrlq $26, "#m3", "#t0"\n\t" \
    "vpand "#mask", "#m3", "#h3"\n\t" \
    "vpaddq "#h4", "#t0", "#h4"\n\t"
|
||||
|
||||
|
||||
/* Process multiple blocks (n * 16 bytes) of data.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
* m Blocks of message data.
|
||||
* bytes The number of bytes to process.
|
||||
*/
|
||||
POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
                                                   const unsigned char* m,
                                                   size_t bytes)
{
    /* Constants broadcast to all four 64-bit elements of a 256-bit register. */
    static const word64 mask[4] = { 0x0000000003ffffff, 0x0000000003ffffff,
                                    0x0000000003ffffff, 0x0000000003ffffff };
    static const word64 hibit[4] = { 0x1000000, 0x1000000,
                                     0x1000000, 0x1000000 };
    /* Scratch for the powers of r and their multiples of 5; written with
     * aligned stores (vmovdqa) inside the asm. */
    ALIGN256 word64 r4[5][4];
    ALIGN256 word64 s[4][4];
    /* Five 26-bit limbs of the combined H, filled by MOVE_TO_32 on the
     * finishing path only. */
    register word32 t0 asm("r8") = 0;
    register word32 t1 asm("r9") = 0;
    register word32 t2 asm("r10") = 0;
    register word32 t3 asm("r11") = 0;
    register word32 t4 asm("r12") = 0;

    /* The labels below are plain global names, so this statement can only be
     * emitted once per object file. */
    __asm__ __volatile__ (
        "vpxor %%ymm15, %%ymm15, %%ymm15\n\t"
        "cmpb $1, %[started]\n\t"
        "je L_begin\n\t"
        "cmpb $1, %[fin]\n\t"
        "je L_begin\n\t"
        "# Load the message data\n\t"
        LOAD_M(m, %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %[hibit], %%ymm15)
        "vmovdqu %[mask], %%ymm14\n\t"
        "# Reduce, in place, the message data\n\t"
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        "addq $64, %[m]\n\t"
        "subq $64, %[bytes]\n\t"
        "jz L_store\n\t"
        "jmp L_load_r4\n\t"
        "\n"
    "L_begin:\n\t"
        "# Load the H values.\n\t"
        LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4)
        "# Check if there is a power of r to load - otherwise use r^4.\n\t"
        "cmpb $0, %[fin]\n\t"
        "je L_load_r4\n\t"
        "\n\t"
        "# Load the 4 powers of r - r^4, r^3, r^2, r^1.\n\t"
        LOAD_Rx4(%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                 %%ymm10, %%ymm11, %%ymm12, %%ymm13)
        "jmp L_mul_5\n\t"
        "\n"
    "L_load_r4:\n\t"
        "# Load r^4 into all four positions.\n\t"
        LOAD_R4(320(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                %%ymm13, %%ymm14)
        "\n"
    "L_mul_5:\n\t"
        "# Multiply top 4 26-bit values of all four H by 5\n\t"
        MUL5(%%ymm10, %%ymm11, %%ymm12, %%ymm13, %%ymm6, %%ymm7, %%ymm8, %%ymm9)
        "# Store powers of r and multiple of 5 for use in multiply.\n\t"
        "vmovdqa %%ymm10, (%[s])\n\t"
        "vmovdqa %%ymm11, 32(%[s])\n\t"
        "vmovdqa %%ymm12, 64(%[s])\n\t"
        "vmovdqa %%ymm13, 96(%[s])\n\t"
        "vmovdqa %%ymm5 , (%[r4])\n\t"
        "vmovdqa %%ymm6 , 32(%[r4])\n\t"
        "vmovdqa %%ymm7 , 64(%[r4])\n\t"
        "vmovdqa %%ymm8 , 96(%[r4])\n\t"
        "vmovdqa %%ymm9 , 128(%[r4])\n\t"
        "vmovdqu %[mask], %%ymm14\n\t"
        "\n"
        "# If not finished then loop over data\n\t"
        "cmpb $0x1, %[fin]\n\t"
        "jne L_start\n\t"
        "# Do last multiply, reduce, add the four H together and move to\n\t"
        "# 32-bit registers\n\t"
        MUL_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                 (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
                 (%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
                 %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                 %%ymm10, %%ymm11, %%ymm12, %%ymm13)
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        FINALIZE_H(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                   %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9)
        MOVE_TO_32(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4,
                   %[t0], %[t1], %[t2], %[t3], %[t4])
        "jmp L_end\n\t"
        "\n"
    "L_start:\n\t"
        MUL_ADD_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                     (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
                     (%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
                     %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                     %%ymm10, %%ymm11, %%ymm12, %%ymm13,
                     %[hibit], %%ymm15)
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        "addq $64, %[m]\n\t"
        "subq $64, %[bytes]\n\t"
        "jnz L_start\n\t"
        "\n"
    "L_store:\n\t"
        "# Store four H values - state\n\t"
        STORE_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %%xmm4)
        "\n"
    "L_end:\n\t"
        : [m] "+r" (m), [bytes] "+r" (bytes),
          [t0] "+r" (t0), [t1] "+r" (t1), [t2] "+r" (t2),
          [t3] "+r" (t3), [t4] "+r" (t4)
        : [ctx] "r" (ctx), [h] "r" (ctx->hh),
          [r4] "r" (r4), [s] "r" (s),
          [fin] "m" (ctx->finished), [started] "m" (ctx->started),
          [mask] "m" (mask), [hibit] "m" (hibit)
        : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
          "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
          "memory"
    );

    if (ctx->finished) {
        const word64 mask44 = 0xfffffffffff;
        const word64 mask42 = 0x3ffffffffff;
        word64 lo, mid, hi, carry;
        int pass;

        /* Repack the five 26-bit limbs as 44/44/42-bit limbs. */
        lo  = t0 + (((word64)(t1 & 0x3FFFF)) << 26);
        mid = (t1 >> 18) + (((word64)t2) << 8) +
              (((word64)(t3 & 0x3FF)) << 34);
        hi  = (t3 >> 10) + (((word64)t4) << 16);

        /* Modular reduction: two identical carry rounds mid -> hi -> lo -> mid,
         * where the carry out of the top limb re-enters multiplied by 5. */
        for (pass = 0; pass < 2; pass++) {
            carry = mid >> 44;
            mid  &= mask44;
            hi   += carry;
            carry = hi >> 42;
            hi   &= mask42;
            lo   += carry * 5;
            carry = lo >> 44;
            lo   &= mask44;
            mid  += carry;
        }

        /* Convert from 42/44/44 to 2/64/64 bits used and store result. */
        ctx->h[0] = lo | (mid << 44);
        ctx->h[1] = (mid >> 20) | (hi << 24);
        ctx->h[2] = hi >> 40;
    }

    ctx->started = 1;
}
|
||||
|
||||
/* Multiply two 130-bit numbers in 64-bit registers and reduce.
 *   44 + 44 + 42 = 130 bits
 *
 * r0-r2  First operand (lvalues); overwritten with the result.
 * a0-a2  Second operand.
 *
 * Uses, without declaring them, the caller's variables s1, s2, c (word64) and
 * d0, d1, d2, d (word128), plus the MUL/ADD/ADDLO/SHR/LO helper macros.
 * Arguments are evaluated more than once, so pass plain variables only.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement, e.g. under an unbraced if.
 */
#define MUL_64(r0, r1, r2, a0, a1, a2)                                         \
    do {                                                                       \
        s1 = (a1) * (5 << 2);                                                  \
        s2 = (a2) * (5 << 2);                                                  \
        MUL(d0, r0, a0); MUL(d, r1, s2); ADD(d0, d);                           \
                         MUL(d, r2, s1); ADD(d0, d);                           \
        MUL(d1, r0, a1); MUL(d, r1, a0); ADD(d1, d);                           \
                         MUL(d, r2, s2); ADD(d1, d);                           \
        MUL(d2, r0, a2); MUL(d, r1, a1); ADD(d2, d);                           \
                         MUL(d, r2, a0); ADD(d2, d);                           \
                                                                               \
        /* Carry through the three limbs, folding the top carry back *5. */    \
                      c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff;            \
        ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff;            \
        ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff;            \
        r0 += c * 5;  c = (r0 >> 44);  r0 = r0 & 0xfffffffffff;                \
        r1 += c;                                                               \
    } while (0)
|
||||
|
||||
/* Square a 130-bit number held in 44/44/42-bit limbs and reduce.
 *
 * r0-r2  Operand (lvalues); overwritten with the result.
 *
 * Uses, without declaring them, the caller's variables s2, c (word64) and
 * d0, d1, d2, d (word128), plus the MUL/ADD/ADDLO/SHR/LO helper macros.
 * Each cross product is computed once and doubled by adding it to itself.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement, e.g. under an unbraced if.
 */
#define SQR_64(r0, r1, r2)                                                     \
    do {                                                                       \
        s2 = (r2) * (5 << 2);                                                  \
        MUL(d0, r1, s2); ADD(d0, d0); MUL(d, r0, r0); ADD(d0, d);              \
        MUL(d1, r0, r1); ADD(d1, d1); MUL(d, r2, s2); ADD(d1, d);              \
        MUL(d2, r0, r2); ADD(d2, d2); MUL(d, r1, r1); ADD(d2, d);              \
                                                                               \
        /* Carry through the three limbs, folding the top carry back *5. */    \
                      c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff;            \
        ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff;            \
        ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff;            \
        r0 += c * 5;  c = (r0 >> 44);  r0 = r0 & 0xfffffffffff;                \
        r1 += c;                                                               \
    } while (0)
|
||||
|
||||
/* Store the 130-bit number in 64-bit registers as 26-bit values in 32 bits.
 *
 * r0-r2  The 130-bit number as 44/44/42-bit limbs in 64-bit values.
 * r      Where to store the five results; must be indexable with [0]..[4].
 *
 * Elements 0-3 are masked to 26 bits; element 4 takes whatever remains of r2
 * above bit 16. Arguments are evaluated more than once.
 *
 * The body is wrapped in do { } while (0) and the arguments are parenthesised
 * so the macro is safe as a single statement and with expression arguments.
 */
#define CONV_64_TO_32(r0, r1, r2, r)                                           \
    do {                                                                       \
        (r)[0] = (word32)( (r0)                       ) & 0x3ffffff;           \
        (r)[1] = (word32)(((r0) >> 26) | ((r1) << 18)) & 0x3ffffff;            \
        (r)[2] = (word32)( (r1) >> 8                  ) & 0x3ffffff;           \
        (r)[3] = (word32)(((r1) >> 34) | ((r2) << 10)) & 0x3ffffff;            \
        (r)[4] = (word32)( (r2) >> 16                 );                       \
    } while (0)
|
||||
|
||||
extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
|
||||
size_t bytes);
|
||||
/* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
|
||||
*
|
||||
* ctx Poly1305 context.
|
||||
*/
|
||||
static void poly1305_calc_powers(Poly1305* ctx)
{
    /* r, r^2 and a working value, each as 44/44/42-bit limbs. */
    word64 r0, r1, r2;
    word64 sq0, sq1, sq2;
    word64 w0, w1, w2;
    /* Scratch names required by SQR_64 / MUL_64. */
    word64 s1, s2, c;
    word128 d0, d1, d2, d;
    const word64 lo = ctx->r[0];
    const word64 hi = ctx->r[1];

    /* Split the two 64-bit words of r into 44/44/36-bit pieces. */
    r0 = lo & 0xfffffffffff;
    r1 = ((lo >> 44) | (hi << 20)) & 0xfffffffffff;
    r2 = (hi >> 24) & 0x00fffffffff;

    /* r^1 */
    CONV_64_TO_32(r0, r1, r2, ctx->r1);

    /* r^2 = r * r */
    sq0 = r0;
    sq1 = r1;
    sq2 = r2;
    SQR_64(sq0, sq1, sq2);
    CONV_64_TO_32(sq0, sq1, sq2, ctx->r2);

    /* r^3 = r^2 * r */
    w0 = sq0;
    w1 = sq1;
    w2 = sq2;
    MUL_64(w0, w1, w2, r0, r1, r2);
    CONV_64_TO_32(w0, w1, w2, ctx->r3);

    /* r^4 = r^2 * r^2 */
    w0 = sq0;
    w1 = sq1;
    w2 = sq2;
    SQR_64(w0, w1, w2);
    CONV_64_TO_32(w0, w1, w2, ctx->r4);
}
|
||||
|
||||
extern void poly1305_calc_powers_avx2(Poly1305* ctx);
|
||||
/* Set the key to use when processing data.
|
||||
* Initialize the context.
|
||||
* Calls AVX set key function as final function calls AVX code.
|
||||
@ -946,27 +174,7 @@ static void poly1305_calc_powers(Poly1305* ctx)
|
||||
* ctx Poly1305 context.
|
||||
* key The key data (16 bytes).
|
||||
*/
|
||||
static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
|
||||
{
|
||||
poly1305_setkey_avx(ctx, key);
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
|
||||
"vmovdqu %%ymm0, (%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 32(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 64(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 96(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 128(%[hh])\n\t"
|
||||
:
|
||||
: [hh] "r" (ctx->hh)
|
||||
: "memory", "ymm0"
|
||||
);
|
||||
|
||||
ctx->leftover = 0;
|
||||
ctx->finished = 0;
|
||||
ctx->started = 0;
|
||||
}
|
||||
|
||||
extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
|
||||
/* Calculate the final result - authentication data.
|
||||
* Zeros out the private data in the context.
|
||||
* Calls AVX final function to quickly process last blocks.
|
||||
@ -974,46 +182,11 @@ static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
|
||||
* ctx Poly1305 context.
|
||||
* mac Buffer to hold 16 bytes - authentication data.
|
||||
*/
|
||||
static void poly1305_final_avx2(Poly1305* ctx, byte* mac)
|
||||
{
|
||||
int i, j;
|
||||
int l = (int)ctx->leftover;
|
||||
extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
|
||||
#endif
|
||||
|
||||
ctx->finished = 1;
|
||||
if (ctx->started)
|
||||
poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4);
|
||||
|
||||
i = l & ~(POLY1305_BLOCK_SIZE - 1);
|
||||
if (i > 0)
|
||||
poly1305_blocks_avx(ctx, ctx->buffer, i);
|
||||
ctx->leftover -= i;
|
||||
for (j = 0; i < l; i++, j++)
|
||||
ctx->buffer[j] = ctx->buffer[i];
|
||||
|
||||
poly1305_final_avx(ctx, mac);
|
||||
|
||||
/* zero out the state */
|
||||
__asm__ __volatile__ (
|
||||
"vpxor %%ymm0, %%ymm0, %%ymm0\n\t"
|
||||
"vmovdqu %%ymm0, (%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 32(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 64(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 96(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, 128(%[hh])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r1])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r2])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r3])\n\t"
|
||||
"vmovdqu %%ymm0, (%[r4])\n\t"
|
||||
:
|
||||
: [hh] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2),
|
||||
[r3] "r" (ctx->r3), [r4] "r" (ctx->r4)
|
||||
: "memory", "ymm0"
|
||||
);
|
||||
|
||||
ctx->leftover = 0;
|
||||
ctx->finished = 0;
|
||||
ctx->started = 0;
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#elif defined(POLY130564)
|
||||
@ -1511,7 +684,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
|
||||
return 0;
|
||||
|
||||
if (!ctx->started)
|
||||
poly1305_calc_powers(ctx);
|
||||
poly1305_calc_powers_avx2(ctx);
|
||||
poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
|
||||
ctx->leftover = 0;
|
||||
}
|
||||
@ -1521,7 +694,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
|
||||
size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
|
||||
|
||||
if (!ctx->started)
|
||||
poly1305_calc_powers(ctx);
|
||||
poly1305_calc_powers_avx2(ctx);
|
||||
poly1305_blocks_avx2(ctx, m, want);
|
||||
m += want;
|
||||
bytes -= (word32)want;
|
||||
|
986
wolfcrypt/src/poly1305_asm.S
Normal file
986
wolfcrypt/src/poly1305_asm.S
Normal file
@ -0,0 +1,986 @@
|
||||
/* poly1305_asm
|
||||
*
|
||||
* Copyright (C) 2006-2018 wolfSSL Inc.
|
||||
*
|
||||
* This file is part of wolfSSL.
|
||||
*
|
||||
* wolfSSL is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* wolfSSL is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
||||
*/
|
||||
|
||||
#ifndef HAVE_INTEL_AVX1
|
||||
#define HAVE_INTEL_AVX1
|
||||
#endif /* HAVE_INTEL_AVX1 */
|
||||
#ifndef HAVE_INTEL_AVX2
|
||||
#define HAVE_INTEL_AVX2
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
||||
|
||||
#ifdef HAVE_INTEL_AVX1
|
||||
/* void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
 *
 * ABI:  System V AMD64, leaf function.
 * In:   rdi = ctx, rsi = key (32 bytes are read).
 * Clobbers: rax, rcx, rdx, r8-r11, flags.
 *
 * Context bytes written (offsets from rdi):
 *     0,   8    key words 0 and 1 after masking (r)
 *    24 -  40   three zeroed quadwords (h)
 *    48,  56    key words 2 and 3 unchanged (pad)
 *   352 - 400   0*r[0] .. 6*r[0], one quadword each
 *   408 - 456   0*r[1] .. 6*r[1], one quadword each
 *   608         zero quadword
 *   616         byte set to 1
 */
.globl  poly1305_setkey_avx
.type   poly1305_setkey_avx,@function
.align  4
poly1305_setkey_avx:
        # Read the whole key before touching the context.
        movq    (%rsi), %rdx
        movq    8(%rsi), %rax
        movq    16(%rsi), %rcx
        movq    24(%rsi), %r8
        # Mask the first two words to form r.
        movabsq $0xffffffc0fffffff, %r10
        movabsq $0xffffffc0ffffffc, %r11
        andq    %r10, %rdx
        andq    %r11, %rax
        xorl    %r9d, %r9d
        # r, zeroed h, pad
        movq    %rdx, (%rdi)
        movq    %rax, 8(%rdi)
        movq    %r9, 24(%rdi)
        movq    %r9, 32(%rdi)
        movq    %r9, 40(%rdi)
        movq    %rcx, 48(%rdi)
        movq    %r8, 56(%rdi)
        # Table of k * r[0] for k = 0..6, built by repeated addition.
        movq    %rdx, %r10
        movq    %r9, 352(%rdi)
        movq    %rdx, 360(%rdi)
        addq    %rdx, %r10
        movq    %r10, 368(%rdi)
        addq    %rdx, %r10
        movq    %r10, 376(%rdi)
        addq    %rdx, %r10
        movq    %r10, 384(%rdi)
        addq    %rdx, %r10
        movq    %r10, 392(%rdi)
        addq    %rdx, %r10
        movq    %r10, 400(%rdi)
        # Table of k * r[1] for k = 0..6.
        movq    %rax, %r11
        movq    %r9, 408(%rdi)
        movq    %rax, 416(%rdi)
        addq    %rax, %r11
        movq    %r11, 424(%rdi)
        addq    %rax, %r11
        movq    %r11, 432(%rdi)
        addq    %rax, %r11
        movq    %r11, 440(%rdi)
        addq    %rax, %r11
        movq    %r11, 448(%rdi)
        addq    %rax, %r11
        movq    %r11, 456(%rdi)
        # Trailing state: quadword at 608 cleared, byte at 616 set.
        movq    %r9, 608(%rdi)
        movb    $0x01, 616(%rdi)
        repz retq
.size   poly1305_setkey_avx,.-poly1305_setkey_avx
|
||||
/* void poly1305_block_avx(Poly1305* ctx, const unsigned char* m)
 *
 * Absorb one 16-byte block: h = ((h + m + flag * 2^128) * r) mod 2^130 - 5,
 * where flag is the byte at ctx + 616.
 *
 * ABI:  System V AMD64.
 * In:   rdi = ctx, rsi = m (16 bytes are read).
 * Clobbers: rax, rdx, r8-r11, flags. rdi and rsi are left unchanged.
 *
 * Register roles:
 *   r15, rbx        r[0], r[1]
 *   r8, r9, r10     h[0], h[1], h[2]
 *   r11-r14         t0-t3, the product before reduction
 * The products r[0]*h[2] and r[1]*h[2] are looked up in the tables at
 * ctx + 352 and ctx + 408, indexed by h[2].
 */
.globl  poly1305_block_avx
.type   poly1305_block_avx,@function
.align  4
poly1305_block_avx:
        pushq   %r15
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        movq    (%rdi), %r15
        movq    8(%rdi), %rbx
        movq    24(%rdi), %r8
        movq    32(%rdi), %r9
        movq    40(%rdi), %r10
        movzbl  616(%rdi), %r14d
        # h += m
        addq    (%rsi), %r8
        adcq    8(%rsi), %r9
        movq    %rbx, %rax
        adcq    %r14, %r10
        # r[1] * h[0] => rdx, rax ==> t2, t1
        mulq    %r8
        movq    %rax, %r12
        movq    %rdx, %r13
        # r[0] * h[1] => rdx, rax ++> t2, t1
        movq    %r15, %rax
        mulq    %r9
        addq    %rax, %r12
        movq    %r15, %rax
        adcq    %rdx, %r13
        # r[0] * h[0] => rdx, rax ==> t4, t0
        mulq    %r8
        movq    %rax, %r11
        movq    %rdx, %r8
        # r[1] * h[1] => rdx, rax =+> t3, t2
        movq    %rbx, %rax
        mulq    %r9
        # r[0] * h[2] +> t2
        addq    352(%rdi,%r10,8), %r13
        movq    %rdx, %r14
        addq    %r8, %r12
        adcq    %rax, %r13
        # r[1] * h[2] +> t3
        adcq    408(%rdi,%r10,8), %r14
        # r * h in r14, r13, r12, r11
        # h = (r * h) mod 2^130 - 5
        movq    %r13, %r10
        andq    $-4, %r13
        andq    $3, %r10
        addq    %r13, %r11
        movq    %r13, %r8
        adcq    %r14, %r12
        adcq    $0x00, %r10
        shrdq   $2, %r14, %r8
        shrq    $2, %r14
        addq    %r11, %r8
        adcq    %r14, %r12
        movq    %r12, %r9
        adcq    $0x00, %r10
        # h in r10, r9, r8
        # Store h to ctx
        movq    %r8, 24(%rdi)
        movq    %r9, 32(%rdi)
        movq    %r10, 40(%rdi)
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        popq    %r15
        repz retq
.size   poly1305_block_avx,.-poly1305_block_avx
|
||||
/* void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
 *                          size_t bytes)
 *
 * Absorb consecutive 16-byte blocks. The loop body runs before the count is
 * tested, so one block is always processed, and a trailing partial block
 * would be read as a full 16 bytes: callers should pass a non-zero multiple
 * of 16.
 *
 * ABI:  System V AMD64.
 * In:   rdi = ctx, rsi = m, rdx = bytes.
 * Clobbers: rax, rcx, rdx, rsi, r8-r11, flags. rdi is left unchanged.
 *
 * Register roles:
 *   r15, rbx        r[0], r[1]
 *   r8, r9, r10     h[0], h[1], h[2]
 *   r11-r14         t0-t3, the product before reduction
 *   rcx             bytes remaining (rdx is needed by mulq)
 * The h[2] products are read from ctx + 360 and ctx + 416 indexed by h[2],
 * i.e. one entry above the 352 / 408 table bases, and nothing is added to
 * h[2] beyond the carry. NOTE(review): presumably this is how the 2^128 bit
 * of every full block is accounted for - confirm.
 *
 * The loop test is unsigned (ja) because the byte count is a size_t.
 */
.globl  poly1305_blocks_avx
.type   poly1305_blocks_avx,@function
.align  4
poly1305_blocks_avx:
        pushq   %r15
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        movq    %rdx, %rcx
        movq    (%rdi), %r15
        movq    8(%rdi), %rbx
        movq    24(%rdi), %r8
        movq    32(%rdi), %r9
        movq    40(%rdi), %r10
L_poly1305_avx_blocks_start:
        # h += m
        movq    (%rsi), %r11
        movq    8(%rsi), %r12
        addq    %r11, %r8
        adcq    %r12, %r9
        movq    %rbx, %rax
        adcq    $0x00, %r10
        # r[1] * h[0] => rdx, rax ==> t2, t1
        mulq    %r8
        movq    %rax, %r12
        movq    %rdx, %r13
        # r[0] * h[1] => rdx, rax ++> t2, t1
        movq    %r15, %rax
        mulq    %r9
        addq    %rax, %r12
        movq    %r15, %rax
        adcq    %rdx, %r13
        # r[0] * h[0] => rdx, rax ==> t4, t0
        mulq    %r8
        movq    %rax, %r11
        movq    %rdx, %r8
        # r[1] * h[1] => rdx, rax =+> t3, t2
        movq    %rbx, %rax
        mulq    %r9
        # r[0] * h[2] +> t2
        addq    360(%rdi,%r10,8), %r13
        movq    %rdx, %r14
        addq    %r8, %r12
        adcq    %rax, %r13
        # r[1] * h[2] +> t3
        adcq    416(%rdi,%r10,8), %r14
        # r * h in r14, r13, r12, r11
        # h = (r * h) mod 2^130 - 5
        movq    %r13, %r10
        andq    $-4, %r13
        andq    $3, %r10
        addq    %r13, %r11
        movq    %r13, %r8
        adcq    %r14, %r12
        adcq    $0x00, %r10
        shrdq   $2, %r14, %r8
        shrq    $2, %r14
        addq    %r11, %r8
        adcq    %r14, %r12
        movq    %r12, %r9
        adcq    $0x00, %r10
        # h in r10, r9, r8
        # Next block from message
        addq    $16, %rsi
        subq    $16, %rcx
        ja      L_poly1305_avx_blocks_start     # continue while more than 16 bytes were left
        # Store h to ctx
        movq    %r8, 24(%rdi)
        movq    %r9, 32(%rdi)
        movq    %r10, 40(%rdi)
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        popq    %r15
        repz retq
.size   poly1305_blocks_avx,.-poly1305_blocks_avx
|
||||
/* void poly1305_final_avx(Poly1305* ctx, byte* mac)
 *
 * Process any buffered partial block, reduce h fully, add the pad, write the
 * 16-byte result to mac, then zero r, h and pad in the context.
 *
 * ABI:  System V AMD64.
 * In:   rdi = ctx, rsi = mac.
 * Clobbers: rax, rcx, rdx, rsi, r8-r11, flags.
 *
 * Context fields used (offsets from rdi):
 *   608   count of buffered bytes; expected to be below 16
 *   480   buffer; a 0x01 byte and zero padding are appended in place
 *   616   flag byte, cleared so poly1305_block_avx adds no 2^128 bit
 *   24-40 h,  48-56 pad,  0-8 r
 *
 * Stack: three pushes on entry leave rsp 16-byte aligned at the call, as the
 * ABI requires. The third slot holds ctx, which is reloaded after the call
 * rather than relying on the callee leaving the volatile rdi untouched.
 */
.globl  poly1305_final_avx
.type   poly1305_final_avx,@function
.align  4
poly1305_final_avx:
        pushq   %rbx
        pushq   %r12
        pushq   %rdi                            # save ctx; aligns rsp for the call
        movq    %rsi, %rbx                      # rbx = mac, kept across the call
        movq    608(%rdi), %rax
        testq   %rax, %rax
        je      L_poly1305_avx_final_no_more
        # Append the 0x01 terminator, then zero-fill up to 16 bytes.
        movb    $0x01, 480(%rdi,%rax,1)
        jmp     L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
        movb    $0x00, 480(%rdi,%rax,1)
L_poly1305_avx_final_cmp_rem:
        incb    %al
        cmpq    $16, %rax
        jl      L_poly1305_avx_final_zero_rem
        movb    $0x00, 616(%rdi)
        leaq    480(%rdi), %rsi
        callq   poly1305_block_avx@plt
        movq    (%rsp), %rdi                    # restore ctx
L_poly1305_avx_final_no_more:
        movq    24(%rdi), %rax
        movq    32(%rdi), %rdx
        movq    40(%rdi), %rcx
        movq    48(%rdi), %r11
        movq    56(%rdi), %r12
        # h = h mod 2^130 - 5: fold the bits above 2^130 back in, times 5
        movq    %rcx, %r8
        andq    $3, %rcx
        shrq    $2, %r8
        # Multiply by 5
        leaq    0(%r8,%r8,4), %r8
        addq    %r8, %rax
        adcq    $0x00, %rdx
        adcq    $0x00, %rcx
        # Fixup when between (1 << 130) - 1 and (1 << 130) - 5
        movq    %rax, %r8
        movq    %rdx, %r9
        movq    %rcx, %r10
        addq    $5, %r8
        adcq    $0x00, %r9
        adcq    $0x00, %r10
        cmpq    $4, %r10
        cmoveq  %r8, %rax
        cmoveq  %r9, %rdx
        # h += pad
        addq    %r11, %rax
        adcq    %r12, %rdx
        movq    %rax, (%rbx)
        movq    %rdx, 8(%rbx)
        # Zero out r
        movq    $0x00, (%rdi)
        movq    $0x00, 8(%rdi)
        # Zero out h
        movq    $0x00, 24(%rdi)
        movq    $0x00, 32(%rdi)
        movq    $0x00, 40(%rdi)
        # Zero out pad
        movq    $0x00, 48(%rdi)
        movq    $0x00, 56(%rdi)
        addq    $8, %rsp                        # drop the saved ctx slot
        popq    %r12
        popq    %rbx
        repz retq
.size   poly1305_final_avx,.-poly1305_final_avx
|
||||
#endif /* HAVE_INTEL_AVX1 */
|
||||
#ifdef HAVE_INTEL_AVX2
|
||||
/* void poly1305_calc_powers_avx2(Poly1305* ctx)
 *
 * Compute r^1, r^2, r^3 and r^4 (mod 2^130 - 5) from the 128-bit r stored at
 * ctx + 0 and ctx + 8. Each power is written as five 26-bit limbs in 32-bit
 * words followed by a zero word:
 *     r^1 at ctx + 224,  r^2 at ctx + 256,
 *     r^3 at ctx + 288,  r^4 at ctx + 320.
 *
 * ABI:  System V AMD64, leaf function.
 * In:   rdi = ctx.
 * Clobbers: rax, rcx, rdx, rsi, r8-r11, flags.
 *
 * Register roles:
 *   rcx, r8          r (two 64-bit words); reused for r^4 at the end
 *   r10, r11, r12    r^2 (64 + 64 + 2 bits), live until r^4 is formed
 *   r13, r14, r15    r^3
 *   rax, rdx, rsi, rbx, rbp   scratch and the five limbs during conversion
 *
 * Every reduction masks the top word to 2 bits after folding the bits at and
 * above 2^130 back in (times 5). The r^2 word in r12 is used as a multiplier
 * for r^3 and r^4, so it must not keep an already-folded bit.
 */
.globl  poly1305_calc_powers_avx2
.type   poly1305_calc_powers_avx2,@function
.align  4
poly1305_calc_powers_avx2:
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbx
        pushq   %rbp
        movq    (%rdi), %rcx
        movq    8(%rdi), %r8
        xorq    %r9, %r9
        # Convert to 26 bits in 32
        movq    %rcx, %rax
        movq    %rcx, %rdx
        movq    %rcx, %rsi
        movq    %r8, %rbx
        movq    %r8, %rbp
        shrq    $26, %rdx
        shrdq   $52, %r8, %rsi
        shrq    $14, %rbx
        shrdq   $40, %r9, %rbp
        andq    $0x3ffffff, %rax
        andq    $0x3ffffff, %rdx
        andq    $0x3ffffff, %rsi
        andq    $0x3ffffff, %rbx
        andq    $0x3ffffff, %rbp
        movl    %eax, 224(%rdi)
        movl    %edx, 228(%rdi)
        movl    %esi, 232(%rdi)
        movl    %ebx, 236(%rdi)
        movl    %ebp, 240(%rdi)
        movl    $0x00, 244(%rdi)
        # Square 128-bit
        movq    %r8, %rax
        mulq    %rcx
        xorq    %r13, %r13
        movq    %rax, %r11
        movq    %rdx, %r12
        addq    %rax, %r11
        adcq    %rdx, %r12
        adcq    $0x00, %r13
        movq    %rcx, %rax
        mulq    %rax
        movq    %rax, %r10
        movq    %rdx, %r15
        movq    %r8, %rax
        mulq    %rax
        addq    %r15, %r11
        adcq    %rax, %r12
        adcq    %rdx, %r13
        # Reduce 256-bit to 130-bit
        movq    %r12, %rax
        movq    %r13, %rdx
        andq    $-4, %rax
        andq    $3, %r12
        addq    %rax, %r10
        adcq    %rdx, %r11
        adcq    $0x00, %r12
        shrdq   $2, %rdx, %rax
        shrq    $2, %rdx
        addq    %rax, %r10
        adcq    %rdx, %r11
        adcq    $0x00, %r12
        movq    %r12, %rax
        andq    $3, %r12                        # keep bits 128-129 only; the rest is folded below
        shrq    $2, %rax
        leaq    0(%rax,%rax,4), %rax
        addq    %rax, %r10
        adcq    $0x00, %r11
        adcq    $0x00, %r12
        # Convert to 26 bits in 32
        movq    %r10, %rax
        movq    %r10, %rdx
        movq    %r10, %rsi
        movq    %r11, %rbx
        movq    %r11, %rbp
        shrq    $26, %rdx
        shrdq   $52, %r11, %rsi
        shrq    $14, %rbx
        shrdq   $40, %r12, %rbp
        andq    $0x3ffffff, %rax
        andq    $0x3ffffff, %rdx
        andq    $0x3ffffff, %rsi
        andq    $0x3ffffff, %rbx
        andq    $0x3ffffff, %rbp
        movl    %eax, 256(%rdi)
        movl    %edx, 260(%rdi)
        movl    %esi, 264(%rdi)
        movl    %ebx, 268(%rdi)
        movl    %ebp, 272(%rdi)
        movl    $0x00, 276(%rdi)
        # Multiply 128-bit by 130-bit
        # r1[0] * r2[0]
        movq    %rcx, %rax
        mulq    %r10
        movq    %rax, %r13
        movq    %rdx, %r14
        # r1[0] * r2[1]
        movq    %rcx, %rax
        mulq    %r11
        movq    $0x00, %r15
        addq    %rax, %r14
        adcq    %rdx, %r15
        # r1[1] * r2[0]
        movq    %r8, %rax
        mulq    %r10
        movq    $0x00, %rsi
        addq    %rax, %r14
        adcq    %rdx, %r15
        adcq    $0x00, %rsi
        # r1[0] * r2[2]
        movq    %rcx, %rax
        mulq    %r12
        addq    %rax, %r15
        adcq    %rdx, %rsi
        # r1[1] * r2[1]
        movq    %r8, %rax
        mulq    %r11
        movq    $0x00, %rbx
        addq    %rax, %r15
        adcq    %rdx, %rsi
        adcq    $0x00, %rbx
        # r1[1] * r2[2]
        movq    %r8, %rax
        mulq    %r12
        addq    %rax, %rsi
        adcq    %rdx, %rbx
        # Reduce 260-bit to 130-bit
        movq    %r15, %rax
        movq    %rsi, %rdx
        andq    $-4, %rax
        andq    $3, %r15
        addq    %rax, %r13
        adcq    %rdx, %r14
        adcq    %rbx, %r15
        shrdq   $2, %rdx, %rax
        shrdq   $2, %rbx, %rdx
        shrq    $2, %rbx
        addq    %rax, %r13
        adcq    %rdx, %r14
        adcq    %rbx, %r15
        movq    %r15, %rax
        andq    $3, %r15
        shrq    $2, %rax
        leaq    0(%rax,%rax,4), %rax
        addq    %rax, %r13
        adcq    $0x00, %r14
        adcq    $0x00, %r15
        # Convert to 26 bits in 32
        movq    %r13, %rax
        movq    %r13, %rdx
        movq    %r13, %rsi
        movq    %r14, %rbx
        movq    %r14, %rbp
        shrq    $26, %rdx
        shrdq   $52, %r14, %rsi
        shrq    $14, %rbx
        shrdq   $40, %r15, %rbp
        andq    $0x3ffffff, %rax
        andq    $0x3ffffff, %rdx
        andq    $0x3ffffff, %rsi
        andq    $0x3ffffff, %rbx
        andq    $0x3ffffff, %rbp
        movl    %eax, 288(%rdi)
        movl    %edx, 292(%rdi)
        movl    %esi, 296(%rdi)
        movl    %ebx, 300(%rdi)
        movl    %ebp, 304(%rdi)
        movl    $0x00, 308(%rdi)
        # Square 130-bit
        movq    %r11, %rax
        mulq    %r10
        xorq    %r13, %r13
        movq    %rax, %r8
        movq    %rdx, %r9
        addq    %rax, %r8
        adcq    %rdx, %r9
        adcq    $0x00, %r13
        movq    %r10, %rax
        mulq    %rax
        movq    %rax, %rcx
        movq    %rdx, %r15
        movq    %r11, %rax
        mulq    %rax
        addq    %r15, %r8
        adcq    %rax, %r9
        adcq    %rdx, %r13
        movq    %r12, %rax
        mulq    %rax
        movq    %rax, %r14
        movq    %r12, %rax
        mulq    %r10
        addq    %rax, %r9
        adcq    %rdx, %r13
        adcq    $0x00, %r14
        addq    %rax, %r9
        adcq    %rdx, %r13
        adcq    $0x00, %r14
        movq    %r12, %rax
        mulq    %r11
        addq    %rax, %r13
        adcq    %rdx, %r14
        addq    %rax, %r13
        adcq    %rdx, %r14
        # Reduce 260-bit to 130-bit
        movq    %r9, %rax
        movq    %r13, %rdx
        movq    %r14, %r15
        andq    $-4, %rax
        andq    $3, %r9
        addq    %rax, %rcx
        adcq    %rdx, %r8
        adcq    %r15, %r9
        shrdq   $2, %rdx, %rax
        shrdq   $2, %r15, %rdx
        shrq    $2, %r15
        addq    %rax, %rcx
        adcq    %rdx, %r8
        adcq    %r15, %r9
        movq    %r9, %rax
        andq    $3, %r9
        shrq    $2, %rax
        leaq    0(%rax,%rax,4), %rax
        addq    %rax, %rcx
        adcq    $0x00, %r8
        adcq    $0x00, %r9
        # Convert to 26 bits in 32
        movq    %rcx, %rax
        movq    %rcx, %rdx
        movq    %rcx, %rsi
        movq    %r8, %rbx
        movq    %r8, %rbp
        shrq    $26, %rdx
        shrdq   $52, %r8, %rsi
        shrq    $14, %rbx
        shrdq   $40, %r9, %rbp
        andq    $0x3ffffff, %rax
        andq    $0x3ffffff, %rdx
        andq    $0x3ffffff, %rsi
        andq    $0x3ffffff, %rbx
        andq    $0x3ffffff, %rbp
        movl    %eax, 320(%rdi)
        movl    %edx, 324(%rdi)
        movl    %esi, 328(%rdi)
        movl    %ebx, 332(%rdi)
        movl    %ebp, 336(%rdi)
        movl    $0x00, 340(%rdi)
        popq    %rbp
        popq    %rbx
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        repz retq
.size   poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
|
||||
/*
 * poly1305_setkey_avx2
 *
 * Forwards its incoming argument registers unchanged to
 * poly1305_setkey_avx, then clears the AVX2-specific part of the context:
 *   64..223   five 32-byte accumulator vectors
 *   608       64-bit counter
 *   616/617   two flag bytes (cleared together by one 16-bit store)
 *
 * In:     %rdi = context pointer (remaining arguments are whatever
 *         poly1305_setkey_avx expects; they are passed through untouched).
 * Clobb:  %ymm0, plus anything poly1305_setkey_avx clobbers.
 *
 * The context pointer is pushed around the call for two reasons:
 *   - on entry %rsp is 8 mod 16; the single push restores the 16-byte
 *     alignment the System V ABI requires at a call site (the call goes
 *     through the PLT, so the dynamic resolver may run on this stack);
 *   - %rdi is a caller-saved register, so the callee is allowed to change
 *     it, and it is needed again for the stores below.
 */
.globl	poly1305_setkey_avx2
.type	poly1305_setkey_avx2,@function
.align	4
poly1305_setkey_avx2:
	pushq	%rdi
	callq	poly1305_setkey_avx@plt
	popq	%rdi
	# Zero the five accumulator vectors.
	vpxor	%ymm0, %ymm0, %ymm0
	vmovdqu	%ymm0, 64(%rdi)
	vmovdqu	%ymm0, 96(%rdi)
	vmovdqu	%ymm0, 128(%rdi)
	vmovdqu	%ymm0, 160(%rdi)
	vmovdqu	%ymm0, 192(%rdi)
	# Reset the counter and both flag bytes.
	movq	$0x00, 608(%rdi)
	movw	$0x00, 616(%rdi)
	# Leave no dirty upper YMM state behind for SSE code in the caller.
	vzeroupper
	repz retq
.size	poly1305_setkey_avx2,.-poly1305_setkey_avx2
|
||||
/*
 * Four-lane constants for poly1305_blocks_avx2, one 64-bit value per lane.
 *   mask  : 0x3ffffff, the low 26 bits of a lane (limb mask).
 *   hibit : 0x1000000 (bit 24), loaded as the initial value of the fifth
 *           limb vector of each group of message blocks.
 * Both tables start on a 32-byte boundary.
 */
.balign	32
L_poly1305_avx2_blocks_mask:
.quad	0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff
.balign	32
L_poly1305_avx2_blocks_hibit:
.quad	0x1000000, 0x1000000, 0x1000000, 0x1000000
|
||||
/*
 * poly1305_blocks_avx2
 *
 * Absorbs message data 64 bytes (four 16-byte blocks) per iteration. The
 * accumulator is kept as five vectors H0..H4 of 26-bit limbs, one 64-bit
 * lane per block position.
 *
 * In:
 *   %rdi  context pointer (never modified here)
 *   %rsi  message data; not read when the flag byte at 616 is 1
 *   %rdx  byte count; the loop subtracts 0x40 per pass and stops only on
 *         exactly zero, so it has to be a non-zero multiple of 64
 *
 * Context fields used (byte offsets from %rdi):
 *   24/32/40   accumulator as three 64-bit words (written on the last call)
 *   64..223    H0..H4, 32 bytes each
 *   224..351   stored powers of r, 32 bytes apart (read only)
 *   616        flag byte, set by poly1305_final_avx2: 1 selects the final
 *              multiply-and-fold path
 *   617        flag byte, set to 1 here before returning
 *
 * Paths:
 *   616 and 617 both zero  - first call: the first 64 bytes become H
 *                            directly; remaining data goes through the loop.
 *   616 zero, 617 non-zero - load H, multiply-accumulate every 64 bytes by
 *                            the power loaded from 320(%rdi).
 *   616 equal to 1         - last call: multiply the four lanes by the four
 *                            stored powers, add the lanes together and pack
 *                            the result into 24/32/40(%rdi).
 *
 * Stack: 0x140 bytes, from which a 32-byte aligned scratch area is carved.
 *   (%rcx) + 0..159   r limbs 0..4, one vector each
 *   (%rbx) + 0..127   5 * r limbs 1..4        (%rbx = %rcx + 0xa0)
 *
 * Clobb: %rax, %rcx, %rdx, %rsi, %r8-%r11, %ymm0-%ymm15, flags.
 *        %rbx and %r12 are saved and restored.
 */
.globl	poly1305_blocks_avx2
.type	poly1305_blocks_avx2,@function
.align	4
poly1305_blocks_avx2:
	pushq	%r12
	pushq	%rbx
	subq	$0x140, %rsp
	# Round up to the next 32-byte boundary inside the reserved area.
	movq	%rsp, %rcx
	andq	$-32, %rcx
	addq	$32, %rcx
	vpxor	%ymm15, %ymm15, %ymm15
	movq	%rcx, %rbx
	leaq	64(%rdi), %rax
	addq	$0xa0, %rbx
	# One 16-bit compare tests both flag bytes (616 and 617) at once.
	cmpw	$0x00, 616(%rdi)
	jne	L_poly1305_avx2_blocks_begin_h

	# Load the message data
	vmovdqu	(%rsi), %ymm0
	vmovdqu	32(%rsi), %ymm1
	vperm2i128	$32, %ymm1, %ymm0, %ymm2
	vperm2i128	$49, %ymm1, %ymm0, %ymm0
	vpunpckldq	%ymm0, %ymm2, %ymm1
	vpunpckhdq	%ymm0, %ymm2, %ymm3
	vpunpckldq	%ymm15, %ymm1, %ymm0
	vpunpckhdq	%ymm15, %ymm1, %ymm1
	vpunpckldq	%ymm15, %ymm3, %ymm2
	vpunpckhdq	%ymm15, %ymm3, %ymm3
	vmovdqu	L_poly1305_avx2_blocks_hibit(%rip), %ymm4
	vpsllq	$6, %ymm1, %ymm1
	vpsllq	$12, %ymm2, %ymm2
	vpsllq	$18, %ymm3, %ymm3
	vmovdqu	L_poly1305_avx2_blocks_mask(%rip), %ymm14
	# Reduce, in place, the message data
	vpsrlq	$26, %ymm0, %ymm10
	vpsrlq	$26, %ymm3, %ymm11
	vpand	%ymm14, %ymm0, %ymm0
	vpand	%ymm14, %ymm3, %ymm3
	vpaddq	%ymm1, %ymm10, %ymm1
	vpaddq	%ymm4, %ymm11, %ymm4
	vpsrlq	$26, %ymm1, %ymm10
	vpsrlq	$26, %ymm4, %ymm11
	vpand	%ymm14, %ymm1, %ymm1
	vpand	%ymm14, %ymm4, %ymm4
	vpaddq	%ymm2, %ymm10, %ymm2
	# Carry out of the top limb re-enters the bottom limb times 5.
	vpslld	$2, %ymm11, %ymm12
	vpaddd	%ymm12, %ymm11, %ymm12
	vpsrlq	$26, %ymm2, %ymm10
	vpaddq	%ymm0, %ymm12, %ymm0
	vpsrlq	$26, %ymm0, %ymm11
	vpand	%ymm14, %ymm2, %ymm2
	vpand	%ymm14, %ymm0, %ymm0
	vpaddq	%ymm3, %ymm10, %ymm3
	vpaddq	%ymm1, %ymm11, %ymm1
	vpsrlq	$26, %ymm3, %ymm10
	vpand	%ymm14, %ymm3, %ymm3
	vpaddq	%ymm4, %ymm10, %ymm4
	addq	$0x40, %rsi
	subq	$0x40, %rdx
	jz	L_poly1305_avx2_blocks_store
	jmp	L_poly1305_avx2_blocks_load_r4

L_poly1305_avx2_blocks_begin_h:
	# Load the H values.
	vmovdqu	(%rax), %ymm0
	vmovdqu	32(%rax), %ymm1
	vmovdqu	64(%rax), %ymm2
	vmovdqu	96(%rax), %ymm3
	vmovdqu	128(%rax), %ymm4
	# Check if there is a power of r to load - otherwise use r^4.
	cmpb	$0x00, 616(%rdi)
	je	L_poly1305_avx2_blocks_load_r4
	# Load the 4 powers of r - r^4, r^3, r^2, r^1.
	vmovdqu	224(%rdi), %ymm8
	vmovdqu	256(%rdi), %ymm7
	vmovdqu	288(%rdi), %ymm6
	vmovdqu	320(%rdi), %ymm5
	# Transpose so that each vector holds one limb of all four powers.
	vpermq	$0xd8, %ymm5, %ymm5
	vpermq	$0xd8, %ymm6, %ymm6
	vpermq	$0xd8, %ymm7, %ymm7
	vpermq	$0xd8, %ymm8, %ymm8
	vpunpcklqdq	%ymm6, %ymm5, %ymm10
	vpunpckhqdq	%ymm6, %ymm5, %ymm11
	vpunpcklqdq	%ymm8, %ymm7, %ymm12
	vpunpckhqdq	%ymm8, %ymm7, %ymm13
	vperm2i128	$32, %ymm12, %ymm10, %ymm5
	vperm2i128	$49, %ymm12, %ymm10, %ymm7
	vperm2i128	$32, %ymm13, %ymm11, %ymm9
	vpsrlq	$32, %ymm5, %ymm6
	vpsrlq	$32, %ymm7, %ymm8
	jmp	L_poly1305_avx2_blocks_mul_5

L_poly1305_avx2_blocks_load_r4:
	# Load r^4 into all four positions.
	vmovdqu	320(%rdi), %ymm13
	vpermq	$0x00, %ymm13, %ymm5
	vpsrlq	$32, %ymm13, %ymm14
	vpermq	$0x55, %ymm13, %ymm7
	vpermq	$0xaa, %ymm13, %ymm9
	vpermq	$0x00, %ymm14, %ymm6
	vpermq	$0x55, %ymm14, %ymm8

L_poly1305_avx2_blocks_mul_5:
	# Multiply top 4 26-bit values of all four H by 5
	vpslld	$2, %ymm6, %ymm10
	vpslld	$2, %ymm7, %ymm11
	vpslld	$2, %ymm8, %ymm12
	vpslld	$2, %ymm9, %ymm13
	vpaddq	%ymm10, %ymm6, %ymm10
	vpaddq	%ymm11, %ymm7, %ymm11
	vpaddq	%ymm12, %ymm8, %ymm12
	vpaddq	%ymm13, %ymm9, %ymm13
	# Store powers of r and multiple of 5 for use in multiply.
	vmovdqa	%ymm10, (%rbx)
	vmovdqa	%ymm11, 32(%rbx)
	vmovdqa	%ymm12, 64(%rbx)
	vmovdqa	%ymm13, 96(%rbx)
	vmovdqa	%ymm5, (%rcx)
	vmovdqa	%ymm6, 32(%rcx)
	vmovdqa	%ymm7, 64(%rcx)
	vmovdqa	%ymm8, 96(%rcx)
	vmovdqa	%ymm9, 128(%rcx)
	# The load_r4 path used ymm14 as a temporary, so reload the limb mask.
	vmovdqu	L_poly1305_avx2_blocks_mask(%rip), %ymm14
	# Loop over the data unless this is the last call.
	cmpb	$0x01, 616(%rdi)
	jne	L_poly1305_avx2_blocks_start

	# Do last multiply, reduce, add the four H together and move to
	# 32-bit registers
	vpmuludq	(%rbx), %ymm4, %ymm5
	vpmuludq	32(%rbx), %ymm3, %ymm10
	vpmuludq	32(%rbx), %ymm4, %ymm6
	vpmuludq	64(%rbx), %ymm3, %ymm11
	vpmuludq	64(%rbx), %ymm4, %ymm7
	vpaddq	%ymm5, %ymm10, %ymm5
	vpmuludq	64(%rbx), %ymm2, %ymm12
	vpmuludq	96(%rbx), %ymm4, %ymm8
	vpaddq	%ymm6, %ymm11, %ymm6
	vpmuludq	96(%rbx), %ymm1, %ymm13
	vpmuludq	96(%rbx), %ymm2, %ymm10
	vpaddq	%ymm5, %ymm12, %ymm5
	vpmuludq	96(%rbx), %ymm3, %ymm11
	vpmuludq	(%rcx), %ymm3, %ymm12
	vpaddq	%ymm5, %ymm13, %ymm5
	vpmuludq	(%rcx), %ymm4, %ymm9
	vpaddq	%ymm6, %ymm10, %ymm6
	vpmuludq	(%rcx), %ymm0, %ymm13
	vpaddq	%ymm7, %ymm11, %ymm7
	vpmuludq	(%rcx), %ymm1, %ymm10
	vpaddq	%ymm8, %ymm12, %ymm8
	vpmuludq	(%rcx), %ymm2, %ymm11
	vpmuludq	32(%rcx), %ymm2, %ymm12
	vpaddq	%ymm5, %ymm13, %ymm5
	vpmuludq	32(%rcx), %ymm3, %ymm13
	vpaddq	%ymm6, %ymm10, %ymm6
	vpmuludq	32(%rcx), %ymm0, %ymm10
	vpaddq	%ymm7, %ymm11, %ymm7
	vpmuludq	32(%rcx), %ymm1, %ymm11
	vpaddq	%ymm8, %ymm12, %ymm8
	vpmuludq	64(%rcx), %ymm1, %ymm12
	vpaddq	%ymm9, %ymm13, %ymm9
	vpmuludq	64(%rcx), %ymm2, %ymm13
	vpaddq	%ymm6, %ymm10, %ymm6
	vpmuludq	64(%rcx), %ymm0, %ymm10
	vpaddq	%ymm7, %ymm11, %ymm7
	vpmuludq	96(%rcx), %ymm0, %ymm11
	vpaddq	%ymm8, %ymm12, %ymm8
	vpmuludq	96(%rcx), %ymm1, %ymm12
	vpaddq	%ymm9, %ymm13, %ymm9
	vpaddq	%ymm7, %ymm10, %ymm7
	vpmuludq	128(%rcx), %ymm0, %ymm13
	vpaddq	%ymm8, %ymm11, %ymm8
	vpaddq	%ymm9, %ymm12, %ymm9
	vpaddq	%ymm9, %ymm13, %ymm9
	# Carry-propagate the five product limbs back towards 26 bits.
	vpsrlq	$26, %ymm5, %ymm10
	vpsrlq	$26, %ymm8, %ymm11
	vpand	%ymm14, %ymm5, %ymm5
	vpand	%ymm14, %ymm8, %ymm8
	vpaddq	%ymm6, %ymm10, %ymm6
	vpaddq	%ymm9, %ymm11, %ymm9
	vpsrlq	$26, %ymm6, %ymm10
	vpsrlq	$26, %ymm9, %ymm11
	vpand	%ymm14, %ymm6, %ymm1
	vpand	%ymm14, %ymm9, %ymm4
	vpaddq	%ymm7, %ymm10, %ymm7
	vpslld	$2, %ymm11, %ymm12
	vpaddd	%ymm12, %ymm11, %ymm12
	vpsrlq	$26, %ymm7, %ymm10
	vpaddq	%ymm5, %ymm12, %ymm5
	vpsrlq	$26, %ymm5, %ymm11
	vpand	%ymm14, %ymm7, %ymm2
	vpand	%ymm14, %ymm5, %ymm0
	vpaddq	%ymm8, %ymm10, %ymm8
	vpaddq	%ymm1, %ymm11, %ymm1
	vpsrlq	$26, %ymm8, %ymm10
	vpand	%ymm14, %ymm8, %ymm3
	vpaddq	%ymm4, %ymm10, %ymm4
	# Add the four lanes of every limb vector into lane 0.
	vpsrldq	$8, %ymm0, %ymm5
	vpsrldq	$8, %ymm1, %ymm6
	vpsrldq	$8, %ymm2, %ymm7
	vpsrldq	$8, %ymm3, %ymm8
	vpsrldq	$8, %ymm4, %ymm9
	vpaddq	%ymm0, %ymm5, %ymm0
	vpaddq	%ymm1, %ymm6, %ymm1
	vpaddq	%ymm2, %ymm7, %ymm2
	vpaddq	%ymm3, %ymm8, %ymm3
	vpaddq	%ymm4, %ymm9, %ymm4
	vpermq	$2, %ymm0, %ymm5
	vpermq	$2, %ymm1, %ymm6
	vpermq	$2, %ymm2, %ymm7
	vpermq	$2, %ymm3, %ymm8
	vpermq	$2, %ymm4, %ymm9
	vpaddq	%ymm0, %ymm5, %ymm0
	vpaddq	%ymm1, %ymm6, %ymm1
	vpaddq	%ymm2, %ymm7, %ymm2
	vpaddq	%ymm3, %ymm8, %ymm3
	vpaddq	%ymm4, %ymm9, %ymm4
	# The 32-bit destination zero-extends, so r8..r12 hold clean limbs.
	vmovd	%xmm0, %r8d
	vmovd	%xmm1, %r9d
	vmovd	%xmm2, %r10d
	vmovd	%xmm3, %r11d
	vmovd	%xmm4, %r12d
	jmp	L_poly1305_avx2_blocks_end_calc

L_poly1305_avx2_blocks_start:
	# Load the next 64 bytes and split them into 26-bit limb vectors.
	vmovdqu	(%rsi), %ymm5
	vmovdqu	32(%rsi), %ymm6
	vperm2i128	$32, %ymm6, %ymm5, %ymm7
	vperm2i128	$49, %ymm6, %ymm5, %ymm5
	vpunpckldq	%ymm5, %ymm7, %ymm6
	vpunpckhdq	%ymm5, %ymm7, %ymm8
	vpunpckldq	%ymm15, %ymm6, %ymm5
	vpunpckhdq	%ymm15, %ymm6, %ymm6
	vpunpckldq	%ymm15, %ymm8, %ymm7
	vpunpckhdq	%ymm15, %ymm8, %ymm8
	vmovdqu	L_poly1305_avx2_blocks_hibit(%rip), %ymm9
	vpsllq	$6, %ymm6, %ymm6
	vpsllq	$12, %ymm7, %ymm7
	vpsllq	$18, %ymm8, %ymm8
	# Multiply H by the stored power and add onto the message limbs.
	vpmuludq	(%rbx), %ymm4, %ymm10
	vpaddq	%ymm5, %ymm10, %ymm5
	vpmuludq	32(%rbx), %ymm3, %ymm10
	vpmuludq	32(%rbx), %ymm4, %ymm11
	vpaddq	%ymm6, %ymm11, %ymm6
	vpmuludq	64(%rbx), %ymm3, %ymm11
	vpmuludq	64(%rbx), %ymm4, %ymm12
	vpaddq	%ymm7, %ymm12, %ymm7
	vpaddq	%ymm5, %ymm10, %ymm5
	vpmuludq	64(%rbx), %ymm2, %ymm12
	vpmuludq	96(%rbx), %ymm4, %ymm13
	vpaddq	%ymm8, %ymm13, %ymm8
	vpaddq	%ymm6, %ymm11, %ymm6
	vpmuludq	96(%rbx), %ymm1, %ymm13
	vpmuludq	96(%rbx), %ymm2, %ymm10
	vpaddq	%ymm5, %ymm12, %ymm5
	vpmuludq	96(%rbx), %ymm3, %ymm11
	vpmuludq	(%rcx), %ymm3, %ymm12
	vpaddq	%ymm5, %ymm13, %ymm5
	vpmuludq	(%rcx), %ymm4, %ymm13
	vpaddq	%ymm9, %ymm13, %ymm9
	vpaddq	%ymm6, %ymm10, %ymm6
	vpmuludq	(%rcx), %ymm0, %ymm13
	vpaddq	%ymm7, %ymm11, %ymm7
	vpmuludq	(%rcx), %ymm1, %ymm10
	vpaddq	%ymm8, %ymm12, %ymm8
	vpmuludq	(%rcx), %ymm2, %ymm11
	vpmuludq	32(%rcx), %ymm2, %ymm12
	vpaddq	%ymm5, %ymm13, %ymm5
	vpmuludq	32(%rcx), %ymm3, %ymm13
	vpaddq	%ymm6, %ymm10, %ymm6
	vpmuludq	32(%rcx), %ymm0, %ymm10
	vpaddq	%ymm7, %ymm11, %ymm7
	vpmuludq	32(%rcx), %ymm1, %ymm11
	vpaddq	%ymm8, %ymm12, %ymm8
	vpmuludq	64(%rcx), %ymm1, %ymm12
	vpaddq	%ymm9, %ymm13, %ymm9
	vpmuludq	64(%rcx), %ymm2, %ymm13
	vpaddq	%ymm6, %ymm10, %ymm6
	vpmuludq	64(%rcx), %ymm0, %ymm10
	vpaddq	%ymm7, %ymm11, %ymm7
	vpmuludq	96(%rcx), %ymm0, %ymm11
	vpaddq	%ymm8, %ymm12, %ymm8
	vpmuludq	96(%rcx), %ymm1, %ymm12
	vpaddq	%ymm9, %ymm13, %ymm9
	vpaddq	%ymm7, %ymm10, %ymm7
	vpmuludq	128(%rcx), %ymm0, %ymm13
	vpaddq	%ymm8, %ymm11, %ymm8
	vpaddq	%ymm9, %ymm12, %ymm9
	vpaddq	%ymm9, %ymm13, %ymm9
	# Carry-propagate; the results land back in ymm0..ymm4 (H0..H4).
	vpsrlq	$26, %ymm5, %ymm10
	vpsrlq	$26, %ymm8, %ymm11
	vpand	%ymm14, %ymm5, %ymm5
	vpand	%ymm14, %ymm8, %ymm8
	vpaddq	%ymm6, %ymm10, %ymm6
	vpaddq	%ymm9, %ymm11, %ymm9
	vpsrlq	$26, %ymm6, %ymm10
	vpsrlq	$26, %ymm9, %ymm11
	vpand	%ymm14, %ymm6, %ymm1
	vpand	%ymm14, %ymm9, %ymm4
	vpaddq	%ymm7, %ymm10, %ymm7
	vpslld	$2, %ymm11, %ymm12
	vpaddd	%ymm12, %ymm11, %ymm12
	vpsrlq	$26, %ymm7, %ymm10
	vpaddq	%ymm5, %ymm12, %ymm5
	vpsrlq	$26, %ymm5, %ymm11
	vpand	%ymm14, %ymm7, %ymm2
	vpand	%ymm14, %ymm5, %ymm0
	vpaddq	%ymm8, %ymm10, %ymm8
	vpaddq	%ymm1, %ymm11, %ymm1
	vpsrlq	$26, %ymm8, %ymm10
	vpand	%ymm14, %ymm8, %ymm3
	vpaddq	%ymm4, %ymm10, %ymm4
	addq	$0x40, %rsi
	subq	$0x40, %rdx
	jnz	L_poly1305_avx2_blocks_start

L_poly1305_avx2_blocks_store:
	# Store four H values - state
	vmovdqu	%ymm0, (%rax)
	vmovdqu	%ymm1, 32(%rax)
	vmovdqu	%ymm2, 64(%rax)
	vmovdqu	%ymm3, 96(%rax)
	vmovdqu	%ymm4, 128(%rax)

L_poly1305_avx2_blocks_end_calc:
	# Only the last call (flag at 616 set) has limbs in r8..r12 to pack.
	cmpb	$0x00, 616(%rdi)
	je	L_poly1305_avx2_blocks_complete
	# Pack limbs h0..h4 (r8..r12) into three 64-bit words:
	#   rax = h0 + (h1 << 26) + (h2 << 52)              low 64 bits
	#   rdx = (h2 >> 12) + (h3 << 14) + (h4 << 40)      plus carry from rax
	#   rcx = (h4 >> 24)                                plus carry from rdx
	movq	%r8, %rax
	movq	%r10, %rdx
	movq	%r12, %rcx
	shrq	$12, %rdx
	shrq	$24, %rcx
	shlq	$26, %r9
	shlq	$52, %r10
	shlq	$14, %r11
	shlq	$40, %r12
	addq	%r9, %rax
	adcq	%r10, %rax
	# This must be an add-with-carry: adding (h2 << 52) above overflows the
	# low word whenever it reaches 2^64, and that carry belongs in rdx.
	adcq	%r11, %rdx
	adcq	%r12, %rdx
	adcq	$0x00, %rcx
	# Fold everything above bit 130 back into the bottom, times 5.
	movq	%rcx, %r8
	andq	$3, %rcx
	shrq	$2, %r8
	leaq	0(%r8,%r8,4), %r8
	addq	%r8, %rax
	adcq	$0x00, %rdx
	adcq	$0x00, %rcx
	movq	%rax, 24(%rdi)
	movq	%rdx, 32(%rdi)
	movq	%rcx, 40(%rdi)

L_poly1305_avx2_blocks_complete:
	movb	$0x01, 617(%rdi)
	# Leave no dirty upper YMM state behind for SSE code in the caller.
	vzeroupper
	addq	$0x140, %rsp
	popq	%rbx
	popq	%r12
	repz retq
.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
|
||||
/*
 * poly1305_final_avx2
 *
 * Finishes a computation started with the AVX2 block routine:
 *   1. sets the flag byte at 616 and, if the flag byte at 617 shows that
 *      poly1305_blocks_avx2 has run, calls it once more (NULL data,
 *      length 0x40) so that it takes its final multiply-and-pack path;
 *   2. passes the whole 16-byte blocks of the buffered bytes at 480(%rdi)
 *      (count at 608(%rdi)) to poly1305_blocks_avx;
 *   3. moves the remaining buffered bytes to the front of the buffer and
 *      reduces the count accordingly;
 *   4. calls poly1305_final_avx with the original %rdi/%rsi;
 *   5. zeroes context bytes 64..351, the count at 608 and both flag bytes.
 *
 * In:     %rdi = context pointer
 *         %rsi = second argument, kept intact for poly1305_final_avx
 * Clobb:  %rax, %rcx, %rdx, %r8, %ymm0, flags, plus whatever the called
 *         routines clobber.
 *
 * Stack alignment: on entry %rsp is 8 mod 16. The first call is made after
 * one push and the second after three, so both are 16-byte aligned. The
 * call to poly1305_final_avx is wrapped in a push/pop of %rdi, which aligns
 * the stack for that call and keeps the context pointer for the wipe.
 */
.globl	poly1305_final_avx2
.type	poly1305_final_avx2,@function
.align	4
poly1305_final_avx2:
	movb	$0x01, 616(%rdi)
	movb	617(%rdi), %cl
	cmpb	$0x00, %cl
	je	L_poly1305_avx2_final_done_blocks_X4
	# poly1305_blocks_avx2 leaves %rdi untouched; only %rsi needs saving.
	pushq	%rsi
	movq	$0x40, %rdx
	xorq	%rsi, %rsi
	callq	poly1305_blocks_avx2@plt
	popq	%rsi

L_poly1305_avx2_final_done_blocks_X4:
	# Split the buffered count into whole 16-byte blocks (rcx) and total (rax).
	movq	608(%rdi), %rax
	movq	%rax, %rcx
	andq	$-16, %rcx
	cmpb	$0x00, %cl
	je	L_poly1305_avx2_final_done_blocks
	pushq	%rcx
	pushq	%rax
	pushq	%rsi
	movq	%rcx, %rdx
	leaq	480(%rdi), %rsi
	# NOTE(review): %rdi is used again after this call without being saved,
	# so poly1305_blocks_avx is assumed to preserve it - confirm.
	callq	poly1305_blocks_avx@plt
	popq	%rsi
	popq	%rax
	popq	%rcx

L_poly1305_avx2_final_done_blocks:
	subq	%rcx, 608(%rdi)
	# Copy the bytes from index rcx up to rax down to the buffer start.
	xorq	%rdx, %rdx
	jmp	L_poly1305_avx2_final_cmp_copy

L_poly1305_avx2_final_start_copy:
	movb	480(%rdi,%rcx,1), %r8b
	movb	%r8b, 480(%rdi,%rdx,1)
	incb	%cl
	incb	%dl

L_poly1305_avx2_final_cmp_copy:
	cmp	%rcx, %rax
	jne	L_poly1305_avx2_final_start_copy

	# The push both aligns %rsp to 16 bytes for the call and preserves the
	# caller-saved context pointer.
	pushq	%rdi
	callq	poly1305_final_avx@plt
	popq	%rdi

	# Wipe the accumulator vectors and the stored powers of r.
	vpxor	%ymm0, %ymm0, %ymm0
	vmovdqu	%ymm0, 64(%rdi)
	vmovdqu	%ymm0, 96(%rdi)
	vmovdqu	%ymm0, 128(%rdi)
	vmovdqu	%ymm0, 160(%rdi)
	vmovdqu	%ymm0, 192(%rdi)
	vmovdqu	%ymm0, 224(%rdi)
	vmovdqu	%ymm0, 256(%rdi)
	vmovdqu	%ymm0, 288(%rdi)
	vmovdqu	%ymm0, 320(%rdi)
	movq	$0x00, 608(%rdi)
	movw	$0x00, 616(%rdi)
	# Leave no dirty upper YMM state behind for SSE code in the caller.
	vzeroupper
	repz retq
.size	poly1305_final_avx2,.-poly1305_final_avx2
|
||||
#endif /* HAVE_INTEL_AVX2 */
|
File diff suppressed because it is too large
Load Diff
22437
wolfcrypt/src/sha256_asm.S
Normal file
22437
wolfcrypt/src/sha256_asm.S
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
10545
wolfcrypt/src/sha512_asm.S
Normal file
10545
wolfcrypt/src/sha512_asm.S
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
25934
wolfcrypt/src/sp_x86_64_asm.S
Normal file
25934
wolfcrypt/src/sp_x86_64_asm.S
Normal file
File diff suppressed because it is too large
Load Diff
@ -56,8 +56,8 @@
|
||||
#include <wolfssl/wolfcrypt/sha512.h>
|
||||
#include <wolfssl/wolfcrypt/arc4.h>
|
||||
|
||||
#if defined(WC_NO_RNG) && defined(USE_FAST_MATH)
|
||||
#include <wolfssl/wolfcrypt/tfm.h>
|
||||
#if defined(WC_NO_RNG)
|
||||
#include <wolfssl/wolfcrypt/integer.h>
|
||||
#else
|
||||
#include <wolfssl/wolfcrypt/random.h>
|
||||
#endif
|
||||
|
@ -135,9 +135,6 @@ typedef struct wc_Sha256 {
|
||||
word32 loLen; /* length in bytes */
|
||||
word32 hiLen; /* length in bytes */
|
||||
void* heap;
|
||||
#ifdef USE_INTEL_SPEEDUP
|
||||
const byte* data;
|
||||
#endif
|
||||
#ifdef WOLFSSL_PIC32MZ_HASH
|
||||
hashUpdCache cache; /* cache for updates */
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user