Poly1305 Intel Assembly code - AVX and AVX2

This commit is contained in:
Sean Parkinson 2017-07-25 15:45:43 +10:00
parent 108f6a4958
commit 5c2736f1a9
4 changed files with 1252 additions and 113 deletions

View File

@ -404,10 +404,12 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
byte* output; byte* output;
word32 i; word32 i;
word32 cnt = 0; word32 cnt = 0;
static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL }; static const word64 add[2] = { 0x0000000100000000UL,0x0000000300000002UL };
static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL }; static const word64 four[2] = { 0x0000000400000004UL,0x0000000400000004UL };
static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; static const word64 rotl8[2] =
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const word64 rotl16[2] =
{ 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
if (bytes == 0) if (bytes == 0)
return; return;
@ -632,8 +634,8 @@ static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c,
: [bytes] "+r" (bytes), [cnt] "+r" (cnt), : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
[in] "+r" (m), [out] "+r" (c) [in] "+r" (m), [out] "+r" (c)
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
[add] "xrm" (add), [four] "xrm" (four), [add] "m" (add), [four] "m" (four),
[rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) [rotl8] "m" (rotl8), [rotl16] "m" (rotl16)
: "xmm0", "xmm1", "xmm2", "xmm3", : "xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm8", "xmm9", "xmm10", "xmm11",
@ -669,14 +671,17 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
byte* output; byte* output;
word32 i; word32 i;
word32 cnt = 0; word32 cnt = 0;
static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL, static const word64 add[4] = { 0x0000000100000000UL, 0x0000000300000002UL,
0x0000000500000004UL,0x0000000700000006UL }; 0x0000000500000004UL, 0x0000000700000006UL };
static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL, static const word64 eight[4] =
0x0000000800000008UL,0x0000000800000008UL }; { 0x0000000800000008UL, 0x0000000800000008UL,
static const __m256i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL, 0x0000000800000008UL, 0x0000000800000008UL };
0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; static const word64 rotl8[4] =
static const __m256i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL, { 0x0605040702010003UL, 0x0e0d0c0f0a09080bUL,
0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; 0x0605040702010003UL, 0x0e0d0c0f0a09080bUL };
static const word64 rotl16[4] =
{ 0x0504070601000302UL, 0x0d0c0f0e09080b0aUL,
0x0504070601000302UL, 0x0d0c0f0e09080b0aUL };
if (bytes == 0) if (bytes == 0)
return; return;
@ -917,8 +922,8 @@ static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
: [bytes] "+r" (bytes), [cnt] "+r" (cnt), : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
[in] "+r" (m), [out] "+r" (c) [in] "+r" (m), [out] "+r" (c)
: [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
[add] "rm" (add), [eight] "rm" (eight), [add] "m" (add), [eight] "m" (eight),
[rotl8] "rm" (rotl8), [rotl16] "rm" (rotl16) [rotl8] "m" (rotl8), [rotl16] "m" (rotl16)
: "ymm0", "ymm1", "ymm2", "ymm3", : "ymm0", "ymm1", "ymm2", "ymm3",
"ymm4", "ymm5", "ymm6", "ymm7", "ymm4", "ymm5", "ymm6", "ymm7",
"ymm8", "ymm9", "ymm10", "ymm11", "ymm8", "ymm9", "ymm10", "ymm11",

File diff suppressed because it is too large Load Diff

View File

@ -3188,7 +3188,9 @@ int poly1305_test(void)
byte tag[16]; byte tag[16];
Poly1305 enc; Poly1305 enc;
static const byte msg[] = static const byte empty[] = { };
static const byte msg1[] =
{ {
0x43,0x72,0x79,0x70,0x74,0x6f,0x67,0x72, 0x43,0x72,0x79,0x70,0x74,0x6f,0x67,0x72,
0x61,0x70,0x68,0x69,0x63,0x20,0x46,0x6f, 0x61,0x70,0x68,0x69,0x63,0x20,0x46,0x6f,
@ -3230,17 +3232,28 @@ int poly1305_test(void)
0x61,0x16 0x61,0x16
}; };
static const byte msg5[] =
{
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
};
byte additional[] = byte additional[] =
{ {
0x50,0x51,0x52,0x53,0xc0,0xc1,0xc2,0xc3, 0x50,0x51,0x52,0x53,0xc0,0xc1,0xc2,0xc3,
0xc4,0xc5,0xc6,0xc7 0xc4,0xc5,0xc6,0xc7
}; };
static const byte correct[] = static const byte correct0[] =
{
0x01,0x03,0x80,0x8a,0xfb,0x0d,0xb2,0xfd,
0x4a,0xbf,0xf6,0xaf,0x41,0x49,0xf5,0x1b
};
static const byte correct1[] =
{ {
0xa8,0x06,0x1d,0xc1,0x30,0x51,0x36,0xc6, 0xa8,0x06,0x1d,0xc1,0x30,0x51,0x36,0xc6,
0xc2,0x2b,0x8b,0xaf,0x0c,0x01,0x27,0xa9 0xc2,0x2b,0x8b,0xaf,0x0c,0x01,0x27,0xa9
}; };
static const byte correct2[] = static const byte correct2[] =
@ -3261,6 +3274,12 @@ int poly1305_test(void)
0x7e,0x90,0x2e,0xcb,0xd0,0x60,0x06,0x91 0x7e,0x90,0x2e,0xcb,0xd0,0x60,0x06,0x91
}; };
static const byte correct5[] =
{
0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
};
static const byte key[] = { static const byte key[] = {
0x85,0xd6,0xbe,0x78,0x57,0x55,0x6d,0x33, 0x85,0xd6,0xbe,0x78,0x57,0x55,0x6d,0x33,
0x7f,0x44,0x52,0xfe,0x42,0xd5,0x06,0xa8, 0x7f,0x44,0x52,0xfe,0x42,0xd5,0x06,0xa8,
@ -3282,41 +3301,49 @@ int poly1305_test(void)
0x2a,0x93,0x75,0x78,0x3e,0xd5,0x53,0xff 0x2a,0x93,0x75,0x78,0x3e,0xd5,0x53,0xff
}; };
const byte* msgs[] = {msg, msg2, msg3}; static const byte key5[] = {
word32 szm[] = {sizeof(msg),sizeof(msg2),sizeof(msg3)}; 0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
const byte* keys[] = {key, key2, key2}; 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
const byte* tests[] = {correct, correct2, correct3}; 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
};
for (i = 0; i < 3; i++) { const byte* msgs[] = {empty, msg1, msg2, msg3, msg5};
word32 szm[] = {sizeof(empty), sizeof(msg1), sizeof(msg2),
sizeof(msg3), sizeof(msg5)};
const byte* keys[] = {key, key, key2, key2, key5};
const byte* tests[] = {correct0, correct1, correct2, correct3, correct5};
for (i = 0; i < 5; i++) {
ret = wc_Poly1305SetKey(&enc, keys[i], 32); ret = wc_Poly1305SetKey(&enc, keys[i], 32);
if (ret != 0) if (ret != 0)
return -3600; return -3600 + i;
ret = wc_Poly1305Update(&enc, msgs[i], szm[i]); ret = wc_Poly1305Update(&enc, msgs[i], szm[i]);
if (ret != 0) if (ret != 0)
return -3601; return -3605 + i;
ret = wc_Poly1305Final(&enc, tag); ret = wc_Poly1305Final(&enc, tag);
if (ret != 0) if (ret != 0)
return -3602; return -36108 + i;
if (XMEMCMP(tag, tests[i], sizeof(tag))) if (XMEMCMP(tag, tests[i], sizeof(tag)))
return -3603; return -3615 + i;
} }
/* Check TLS MAC function from 2.8.2 https://tools.ietf.org/html/rfc7539 */ /* Check TLS MAC function from 2.8.2 https://tools.ietf.org/html/rfc7539 */
XMEMSET(tag, 0, sizeof(tag)); XMEMSET(tag, 0, sizeof(tag));
ret = wc_Poly1305SetKey(&enc, key4, sizeof(key4)); ret = wc_Poly1305SetKey(&enc, key4, sizeof(key4));
if (ret != 0) if (ret != 0)
return -3604; return -3614;
ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional), ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional),
(byte*)msg4, sizeof(msg4), tag, sizeof(tag)); (byte*)msg4, sizeof(msg4), tag, sizeof(tag));
if (ret != 0) if (ret != 0)
return -3605; return -3615;
if (XMEMCMP(tag, correct4, sizeof(tag))) if (XMEMCMP(tag, correct4, sizeof(tag)))
return -3606; return -3616;
/* Check fail of TLS MAC function if altering additional data */ /* Check fail of TLS MAC function if altering additional data */
XMEMSET(tag, 0, sizeof(tag)); XMEMSET(tag, 0, sizeof(tag));
@ -3324,10 +3351,10 @@ int poly1305_test(void)
ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional), ret = wc_Poly1305_MAC(&enc, additional, sizeof(additional),
(byte*)msg4, sizeof(msg4), tag, sizeof(tag)); (byte*)msg4, sizeof(msg4), tag, sizeof(tag));
if (ret != 0) if (ret != 0)
return -3607; return -3617;
if (XMEMCMP(tag, correct4, sizeof(tag)) == 0) if (XMEMCMP(tag, correct4, sizeof(tag)) == 0)
return -3608; return -3618;
return 0; return 0;

View File

@ -45,8 +45,9 @@
#define WC_HAS_GCC_4_4_64BIT #define WC_HAS_GCC_4_4_64BIT
#endif #endif
#if (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) || \ #ifdef USE_INTEL_SPEEDUP
defined(WC_HAS_GCC_4_4_64BIT)) #elif (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) || \
defined(WC_HAS_GCC_4_4_64BIT))
#define POLY130564 #define POLY130564
#else #else
#define POLY130532 #define POLY130532
@ -63,24 +64,44 @@ enum {
/* Poly1305 state */ /* Poly1305 state */
typedef struct Poly1305 { typedef struct Poly1305 {
#if defined(POLY130564) #ifdef USE_INTEL_SPEEDUP
word64 r[3]; word64 r[3];
word64 h[3]; word64 h[3];
word64 pad[2]; word64 pad[2];
word64 t0[6];
word64 t1[6];
word64 hh[12];
word32 r0[8];
word32 r1[8];
word32 r2[8];
word32 r3[8];
word32 r4[8];
word32* rp[4];
word64 hibit[4];
size_t leftover;
unsigned char buffer[4*POLY1305_BLOCK_SIZE];
unsigned char finished;
unsigned char started;
#else #else
word32 r[5]; #if defined(POLY130564)
word32 h[5]; word64 r[3];
word32 pad[4]; word64 h[3];
word64 pad[2];
#else
word32 r[5];
word32 h[5];
word32 pad[4];
#endif
size_t leftover;
unsigned char buffer[POLY1305_BLOCK_SIZE];
unsigned char finished;
#endif #endif
size_t leftover;
unsigned char buffer[POLY1305_BLOCK_SIZE];
unsigned char final;
} Poly1305; } Poly1305;
/* does init */ /* does init */
WOLFSSL_API int wc_Poly1305SetKey(Poly1305* poly1305, const byte* key, word32 kySz); WOLFSSL_API int wc_Poly1305SetKey(Poly1305* poly1305, const byte* key,
word32 kySz);
WOLFSSL_API int wc_Poly1305Update(Poly1305* poly1305, const byte*, word32); WOLFSSL_API int wc_Poly1305Update(Poly1305* poly1305, const byte*, word32);
WOLFSSL_API int wc_Poly1305Final(Poly1305* poly1305, byte* tag); WOLFSSL_API int wc_Poly1305Final(Poly1305* poly1305, byte* tag);
WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, byte* additional, word32 addSz, WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, byte* additional, word32 addSz,