Subject: SIMD optimization of chacha20 cipher (64 bit and 128 bit version) From: Alexander Troosh 1) Test of raw speed by: "openssl speed -evp chacha20" e1c+: 105 MB/s → 352 MB/s e8c2: 161 MB/s → 144 MB/s (* lcc-1.26 produces bad SIMD code for E2Kv5) e2c3: 334 MB/s → 1085 MB/s 2) Speed of security channel (with openssh-9.1p1) for i in chacha20-poly1305@openssh.com do dd if=/dev/zero bs=1M count=1000 2> /dev/null \ | /usr/bin/ssh -c $i root@localhost "(/usr/bin/time -p cat) > /dev/null" 2>&1 \ | grep real | tr , . | awk '{print "'$i': "1000 / $2" MB/s" }' done use localhost ============= e3c2: 133 MB/s → 190 MB/s remote ssh via 1G ethernet link =============================== e1c+: 38 MB/s → 47 MB/s e8c2: 90 MB/s → 80 MB/s (* ^-- lcc-1.25 bad code) e2c3: 67 MB/s → 80 MB/s (** 1G ethernet limit) diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c index 18251ea..8a6ae37 100644 --- a/crypto/chacha/chacha_enc.c +++ b/crypto/chacha/chacha_enc.c @@ -16,6 +16,262 @@ typedef unsigned int u32; typedef unsigned char u8; + +#if defined(__e2k__) && __iset__ >= 3 + +#include <string.h> +#include <stdint.h> + +#if __iset__ >= 5 /* 128-bit SIMD */ + +/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */ +# define QUARTERROUND(a,b,c,d) ( \ + x[a] = __builtin_e2k_qpaddw(x[a], x[b]), x[d] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[d], x[a]), 32 - 16), \ + x[c] = __builtin_e2k_qpaddw(x[c], x[d]), x[b] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[b], x[c]), 32 - 12), \ + x[a] = __builtin_e2k_qpaddw(x[a], x[b]), x[d] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[d], x[a]), 32 - 8), \ + x[c] = __builtin_e2k_qpaddw(x[c], x[d]), x[b] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[b], x[c]), 32 - 7) ) + +/* chacha_core performs 20 rounds of ChaCha on the input words in + * |input| and writes the 64 output bytes to |output|. 
*/ +static inline __attribute__((__always_inline__)) +void chacha20_core_x4(__v2di *output, const __v2di input[16]) +{ + __v2di x[16]; + int i; + memcpy(x, input, sizeof(x)); + + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(0, 4, 8, 12); + QUARTERROUND(1, 5, 9, 13); + QUARTERROUND(2, 6, 10, 14); + QUARTERROUND(3, 7, 11, 15); + QUARTERROUND(0, 5, 10, 15); + QUARTERROUND(1, 6, 11, 12); + QUARTERROUND(2, 7, 8, 13); + QUARTERROUND(3, 4, 9, 14); + } + + #pragma ivdep + for (i = 0; i < 16; ++i) + output[i] = __builtin_e2k_qpaddw(x[i], input[i]); +} + +void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]) +{ + u32 input[16]; + __v2di input_x4[16]; + __v2di buf[16]; + size_t todo, i; + + /* sigma constant "expand 32-byte k" in little-endian encoding */ + input[0] = ((u32)('e')) | ((u32)('x') << 8) | ((u32)('p') << 16) | ((u32)('a') << 24); + input[1] = ((u32)('n')) | ((u32)('d') << 8) | ((u32)(' ') << 16) | ((u32)('3') << 24); + input[2] = ((u32)('2')) | ((u32)('-') << 8) | ((u32)('b') << 16) | ((u32)('y') << 24); + input[3] = ((u32)('t')) | ((u32)('e') << 8) | ((u32)(' ') << 16) | ((u32)('k') << 24); + + input[4] = key[0]; + input[5] = key[1]; + input[6] = key[2]; + input[7] = key[3]; + input[8] = key[4]; + input[9] = key[5]; + input[10] = key[6]; + input[11] = key[7]; + + input[12] = counter[0]; + input[13] = counter[1]; + input[14] = counter[2]; + input[15] = counter[3]; + + for (i=0; i < 16; i++) { + unsigned long long w = input[i] * 0x100000001LL; + input_x4[i] = __builtin_e2k_qppackdl(w, w); + } + + input_x4[12] = __builtin_e2k_qpaddw(input_x4[12], (__v2di){0x100000000LL, 0x300000002LL}); + +#pragma loop count(100) + while (len > 0) { + __v2di buf_tran[16]; + + chacha20_core_x4(buf, input_x4); + + for (i = 0; i < 16; i+=4) { + const __v2di f1 = __builtin_e2k_qppackdl(0x1f1e1d1c0f0e0d0cLL, 0x1716151407060504LL); + const __v2di f0 = __builtin_e2k_qppackdl(0x1b1a19180b0a0908LL, 
0x1312111003020100LL); + + const __v2di f3 = __builtin_e2k_qppackdl(0x1f1e1d1c1b1a1918LL, 0x0f0e0d0c0b0a0908LL); + const __v2di f2 = __builtin_e2k_qppackdl(0x1716151413121110LL, 0x0706050403020100LL); + + __v2di t0 = __builtin_e2k_qppermb(buf[i + 1], buf[i + 0], f0); + __v2di t1 = __builtin_e2k_qppermb(buf[i + 1], buf[i + 0], f1); + __v2di t2 = __builtin_e2k_qppermb(buf[i + 3], buf[i + 2], f0); + __v2di t3 = __builtin_e2k_qppermb(buf[i + 3], buf[i + 2], f1); + + buf_tran[i/4 + 0] = __builtin_e2k_qppermb(t2, t0, f2); + buf_tran[i/4 + 4] = __builtin_e2k_qppermb(t3, t1, f2); + buf_tran[i/4 + 8] = __builtin_e2k_qppermb(t2, t0, f3); + buf_tran[i/4 + 12] = __builtin_e2k_qppermb(t3, t1, f3); + } + + todo = sizeof(buf); + if (__builtin_expect(len < todo, 0)) { + todo = len & ~(size_t)15; + + #pragma ivdep + #pragma loop count(15) + for (i = 0; i < todo; i+=16) { + *(__v2di *)&out[i] = __builtin_e2k_qpxor(*(__v2di *)&inp[i], buf_tran[i / 16]); + } + #pragma ivdep + #pragma loop count(15) + for (; i < len; i++) { + out[i] = inp[i] ^ ((u8 *)buf_tran)[i]; + } + return; + } + + #pragma ivdep + #pragma unroll(16) + for (i = 0; i < todo; i+=16) { + *(__v2di *)&out[i] = __builtin_e2k_qpxor(*(__v2di *)&inp[i], buf_tran[i / 16]); + } + + /* + * Advance 32-bit counters. Note that as subroutine is so to + * say nonce-agnostic, this limited counter width doesn't + * prevent caller from implementing wider counter. It would + * simply take two calls split on counter overflow... + */ + input_x4[12] = __builtin_e2k_qpaddw(input_x4[12], (__v2di){0x400000004LL, 0x400000004LL}); + + out += todo; + inp += todo; + len -= todo; + } +} + +#elif __iset__ >= 3 /* 64-bit SIMD */ + +/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. 
*/ +# define QUARTERROUND(a,b,c,d) ( \ + x[a]=__builtin_e2k_paddw(x[a],x[b]), tt=__builtin_e2k_pxord(x[d],x[a]), x[d]=__builtin_e2k_pshufb(tt,tt,0x0504070601000302ull), \ + x[c]=__builtin_e2k_paddw(x[c],x[d]), tt=__builtin_e2k_pxord(x[b],x[c]), x[b]=__builtin_e2k_pord(__builtin_e2k_psllw(tt,12), __builtin_e2k_psrlw(tt,32-12)),\ + x[a]=__builtin_e2k_paddw(x[a],x[b]), tt=__builtin_e2k_pxord(x[d],x[a]), x[d]=__builtin_e2k_pshufb(tt,tt,0x0605040702010003ull), \ + x[c]=__builtin_e2k_paddw(x[c],x[d]), tt=__builtin_e2k_pxord(x[b],x[c]), x[b]=__builtin_e2k_pord(__builtin_e2k_psllw(tt, 7), __builtin_e2k_psrlw(tt,32- 7)) ) + +/* chacha_core performs 20 rounds of ChaCha on the input words in + * |input| and writes the 64 output bytes to |output|. */ +void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]) +{ + u32 input[16]; + uint64_t input_x2[16]; + uint64_t buf[16]; + size_t todo, i; + + /* sigma constant "expand 32-byte k" in little-endian encoding */ + input[0] = ((u32)('e')) | ((u32)('x') << 8) | ((u32)('p') << 16) | ((u32)('a') << 24); + input[1] = ((u32)('n')) | ((u32)('d') << 8) | ((u32)(' ') << 16) | ((u32)('3') << 24); + input[2] = ((u32)('2')) | ((u32)('-') << 8) | ((u32)('b') << 16) | ((u32)('y') << 24); + input[3] = ((u32)('t')) | ((u32)('e') << 8) | ((u32)(' ') << 16) | ((u32)('k') << 24); + + input[4] = key[0]; + input[5] = key[1]; + input[6] = key[2]; + input[7] = key[3]; + input[8] = key[4]; + input[9] = key[5]; + input[10] = key[6]; + input[11] = key[7]; + + input[12] = counter[0]; + input[13] = counter[1]; + input[14] = counter[2]; + input[15] = counter[3]; + +#pragma unroll(16) + for (i=0; i < 16; i++) { + input_x2[i] = input[i] * 0x100000001ull; + } + + input_x2[12] = __builtin_e2k_paddw(input_x2[12], 0x100000000ull); + + #pragma loop count(100) + while (len > 0) { + uint64_t buf_tran[16]; + uint64_t x[16], tt; + uint64_t * __restrict__ outw = (uint64_t *)out; + + for (i 
= 0; i < 16; ++i) x[i] = input_x2[i]; + + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(0, 4, 8, 12); + QUARTERROUND(1, 5, 9, 13); + QUARTERROUND(2, 6, 10, 14); + QUARTERROUND(3, 7, 11, 15); + QUARTERROUND(0, 5, 10, 15); + QUARTERROUND(1, 6, 11, 12); + QUARTERROUND(2, 7, 8, 13); + QUARTERROUND(3, 4, 9, 14); + } + + #pragma ivdep + for (i = 0; i < 16; ++i) + buf[i] = __builtin_e2k_paddw(x[i], input_x2[i]); + + #pragma unroll(8) + #pragma ivdep + for (i = 0; i < 16; i+=2) { + const uint64_t fmtl = 0x0b0a090803020100ull; + const uint64_t fmtr = 0x0f0e0d0c07060504ull; + + buf_tran[i/2 + 0] = __builtin_e2k_pshufb(buf[i + 1], buf[i + 0], fmtl); + buf_tran[i/2 + 8] = __builtin_e2k_pshufb(buf[i + 1], buf[i + 0], fmtr); + } + + todo = sizeof(buf); + if (__builtin_expect(len < todo, 0)) { + todo = len & ~(size_t)7; + + #pragma ivdep + #pragma loop count(16) + for (i = 0; i < todo; i+=8) { + *(uint64_t *)&out[i] = __builtin_e2k_pxord(*(uint64_t *)&inp[i], buf_tran[i / 8]); + } + #pragma ivdep + #pragma loop count(7) + for (; i < len; i++) { + out[i] = inp[i] ^ ((u8 *)buf_tran)[i]; + } + return; + } + + #pragma ivdep + #pragma unroll(16) + for (i = 0; i < todo; i+=8) { + *outw++ = __builtin_e2k_pxord(*(uint64_t *)&inp[i], buf_tran[i / 8]); + } + + /* + * Advance 32-bit counters. Note that as subroutine is so to + * say nonce-agnostic, this limited counter width doesn't + * prevent caller from implementing wider counter. It would + * simply take two calls split on counter overflow... + */ + input_x2[12] = __builtin_e2k_paddw(input_x2[12], 0x200000002ull); + + out += todo; + inp += todo; + len -= todo; + } +} +#endif /* E2Kv5+ / E2Kv3..E2Kv4 */ + +#else /* Generic version */ typedef union { u32 u[16]; u8 c[64]; @@ -128,3 +384,4 @@ void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, input[12]++; } } +#endif