303 lines
11 KiB
Diff
303 lines
11 KiB
Diff
Subject: SIMD optimization of the chacha20 cipher (64-bit and 128-bit versions)
|
|
From: Alexander Troosh <trush@yandex.ru>
|
|
|
|
|
|
1) Test of raw speed by: "openssl speed -evp chacha20"
|
|
|
|
e1c+: 105 MB/s → 352 MB/s
|
|
e8c2: 161 MB/s → 144 MB/s (* lcc-1.26 produces bad SIMD code for E2Kv5)
|
|
e2c3: 334 MB/s → 1085 MB/s
|
|
|
|
2) Speed of a secure channel (with openssh-9.1p1)
|
|
|
|
for i in chacha20-poly1305@openssh.com
|
|
do
|
|
dd if=/dev/zero bs=1M count=1000 2> /dev/null \
|
|
| /usr/bin/ssh -c $i root@localhost "(/usr/bin/time -p cat) > /dev/null" 2>&1 \
|
|
| grep real | tr , . | awk '{print "'$i': "1000 / $2" MB/s" }'
|
|
done
|
|
|
|
use localhost
|
|
=============
|
|
e3c2: 133 MB/s → 190 MB/s
|
|
|
|
remote ssh via 1G ethernet link
|
|
===============================
|
|
e1c+: 38 MB/s → 47 MB/s
|
|
e8c2: 90 MB/s → 80 MB/s (* ^-- lcc-1.25 bad code)
|
|
e2c3: 67 MB/s → 80 MB/s (** 1G ethernet limit)
|
|
|
|
|
|
diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c
|
|
index 18251ea..8a6ae37 100644
|
|
--- a/crypto/chacha/chacha_enc.c
|
|
+++ b/crypto/chacha/chacha_enc.c
|
|
@@ -16,6 +16,262 @@
|
|
|
|
typedef unsigned int u32;
|
|
typedef unsigned char u8;
|
|
+
|
|
+#if defined(__e2k__) && __iset__ >= 3
|
|
+
|
|
+#include <stdint.h>
|
|
+#include <e2kintrin.h>
|
|
+
|
|
+#if __iset__ >= 5 /* 128-bit SIMD */
|
|
+
|
|
+/* QUARTERROUND applies one ChaCha "quarter" round to x[a], x[b], x[c], x[d] of the x[] array in the enclosing scope: add, xor, then qpsrcw by 32 - n for n = 16, 12, 8, 7 (presumably a cyclic right shift of each 32-bit word, i.e. rotate-left by n -- confirm against the intrinsic reference). */
|
|
+# define QUARTERROUND(a,b,c,d) ( \
|
|
+ x[a] = __builtin_e2k_qpaddw(x[a], x[b]), x[d] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[d], x[a]), 32 - 16), \
|
|
+ x[c] = __builtin_e2k_qpaddw(x[c], x[d]), x[b] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[b], x[c]), 32 - 12), \
|
|
+ x[a] = __builtin_e2k_qpaddw(x[a], x[b]), x[d] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[d], x[a]), 32 - 8), \
|
|
+ x[c] = __builtin_e2k_qpaddw(x[c], x[d]), x[b] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[b], x[c]), 32 - 7) )
|
|
+
|
|
+/* chacha20_core_x4 performs 20 rounds of ChaCha on the 16 vector words in
|
|
+ * |input| and writes the 16 resulting vectors (rounds output + input) to |output|. */
|
|
+static inline __attribute__((__always_inline__))
|
|
+void chacha20_core_x4(__v2di *output, const __v2di input[16])
|
|
+{
|
|
+ __v2di x[16];
|
|
+ int i;
|
|
+ memcpy(x, input, sizeof(x)); /* working copy; input is kept for the final addition */
|
|
+
|
|
+ for (i = 20; i > 0; i -= 2) { /* 10 passes of 8 quarter rounds = 20 rounds */
|
|
+ QUARTERROUND(0, 4, 8, 12); /* first four: column indices */
|
|
+ QUARTERROUND(1, 5, 9, 13);
|
|
+ QUARTERROUND(2, 6, 10, 14);
|
|
+ QUARTERROUND(3, 7, 11, 15);
|
|
+ QUARTERROUND(0, 5, 10, 15); /* last four: diagonal indices */
|
|
+ QUARTERROUND(1, 6, 11, 12);
|
|
+ QUARTERROUND(2, 7, 8, 13);
|
|
+ QUARTERROUND(3, 4, 9, 14);
|
|
+ }
|
|
+
|
|
+ #pragma ivdep
|
|
+ for (i = 0; i < 16; ++i)
|
|
+ output[i] = __builtin_e2k_qpaddw(x[i], input[i]); /* feed-forward: add the original state back in */
|
|
+}
|
|
+
|
|
+void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
|
|
+ size_t len, const unsigned int key[8],
|
|
+ const unsigned int counter[4])
|
|
+{ /* XORs len bytes of inp with ChaCha20 keystream into out; every loop pass yields sizeof(buf) keystream bytes from chacha20_core_x4 */
|
|
+ u32 input[16]; /* scalar state: 4 sigma words, 8 key words, 4 counter words */
|
|
+ __v2di input_x4[16]; /* each state word replicated across a whole vector */
|
|
+ __v2di buf[16]; /* raw output of chacha20_core_x4 */
|
|
+ size_t todo, i;
|
|
+
|
|
+ /* sigma constant "expand 32-byte k" in little-endian encoding */
|
|
+ input[0] = ((u32)('e')) | ((u32)('x') << 8) | ((u32)('p') << 16) | ((u32)('a') << 24);
|
|
+ input[1] = ((u32)('n')) | ((u32)('d') << 8) | ((u32)(' ') << 16) | ((u32)('3') << 24);
|
|
+ input[2] = ((u32)('2')) | ((u32)('-') << 8) | ((u32)('b') << 16) | ((u32)('y') << 24);
|
|
+ input[3] = ((u32)('t')) | ((u32)('e') << 8) | ((u32)(' ') << 16) | ((u32)('k') << 24);
|
|
+
|
|
+ input[4] = key[0];
|
|
+ input[5] = key[1];
|
|
+ input[6] = key[2];
|
|
+ input[7] = key[3];
|
|
+ input[8] = key[4];
|
|
+ input[9] = key[5];
|
|
+ input[10] = key[6];
|
|
+ input[11] = key[7];
|
|
+
|
|
+ input[12] = counter[0];
|
|
+ input[13] = counter[1];
|
|
+ input[14] = counter[2];
|
|
+ input[15] = counter[3];
|
|
+
|
|
+ for (i=0; i < 16; i++) {
|
|
+ unsigned long long w = input[i] * 0x100000001ULL; /* copy the word into both 32-bit halves; the constant must be unsigned, a signed product overflows (UB) for words >= 0x80000000 */
|
|
+ input_x4[i] = __builtin_e2k_qppackdl(w, w);
|
|
+ }
|
|
+
|
|
+ input_x4[12] = __builtin_e2k_qpaddw(input_x4[12], (__v2di){0x100000000LL, 0x300000002LL}); /* offsets 0, 1, 2, 3 on the four copies of word 12 */
|
|
+
|
|
+#pragma loop count(100)
|
|
+ while (len > 0) {
|
|
+ __v2di buf_tran[16];
|
|
+
|
|
+ chacha20_core_x4(buf, input_x4);
|
|
+
|
|
+ for (i = 0; i < 16; i+=4) { /* regroup buf into buf_tran; NOTE(review): looks like a 4x4 word transpose that makes each 64-byte block contiguous -- confirm against qppermb semantics */
|
|
+ const __v2di f1 = __builtin_e2k_qppackdl(0x1f1e1d1c0f0e0d0cLL, 0x1716151407060504LL);
|
|
+ const __v2di f0 = __builtin_e2k_qppackdl(0x1b1a19180b0a0908LL, 0x1312111003020100LL);
|
|
+
|
|
+ const __v2di f3 = __builtin_e2k_qppackdl(0x1f1e1d1c1b1a1918LL, 0x0f0e0d0c0b0a0908LL);
|
|
+ const __v2di f2 = __builtin_e2k_qppackdl(0x1716151413121110LL, 0x0706050403020100LL);
|
|
+
|
|
+ __v2di t0 = __builtin_e2k_qppermb(buf[i + 1], buf[i + 0], f0);
|
|
+ __v2di t1 = __builtin_e2k_qppermb(buf[i + 1], buf[i + 0], f1);
|
|
+ __v2di t2 = __builtin_e2k_qppermb(buf[i + 3], buf[i + 2], f0);
|
|
+ __v2di t3 = __builtin_e2k_qppermb(buf[i + 3], buf[i + 2], f1);
|
|
+
|
|
+ buf_tran[i/4 + 0] = __builtin_e2k_qppermb(t2, t0, f2);
|
|
+ buf_tran[i/4 + 4] = __builtin_e2k_qppermb(t3, t1, f2);
|
|
+ buf_tran[i/4 + 8] = __builtin_e2k_qppermb(t2, t0, f3);
|
|
+ buf_tran[i/4 + 12] = __builtin_e2k_qppermb(t3, t1, f3);
|
|
+ }
|
|
+
|
|
+ todo = sizeof(buf); /* keystream bytes available in this pass */
|
|
+ if (__builtin_expect(len < todo, 0)) { /* final, partial pass */
|
|
+ todo = len & ~(size_t)15; /* whole 16-byte chunks first */
|
|
+
|
|
+ #pragma ivdep
|
|
+ #pragma loop count(15)
|
|
+ for (i = 0; i < todo; i+=16) {
|
|
+ *(__v2di *)&out[i] = __builtin_e2k_qpxor(*(__v2di *)&inp[i], buf_tran[i / 16]);
|
|
+ }
|
|
+ #pragma ivdep
|
|
+ #pragma loop count(15)
|
|
+ for (; i < len; i++) { /* then the remaining bytes one at a time */
|
|
+ out[i] = inp[i] ^ ((u8 *)buf_tran)[i];
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ #pragma ivdep
|
|
+ #pragma unroll(16)
|
|
+ for (i = 0; i < todo; i+=16) {
|
|
+ *(__v2di *)&out[i] = __builtin_e2k_qpxor(*(__v2di *)&inp[i], buf_tran[i / 16]);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Advance 32-bit counters. Note that as subroutine is so to
|
|
+ * say nonce-agnostic, this limited counter width doesn't
|
|
+ * prevent caller from implementing wider counter. It would
|
|
+ * simply take two calls split on counter overflow...
|
|
+ */
|
|
+ input_x4[12] = __builtin_e2k_qpaddw(input_x4[12], (__v2di){0x400000004LL, 0x400000004LL}); /* +4 on every copy of word 12 */
|
|
+
|
|
+ out += todo;
|
|
+ inp += todo;
|
|
+ len -= todo;
|
|
+ }
|
|
+}
|
|
+
|
|
+#elif __iset__ >= 3 /* 64-bit SIMD */
|
|
+
|
|
+/* QUARTERROUND applies one ChaCha "quarter" round to x[a], x[b], x[c], x[d]; it relies on x[] and the scratch variable tt from the enclosing scope, using a pshufb byte shuffle for two of the four steps and shift-left/shift-right/or (by 12 and by 7) for the other two. */
|
|
+# define QUARTERROUND(a,b,c,d) ( \
|
|
+ x[a]=__builtin_e2k_paddw(x[a],x[b]), tt=__builtin_e2k_pxord(x[d],x[a]), x[d]=__builtin_e2k_pshufb(tt,tt,0x0504070601000302ull), \
|
|
+ x[c]=__builtin_e2k_paddw(x[c],x[d]), tt=__builtin_e2k_pxord(x[b],x[c]), x[b]=__builtin_e2k_pord(__builtin_e2k_psllw(tt,12), __builtin_e2k_psrlw(tt,32-12)),\
|
|
+ x[a]=__builtin_e2k_paddw(x[a],x[b]), tt=__builtin_e2k_pxord(x[d],x[a]), x[d]=__builtin_e2k_pshufb(tt,tt,0x0605040702010003ull), \
|
|
+ x[c]=__builtin_e2k_paddw(x[c],x[d]), tt=__builtin_e2k_pxord(x[b],x[c]), x[b]=__builtin_e2k_pord(__builtin_e2k_psllw(tt, 7), __builtin_e2k_psrlw(tt,32- 7)) )
|
|
+
|
|
+/* ChaCha20_ctr32 XORs |len| bytes of |inp| with ChaCha20 keystream into |out|;
|
|
+ * each 64-bit word holds the same state word twice, with word 12 offset by one in the upper half. */
|
|
+void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
|
|
+ size_t len, const unsigned int key[8],
|
|
+ const unsigned int counter[4])
|
|
+{
|
|
+ u32 input[16]; /* scalar state: 4 sigma words, 8 key words, 4 counter words */
|
|
+ uint64_t input_x2[16]; /* each state word duplicated into both 32-bit halves */
|
|
+ uint64_t buf[16]; /* rounds output plus input_x2, before regrouping */
|
|
+ size_t todo, i;
|
|
+
|
|
+ /* sigma constant "expand 32-byte k" in little-endian encoding */
|
|
+ input[0] = ((u32)('e')) | ((u32)('x') << 8) | ((u32)('p') << 16) | ((u32)('a') << 24);
|
|
+ input[1] = ((u32)('n')) | ((u32)('d') << 8) | ((u32)(' ') << 16) | ((u32)('3') << 24);
|
|
+ input[2] = ((u32)('2')) | ((u32)('-') << 8) | ((u32)('b') << 16) | ((u32)('y') << 24);
|
|
+ input[3] = ((u32)('t')) | ((u32)('e') << 8) | ((u32)(' ') << 16) | ((u32)('k') << 24);
|
|
+
|
|
+ input[4] = key[0];
|
|
+ input[5] = key[1];
|
|
+ input[6] = key[2];
|
|
+ input[7] = key[3];
|
|
+ input[8] = key[4];
|
|
+ input[9] = key[5];
|
|
+ input[10] = key[6];
|
|
+ input[11] = key[7];
|
|
+
|
|
+ input[12] = counter[0];
|
|
+ input[13] = counter[1];
|
|
+ input[14] = counter[2];
|
|
+ input[15] = counter[3];
|
|
+
|
|
+#pragma unroll(16)
|
|
+ for (i=0; i < 16; i++) {
|
|
+ input_x2[i] = input[i] * 0x100000001ull; /* unsigned multiply copies the word into the upper half as well */
|
|
+ }
|
|
+
|
|
+ input_x2[12] = __builtin_e2k_paddw(input_x2[12], 0x100000000ull); /* upper-half copy of word 12 gets +1 */
|
|
+
|
|
+ #pragma loop count(100)
|
|
+ while (len > 0) {
|
|
+ uint64_t buf_tran[16];
|
|
+ uint64_t x[16], tt; /* x and tt are the names the QUARTERROUND macro expects */
|
|
+ uint64_t * __restrict__ outw = (uint64_t *)out; /* NOTE(review): __restrict__ while inp is read in the same loop -- confirm callers never pass out == inp */
|
|
+
|
|
+ for (i = 0; i < 16; ++i) x[i] = input_x2[i];
|
|
+
|
|
+ for (i = 20; i > 0; i -= 2) { /* 10 passes of 8 quarter rounds = 20 rounds */
|
|
+ QUARTERROUND(0, 4, 8, 12);
|
|
+ QUARTERROUND(1, 5, 9, 13);
|
|
+ QUARTERROUND(2, 6, 10, 14);
|
|
+ QUARTERROUND(3, 7, 11, 15);
|
|
+ QUARTERROUND(0, 5, 10, 15);
|
|
+ QUARTERROUND(1, 6, 11, 12);
|
|
+ QUARTERROUND(2, 7, 8, 13);
|
|
+ QUARTERROUND(3, 4, 9, 14);
|
|
+ }
|
|
+
|
|
+ #pragma ivdep
|
|
+ for (i = 0; i < 16; ++i)
|
|
+ buf[i] = __builtin_e2k_paddw(x[i], input_x2[i]); /* feed-forward: add the original state back in */
|
|
+
|
|
+ #pragma unroll(8)
|
|
+ #pragma ivdep
|
|
+ for (i = 0; i < 16; i+=2) { /* NOTE(review): appears to gather the lower 32-bit halves into buf_tran[0..7] and the upper halves into buf_tran[8..15] -- confirm against pshufb operand order */
|
|
+ const uint64_t fmtl = 0x0b0a090803020100ull;
|
|
+ const uint64_t fmtr = 0x0f0e0d0c07060504ull;
|
|
+
|
|
+ buf_tran[i/2 + 0] = __builtin_e2k_pshufb(buf[i + 1], buf[i + 0], fmtl);
|
|
+ buf_tran[i/2 + 8] = __builtin_e2k_pshufb(buf[i + 1], buf[i + 0], fmtr);
|
|
+ }
|
|
+
|
|
+ todo = sizeof(buf); /* keystream bytes available in this pass */
|
|
+ if (__builtin_expect(len < todo, 0)) { /* final, partial pass */
|
|
+ todo = len & ~(size_t)7; /* whole 8-byte chunks first */
|
|
+
|
|
+ #pragma ivdep
|
|
+ #pragma loop count(16)
|
|
+ for (i = 0; i < todo; i+=8) {
|
|
+ *(uint64_t *)&out[i] = __builtin_e2k_pxord(*(uint64_t *)&inp[i], buf_tran[i / 8]);
|
|
+ }
|
|
+ #pragma ivdep
|
|
+ #pragma loop count(7)
|
|
+ for (; i < len; i++) { /* then the remaining bytes one at a time */
|
|
+ out[i] = inp[i] ^ ((u8 *)buf_tran)[i];
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ #pragma ivdep
|
|
+ #pragma unroll(16)
|
|
+ for (i = 0; i < todo; i+=8) {
|
|
+ *outw++ = __builtin_e2k_pxord(*(uint64_t *)&inp[i], buf_tran[i / 8]);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Advance 32-bit counters. Note that as subroutine is so to
|
|
+ * say nonce-agnostic, this limited counter width doesn't
|
|
+ * prevent caller from implementing wider counter. It would
|
|
+ * simply take two calls split on counter overflow...
|
|
+ */
|
|
+ input_x2[12] = __builtin_e2k_paddw(input_x2[12], 0x200000002ull); /* +2 on both copies of word 12 */
|
|
+
|
|
+ out += todo;
|
|
+ inp += todo;
|
|
+ len -= todo;
|
|
+ }
|
|
+}
|
|
+#endif /* E2Kv5+ / E2Kv3..E2Kv4 */
|
|
+
|
|
+#else /* Generic version */
|
|
typedef union {
|
|
u32 u[16];
|
|
u8 c[64];
|
|
@@ -128,3 +384,4 @@ void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
|
|
input[12]++;
|
|
}
|
|
}
|
|
+#endif
|