mcst-linux-kernel/patches-2024.06.26/openssl-1.1.1n/0010-SIMD-optimization-of-c...

303 lines
11 KiB
Diff

Subject: SIMD optimization of chacha20 cipher (64 bit and 128 bit version)
From: Alexander Troosh <trush@yandex.ru>
1) Test of raw speed by: "openssl speed -evp chacha20"
e1c+: 105 MB/s → 352 MB/s
e8c2: 161 MB/s → 144 MB/s (* lcc-1.26 produces bad SIMD code for E2Kv5)
e2c3: 334 MB/s → 1085 MB/s
2) Speed of security channel (with openssh-9.1p1)
for i in chacha20-poly1305@openssh.com
do
dd if=/dev/zero bs=1M count=1000 2> /dev/null \
| /usr/bin/ssh -c $i root@localhost "(/usr/bin/time -p cat) > /dev/null" 2>&1 \
| grep real | tr , . | awk '{print "'$i': "1000 / $2" MB/s" }'
done
use localhost
=============
e3c2: 133 MB/s → 190 MB/s
remote ssh via 1G ethernet link
===============================
e1c+: 38 MB/s → 47 MB/s
e8c2: 90 MB/s → 80 MB/s (* ^-- lcc-1.25 bad code)
e2c3: 67 MB/s → 80 MB/s (** 1G ethernet limit)
diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c
index 18251ea..8a6ae37 100644
--- a/crypto/chacha/chacha_enc.c
+++ b/crypto/chacha/chacha_enc.c
@@ -16,6 +16,262 @@
typedef unsigned int u32;
typedef unsigned char u8;
+
+#if defined(__e2k__) && __iset__ >= 3
+
+#include <stdint.h>
+#include <e2kintrin.h>
+
+#if __iset__ >= 5 /* 128-bit SIMD */
+
+/* QUARTERROUND: one ChaCha quarter round on x[a],x[b],x[c],x[d]; qpsrcw(v, 32-n) is a cyclic word shift, yielding the 16/12/8/7-bit left rotates. */
+# define QUARTERROUND(a,b,c,d) ( \
+ x[a] = __builtin_e2k_qpaddw(x[a], x[b]), x[d] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[d], x[a]), 32 - 16), \
+ x[c] = __builtin_e2k_qpaddw(x[c], x[d]), x[b] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[b], x[c]), 32 - 12), \
+ x[a] = __builtin_e2k_qpaddw(x[a], x[b]), x[d] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[d], x[a]), 32 - 8), \
+ x[c] = __builtin_e2k_qpaddw(x[c], x[d]), x[b] = __builtin_e2k_qpsrcw(__builtin_e2k_qpxor(x[b], x[c]), 32 - 7) )
+
+/* chacha20_core_x4 runs the 20 ChaCha rounds on four interleaved states (each
+ * __v2di holds the same word of 4 blocks) and writes state+input (256 bytes) to |output|. */
+static inline __attribute__((__always_inline__))
+void chacha20_core_x4(__v2di *output, const __v2di input[16])
+{
+ __v2di x[16];
+ int i;
+ memcpy(x, input, sizeof(x));
+
+ for (i = 20; i > 0; i -= 2) {
+ QUARTERROUND(0, 4, 8, 12);
+ QUARTERROUND(1, 5, 9, 13);
+ QUARTERROUND(2, 6, 10, 14);
+ QUARTERROUND(3, 7, 11, 15);
+ QUARTERROUND(0, 5, 10, 15);
+ QUARTERROUND(1, 6, 11, 12);
+ QUARTERROUND(2, 7, 8, 13);
+ QUARTERROUND(3, 4, 9, 14);
+ }
+
+ #pragma ivdep
+ for (i = 0; i < 16; ++i)
+ output[i] = __builtin_e2k_qpaddw(x[i], input[i]); /* feed-forward add of the input state */
+}
+
+void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
+ size_t len, const unsigned int key[8],
+ const unsigned int counter[4])
+{
+ u32 input[16];
+ __v2di input_x4[16];
+ __v2di buf[16];
+ size_t todo, i;
+
+ /* sigma constant "expand 32-byte k" in little-endian encoding */
+ input[0] = ((u32)('e')) | ((u32)('x') << 8) | ((u32)('p') << 16) | ((u32)('a') << 24);
+ input[1] = ((u32)('n')) | ((u32)('d') << 8) | ((u32)(' ') << 16) | ((u32)('3') << 24);
+ input[2] = ((u32)('2')) | ((u32)('-') << 8) | ((u32)('b') << 16) | ((u32)('y') << 24);
+ input[3] = ((u32)('t')) | ((u32)('e') << 8) | ((u32)(' ') << 16) | ((u32)('k') << 24);
+
+ input[4] = key[0];
+ input[5] = key[1];
+ input[6] = key[2];
+ input[7] = key[3];
+ input[8] = key[4];
+ input[9] = key[5];
+ input[10] = key[6];
+ input[11] = key[7];
+
+ input[12] = counter[0];
+ input[13] = counter[1];
+ input[14] = counter[2];
+ input[15] = counter[3];
+
+ for (i=0; i < 16; i++) {
+ unsigned long long w = input[i] * 0x100000001ULL; /* ULL: with the former signed 0x100000001LL the product overflowed long long for input[i] >= 2^31 (UB); cf. the 64-bit variant below */
+ input_x4[i] = __builtin_e2k_qppackdl(w, w);
+ }
+
+ input_x4[12] = __builtin_e2k_qpaddw(input_x4[12], (__v2di){0x100000000LL, 0x300000002LL}); /* per-block counters: +0,+1,+2,+3 across the four interleaved blocks */
+
+#pragma loop count(100)
+ while (len > 0) {
+ __v2di buf_tran[16];
+
+ chacha20_core_x4(buf, input_x4);
+
+ for (i = 0; i < 16; i+=4) {
+ const __v2di f1 = __builtin_e2k_qppackdl(0x1f1e1d1c0f0e0d0cLL, 0x1716151407060504LL);
+ const __v2di f0 = __builtin_e2k_qppackdl(0x1b1a19180b0a0908LL, 0x1312111003020100LL);
+
+ const __v2di f3 = __builtin_e2k_qppackdl(0x1f1e1d1c1b1a1918LL, 0x0f0e0d0c0b0a0908LL);
+ const __v2di f2 = __builtin_e2k_qppackdl(0x1716151413121110LL, 0x0706050403020100LL);
+
+ __v2di t0 = __builtin_e2k_qppermb(buf[i + 1], buf[i + 0], f0);
+ __v2di t1 = __builtin_e2k_qppermb(buf[i + 1], buf[i + 0], f1);
+ __v2di t2 = __builtin_e2k_qppermb(buf[i + 3], buf[i + 2], f0);
+ __v2di t3 = __builtin_e2k_qppermb(buf[i + 3], buf[i + 2], f1);
+
+ buf_tran[i/4 + 0] = __builtin_e2k_qppermb(t2, t0, f2);
+ buf_tran[i/4 + 4] = __builtin_e2k_qppermb(t3, t1, f2);
+ buf_tran[i/4 + 8] = __builtin_e2k_qppermb(t2, t0, f3);
+ buf_tran[i/4 + 12] = __builtin_e2k_qppermb(t3, t1, f3);
+ }
+
+ todo = sizeof(buf); /* 256 bytes == four keystream blocks */
+ if (__builtin_expect(len < todo, 0)) {
+ todo = len & ~(size_t)15; /* whole 16-byte vectors of the tail */
+
+ #pragma ivdep
+ #pragma loop count(15)
+ for (i = 0; i < todo; i+=16) {
+ *(__v2di *)&out[i] = __builtin_e2k_qpxor(*(__v2di *)&inp[i], buf_tran[i / 16]);
+ }
+ #pragma ivdep
+ #pragma loop count(15)
+ for (; i < len; i++) {
+ out[i] = inp[i] ^ ((u8 *)buf_tran)[i]; /* byte-wise remainder */
+ }
+ return;
+ }
+
+ #pragma ivdep
+ #pragma unroll(16)
+ for (i = 0; i < todo; i+=16) {
+ *(__v2di *)&out[i] = __builtin_e2k_qpxor(*(__v2di *)&inp[i], buf_tran[i / 16]);
+ }
+
+ /*
+ * Advance 32-bit counters. Note that as subroutine is so to
+ * say nonce-agnostic, this limited counter width doesn't
+ * prevent caller from implementing wider counter. It would
+ * simply take two calls split on counter overflow...
+ */
+ input_x4[12] = __builtin_e2k_qpaddw(input_x4[12], (__v2di){0x400000004LL, 0x400000004LL});
+
+ out += todo;
+ inp += todo;
+ len -= todo;
+ }
+}
+
+#elif __iset__ >= 3 /* 64-bit SIMD */
+
+/* QUARTERROUND: one ChaCha quarter round on x[a],x[b],x[c],x[d]; the 16- and 8-bit rotates use byte shuffles, the 12- and 7-bit rotates use shift/or pairs. */
+# define QUARTERROUND(a,b,c,d) ( \
+ x[a]=__builtin_e2k_paddw(x[a],x[b]), tt=__builtin_e2k_pxord(x[d],x[a]), x[d]=__builtin_e2k_pshufb(tt,tt,0x0504070601000302ull), \
+ x[c]=__builtin_e2k_paddw(x[c],x[d]), tt=__builtin_e2k_pxord(x[b],x[c]), x[b]=__builtin_e2k_pord(__builtin_e2k_psllw(tt,12), __builtin_e2k_psrlw(tt,32-12)),\
+ x[a]=__builtin_e2k_paddw(x[a],x[b]), tt=__builtin_e2k_pxord(x[d],x[a]), x[d]=__builtin_e2k_pshufb(tt,tt,0x0605040702010003ull), \
+ x[c]=__builtin_e2k_paddw(x[c],x[d]), tt=__builtin_e2k_pxord(x[b],x[c]), x[b]=__builtin_e2k_pord(__builtin_e2k_psllw(tt, 7), __builtin_e2k_psrlw(tt,32- 7)) )
+
+/* ChaCha20_ctr32, 64-bit SIMD variant: XORs the ChaCha20 keystream over |inp|,
+ * processing two interleaved blocks (128 bytes) per loop iteration. */
+void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
+ size_t len, const unsigned int key[8],
+ const unsigned int counter[4])
+{
+ u32 input[16];
+ uint64_t input_x2[16];
+ uint64_t buf[16];
+ size_t todo, i;
+
+ /* sigma constant "expand 32-byte k" in little-endian encoding */
+ input[0] = ((u32)('e')) | ((u32)('x') << 8) | ((u32)('p') << 16) | ((u32)('a') << 24);
+ input[1] = ((u32)('n')) | ((u32)('d') << 8) | ((u32)(' ') << 16) | ((u32)('3') << 24);
+ input[2] = ((u32)('2')) | ((u32)('-') << 8) | ((u32)('b') << 16) | ((u32)('y') << 24);
+ input[3] = ((u32)('t')) | ((u32)('e') << 8) | ((u32)(' ') << 16) | ((u32)('k') << 24);
+
+ input[4] = key[0];
+ input[5] = key[1];
+ input[6] = key[2];
+ input[7] = key[3];
+ input[8] = key[4];
+ input[9] = key[5];
+ input[10] = key[6];
+ input[11] = key[7];
+
+ input[12] = counter[0];
+ input[13] = counter[1];
+ input[14] = counter[2];
+ input[15] = counter[3];
+
+#pragma unroll(16)
+ for (i=0; i < 16; i++) {
+ input_x2[i] = input[i] * 0x100000001ull; /* replicate each state word into both 32-bit halves */
+ }
+
+ input_x2[12] = __builtin_e2k_paddw(input_x2[12], 0x100000000ull); /* second block's counter is one ahead (upper half) */
+
+ #pragma loop count(100)
+ while (len > 0) {
+ uint64_t buf_tran[16];
+ uint64_t x[16], tt;
+ uint64_t * __restrict__ outw = (uint64_t *)out;
+
+ for (i = 0; i < 16; ++i) x[i] = input_x2[i];
+
+ for (i = 20; i > 0; i -= 2) {
+ QUARTERROUND(0, 4, 8, 12);
+ QUARTERROUND(1, 5, 9, 13);
+ QUARTERROUND(2, 6, 10, 14);
+ QUARTERROUND(3, 7, 11, 15);
+ QUARTERROUND(0, 5, 10, 15);
+ QUARTERROUND(1, 6, 11, 12);
+ QUARTERROUND(2, 7, 8, 13);
+ QUARTERROUND(3, 4, 9, 14);
+ }
+
+ #pragma ivdep
+ for (i = 0; i < 16; ++i)
+ buf[i] = __builtin_e2k_paddw(x[i], input_x2[i]); /* feed-forward add of the input state */
+
+ #pragma unroll(8)
+ #pragma ivdep
+ for (i = 0; i < 16; i+=2) {
+ const uint64_t fmtl = 0x0b0a090803020100ull;
+ const uint64_t fmtr = 0x0f0e0d0c07060504ull;
+
+ buf_tran[i/2 + 0] = __builtin_e2k_pshufb(buf[i + 1], buf[i + 0], fmtl);
+ buf_tran[i/2 + 8] = __builtin_e2k_pshufb(buf[i + 1], buf[i + 0], fmtr);
+ }
+
+ todo = sizeof(buf); /* 128 bytes == two keystream blocks */
+ if (__builtin_expect(len < todo, 0)) {
+ todo = len & ~(size_t)7; /* whole 8-byte words of the tail */
+
+ #pragma ivdep
+ #pragma loop count(16)
+ for (i = 0; i < todo; i+=8) {
+ *(uint64_t *)&out[i] = __builtin_e2k_pxord(*(uint64_t *)&inp[i], buf_tran[i / 8]);
+ }
+ #pragma ivdep
+ #pragma loop count(7)
+ for (; i < len; i++) {
+ out[i] = inp[i] ^ ((u8 *)buf_tran)[i]; /* byte-wise remainder */
+ }
+ return;
+ }
+
+ #pragma ivdep
+ #pragma unroll(16)
+ for (i = 0; i < todo; i+=8) {
+ *outw++ = __builtin_e2k_pxord(*(uint64_t *)&inp[i], buf_tran[i / 8]);
+ }
+
+ /*
+ * Advance 32-bit counters. Note that as subroutine is so to
+ * say nonce-agnostic, this limited counter width doesn't
+ * prevent caller from implementing wider counter. It would
+ * simply take two calls split on counter overflow...
+ */
+ input_x2[12] = __builtin_e2k_paddw(input_x2[12], 0x200000002ull); /* both block counters advance by 2 */
+
+ out += todo;
+ inp += todo;
+ len -= todo;
+ }
+}
+#endif /* E2Kv5+ / E2Kv3..E2Kv4 */
+
+#else /* Generic version */
typedef union {
u32 u[16];
u8 c[64];
@@ -128,3 +384,4 @@ void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
input[12]++;
}
}
+#endif