mcst-linux-kernel/patches-2024.06.26/openssl-1.1.1n/0015-E2Kv7-AESCTR+GHASH-sti...

239 lines
8.8 KiB
Diff

Subject: E2Kv7-AESCTR+GHASH stitch
From: Alexander Troosh <trush@yandex.ru>
1) Raw AES-GCM speed test on the e8v7 prototype (12.5 MHz):
for m in aes-128-gcm aes-192-gcm aes-256-gcm
do
openssl speed -evp $m 2>/dev/null | tail -1
done
16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
aes-128-gcm 718.71k 1839.57k 4777.64k 8408.75k 9843.61k 10514.01k
aes-192-gcm 690.14k 1769.94k 4647.08k 7830.53k 9407.15k 9480.87k
aes-256-gcm 677.34k 1691.39k 4393.13k 7315.11k 8710.83k 8770.90k
=>
16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
aes-128-gcm 722.58k 1789.15k 5976.42k 10739.64k 15095.88k 16481.09k
aes-192-gcm 682.42k 1994.80k 5686.80k 10345.84k 14425.04k 14696.45k
aes-256-gcm 632.85k 1600.49k 4942.17k 10379.95k 15357.27k 15870.63k
2) Decryption speed ("openssl speed -decrypt -evp $m 2>/dev/null"):
16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
aes-128-gcm 717.25k 2494.23k 6970.20k 12576.43k 16531.46k 16913.75k
aes-192-gcm 708.31k 2430.27k 6854.57k 12502.36k 16506.88k 16902.83k
aes-256-gcm 671.55k 2428.37k 6629.72k 11624.11k 14898.52k 15215.27k
3) Speed of a secure channel (with openssh-9.1p1), from the e8v7 prototype to an x86-64 server with AES-NI:
for i in `ssh -Q cipher`
do
dd if=/dev/zero bs=1M count=100 2> /dev/null \
| /usr/bin/ssh -c $i root@x86server "(/usr/bin/time -p cat) > /dev/null" 2>&1 \
| grep real | tr , . | awk '{print "'$i': "100 / $2" MB/s" }'
done
aes128-ctr: 1.2577 MB/s
aes192-ctr: 1.5863 MB/s
aes256-ctr: 1.7280 MB/s
aes128-gcm@openssh.com: 2.7293 MB/s
aes256-gcm@openssh.com: 2.2936 MB/s
chacha20-poly1305@openssh.com: 1.8222 MB/s
=>
aes128-ctr: 1.6326 MB/s
aes192-ctr: 1.4080 MB/s
aes256-ctr: 1.5211 MB/s
aes128-gcm@openssh.com: 3.0675 MB/s
aes256-gcm@openssh.com: 2.5208 MB/s
chacha20-poly1305@openssh.com: 1.6963 MB/s
diff --git a/crypto/evp/e_aes.c b/crypto/evp/e_aes.c
index 9319da9..6a8063d 100644
--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@@ -2523,7 +2523,19 @@ void e2kv7_ctr32_encrypt_blocks_x4(const unsigned char *in,
unsigned char *out,
size_t blocks,
const void *key, const unsigned char *ivec);
-# define e2k_ctr32_encrypt_blocks e2kv7_ctr32_encrypt_blocks_x4
+# define e2k_ctr32_encrypt_blocks e2kv7_ctr32_encrypt_blocks_x4
+
+/*
+ * Stitched AES-CTR + GHASH bulk routines for E2Kv7, defined in
+ * crypto/modes/gcm128.c.  Each one processes the whole 16-byte blocks
+ * contained in |len|, updates |ivec| and |Xi| in place and returns the
+ * number of bytes it handled (|len| rounded down to a multiple of 16).
+ */
+size_t e2kv7_aes_gcm_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t len, const void *key,
+                             unsigned char ivec[16], u64 *Xi);
+size_t e2kv7_aes_gcm_decrypt(const unsigned char *in, unsigned char *out,
+                             size_t len, const void *key,
+                             unsigned char ivec[16], u64 *Xi);
+
+/* Map the generic AES-GCM bulk hook names onto the E2Kv7 routines. */
+# define AES_gcm_encrypt e2kv7_aes_gcm_encrypt
+# define AES_gcm_decrypt e2kv7_aes_gcm_decrypt
+
+/*
+ * Non-zero when the context's CTR routine is the E2Kv7 one.  The argument
+ * is parenthesized so that any pointer expression can be passed safely.
+ */
+# define AES_GCM_ASM(gctx) ((gctx)->ctr == e2kv7_ctr32_encrypt_blocks_x4)
# else
@@ -2531,7 +2543,7 @@ void e2kv2_ctr32_encrypt_blocks_x2(const unsigned char *in,
unsigned char *out,
size_t blocks,
const void *key, const unsigned char *ivec);
-# define e2k_ctr32_encrypt_blocks e2kv2_ctr32_encrypt_blocks_x2
+# define e2k_ctr32_encrypt_blocks e2kv2_ctr32_encrypt_blocks_x2
# endif
# endif
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index 7923ea8..babb657 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -295,6 +295,149 @@ void e2kv7_ctr32_encrypt_blocks_x4(const void *in, void *out,
}
}
+/*
+ * Stitched AES-CTR encryption + GHASH for E2Kv7.
+ *
+ * Encrypts the whole 16-byte blocks contained in |len| bytes of |in| into
+ * |out| in counter mode and folds every produced ciphertext block into the
+ * GHASH accumulator.  A trailing partial block is not processed.
+ *
+ *   in, out  source and destination buffers
+ *   len      input length in bytes; len / 16 blocks are processed
+ *   key      AES_KEY providing the round keys and the round count
+ *   ivec     16-byte counter block; written back advanced by the number
+ *            of processed blocks
+ *   Xi       GHASH accumulator (vector 0); vectors 2..5 counted from Xi
+ *            are read as the hash-key values Hp0..Hp3
+ *            NOTE(review): presumably H^1..H^4 laid out after Xi and H in
+ *            the GCM context - confirm against the key-setup code.
+ *
+ * Returns the number of bytes processed (|len| rounded down to 16).
+ */
+size_t e2kv7_aes_gcm_encrypt(const unsigned char *in, unsigned char *out,
+                             size_t len, const void *key,
+                             unsigned char ivec[16], u64 *Xi)
+{
+    /* Counter steps; only the lowest 32-bit lane of each is non-zero. */
+    const __v2di step1 = (__v2di){1LL, 0LL};
+    const __v2di step4 = (__v2di){4LL, 0LL};
+
+    __v2di x = reverse_vector( ((__v2di *)Xi)[0] );
+    __v2di *Hp = &((__v2di *)Xi)[2];
+    __v2di Hp0 = Hp[0], Hp1 = Hp[1], Hp2 = Hp[2], Hp3 = Hp[3];
+
+    /*
+     * GCM's inc32 advances only the 32-bit counter word, modulo 2^32.
+     * The counters are therefore stepped with a 32-bit lane add (qpaddw):
+     * a 64-bit lane add would carry into the IV part of the counter block
+     * when the counter word wraps.
+     * NOTE(review): assumes reverse_vector() byte-reverses the whole
+     * 128-bit block, so the big-endian counter is the lowest 32-bit lane.
+     */
+    __v2di y0 = reverse_vector(*(__v2di *)ivec);
+    __v2di y1 = __builtin_e2k_qpaddw(y0, step1);
+    __v2di y2 = __builtin_e2k_qpaddw(y0, (__v2di){2LL, 0LL});
+    __v2di y3 = __builtin_e2k_qpaddw(y0, (__v2di){3LL, 0LL});
+
+    const __v2di * __restrict__ input = (const __v2di *)in;
+    __v2di * __restrict__ output = (__v2di *)out;
+
+    const AES_KEY *aes_key = (const AES_KEY *)key;
+    const __v2di * __restrict__ rd_key = (const __v2di *)aes_key->rd_key;
+    const int rounds = aes_key->rounds;
+    size_t blocks = len / 16, i;
+
+    /* Main loop: four blocks per iteration. */
+#pragma unroll(1)
+#pragma loop count(1000)
+    while (blocks >= 4)
+    {
+        __v2di m0 = input[0] ^ aes_encrypt_e2kv7(reverse_vector(y0), rd_key, rounds);
+        __v2di m1 = input[1] ^ aes_encrypt_e2kv7(reverse_vector(y1), rd_key, rounds);
+        __v2di m2 = input[2] ^ aes_encrypt_e2kv7(reverse_vector(y2), rd_key, rounds);
+        __v2di m3 = input[3] ^ aes_encrypt_e2kv7(reverse_vector(y3), rd_key, rounds);
+
+        output[0] = m0;
+        output[1] = m1;
+        output[2] = m2;
+        output[3] = m3;
+
+        /* Fold the four ciphertext blocks into the accumulator at once. */
+        x ^= reverse_vector(m0);
+        x = gcm_multiply_x4(Hp0, Hp1, Hp2, Hp3,
+                            reverse_vector(m3),
+                            reverse_vector(m2),
+                            reverse_vector(m1),
+                            x);
+
+        y0 = __builtin_e2k_qpaddw(y0, step4);
+        y1 = __builtin_e2k_qpaddw(y1, step4);
+        y2 = __builtin_e2k_qpaddw(y2, step4);
+        y3 = __builtin_e2k_qpaddw(y3, step4);
+
+        input += 4;
+        output += 4;
+        blocks -= 4;
+    }
+
+    /* Tail: the remaining 0..3 whole blocks, one at a time. */
+#pragma loop count(3)
+    for (i = 0; i < blocks; i++)
+    {
+        __v2di m0 = input[i] ^ aes_encrypt_e2kv7(reverse_vector(y0), rd_key, rounds);
+
+        output[i] = m0;
+
+        x ^= reverse_vector(m0);
+        x = gcm_multiply(Hp0, x);
+
+        y0 = __builtin_e2k_qpaddw(y0, step1);
+    }
+
+    /* Hand the advanced counter and the updated hash back to the caller. */
+    *(__v2di *)ivec = reverse_vector(y0);
+    ((__v2di *)Xi)[0] = reverse_vector(x);
+
+    return len & ~(size_t)(16 - 1);
+}
+
+/*
+ * Stitched GHASH + AES-CTR decryption for E2Kv7.
+ *
+ * Folds every whole 16-byte ciphertext block contained in |len| bytes of
+ * |in| into the GHASH accumulator and writes the counter-mode decryption
+ * of that block to |out|.  A trailing partial block is not processed.
+ *
+ *   in, out  source and destination buffers
+ *   len      input length in bytes; len / 16 blocks are processed
+ *   key      AES_KEY providing the round keys and the round count
+ *   ivec     16-byte counter block; written back advanced by the number
+ *            of processed blocks
+ *   Xi       GHASH accumulator (vector 0); vectors 2..5 counted from Xi
+ *            are read as the hash-key values Hp0..Hp3
+ *            NOTE(review): presumably H^1..H^4 laid out after Xi and H in
+ *            the GCM context - confirm against the key-setup code.
+ *
+ * Returns the number of bytes processed (|len| rounded down to 16).
+ */
+size_t e2kv7_aes_gcm_decrypt(const unsigned char *in, unsigned char *out,
+                             size_t len, const void *key,
+                             unsigned char ivec[16], u64 *Xi)
+{
+    /* Counter steps; only the lowest 32-bit lane of each is non-zero. */
+    const __v2di step1 = (__v2di){1LL, 0LL};
+    const __v2di step4 = (__v2di){4LL, 0LL};
+
+    __v2di x = reverse_vector( ((__v2di *)Xi)[0] );
+    __v2di *Hp = &((__v2di *)Xi)[2];
+    __v2di Hp0 = Hp[0], Hp1 = Hp[1], Hp2 = Hp[2], Hp3 = Hp[3];
+
+    /*
+     * GCM's inc32 advances only the 32-bit counter word, modulo 2^32.
+     * The counters are therefore stepped with a 32-bit lane add (qpaddw):
+     * a 64-bit lane add would carry into the IV part of the counter block
+     * when the counter word wraps.
+     * NOTE(review): assumes reverse_vector() byte-reverses the whole
+     * 128-bit block, so the big-endian counter is the lowest 32-bit lane.
+     */
+    __v2di y0 = reverse_vector(*(__v2di *)ivec);
+    __v2di y1 = __builtin_e2k_qpaddw(y0, step1);
+    __v2di y2 = __builtin_e2k_qpaddw(y0, (__v2di){2LL, 0LL});
+    __v2di y3 = __builtin_e2k_qpaddw(y0, (__v2di){3LL, 0LL});
+
+    const __v2di * __restrict__ input = (const __v2di *)in;
+    __v2di * __restrict__ output = (__v2di *)out;
+
+    const AES_KEY *aes_key = (const AES_KEY *)key;
+    const __v2di * __restrict__ rd_key = (const __v2di *)aes_key->rd_key;
+    const int rounds = aes_key->rounds;
+    size_t blocks = len / 16, i;
+
+    /* Main loop: four blocks per iteration. */
+#pragma unroll(1)
+#pragma loop count(1000)
+    while (blocks >= 4)
+    {
+        /* Load all four ciphertext blocks before any output is stored. */
+        __v2di m0 = input[0];
+        __v2di m1 = input[1];
+        __v2di m2 = input[2];
+        __v2di m3 = input[3];
+
+        /* The hash is taken over the ciphertext, i.e. the input here. */
+        x ^= reverse_vector(m0);
+        x = gcm_multiply_x4(Hp0, Hp1, Hp2, Hp3,
+                            reverse_vector(m3),
+                            reverse_vector(m2),
+                            reverse_vector(m1),
+                            x);
+
+        output[0] = m0 ^ aes_encrypt_e2kv7(reverse_vector(y0), rd_key, rounds);
+        output[1] = m1 ^ aes_encrypt_e2kv7(reverse_vector(y1), rd_key, rounds);
+        output[2] = m2 ^ aes_encrypt_e2kv7(reverse_vector(y2), rd_key, rounds);
+        output[3] = m3 ^ aes_encrypt_e2kv7(reverse_vector(y3), rd_key, rounds);
+
+        y0 = __builtin_e2k_qpaddw(y0, step4);
+        y1 = __builtin_e2k_qpaddw(y1, step4);
+        y2 = __builtin_e2k_qpaddw(y2, step4);
+        y3 = __builtin_e2k_qpaddw(y3, step4);
+
+        input += 4;
+        output += 4;
+        blocks -= 4;
+    }
+
+    /* Tail: the remaining 0..3 whole blocks, one at a time. */
+#pragma loop count(3)
+    for (i = 0; i < blocks; i++)
+    {
+        __v2di m0 = input[i];
+
+        x ^= reverse_vector(m0);
+        x = gcm_multiply(Hp0, x);
+
+        output[i] = m0 ^ aes_encrypt_e2kv7(reverse_vector(y0), rd_key, rounds);
+
+        y0 = __builtin_e2k_qpaddw(y0, step1);
+    }
+
+    /* Hand the advanced counter and the updated hash back to the caller. */
+    *(__v2di *)ivec = reverse_vector(y0);
+    ((__v2di *)Xi)[0] = reverse_vector(x);
+
+    return len & ~(size_t)(16 - 1);
+}
+
+
#endif /* For E2Kv7+ only */
#endif /* For E2Kv6+ only */