Align critical-path loops in AES and ChaCha.

riastradh 2020-07-27 20:53:22 +00:00
parent 6bb4d9815b
commit 57324de2aa
4 changed files with 40 additions and 4 deletions
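Note on the change: the `_ALIGN_TEXT` lines added below pad the instruction stream so that each critical loop head (the `1:` labels) begins on an aligned boundary rather than at whatever address the preceding straight-line code happens to end, which tends to help instruction fetch and decode. Where a loop is entered at its `2:` midpoint, the existing `b 2f`/`jmp 2f` already branches past the padding, so it never executes on the hot path; where execution falls through into the loop, the padding costs at most a few no-ops, once. `_ALIGN_TEXT` itself is defined per machine in <machine/asm.h>; the exact boundary is port-specific, but as a rough illustration (the 16-byte value and the `.p2align` spelling are assumptions here, not necessarily what any particular NetBSD port uses) it amounts to something like:

/* Illustrative only: the real macro is machine-dependent and lives in
 * <machine/asm.h>; the 16-byte boundary is an assumed example value. */
#define _ALIGN_TEXT	.p2align 4	/* pad .text with no-ops out to a 16-byte boundary */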

aes_armv8_64.S

@ -1,4 +1,4 @@
/* $NetBSD: aes_armv8_64.S,v 1.8 2020/07/25 22:33:04 riastradh Exp $ */
/* $NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@ -440,6 +440,7 @@ END(aesarmv8_setenckey256)
ENTRY(aesarmv8_enctodec)
ldr q0, [x0, x2, lsl #4] /* load last round key */
b 2f
_ALIGN_TEXT
1: aesimc v0.16b, v0.16b /* convert encryption to decryption */
2: str q0, [x1], #0x10 /* store round key */
subs x2, x2, #1 /* count down round */
@ -503,6 +504,7 @@ ENTRY(aesarmv8_cbc_enc)
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
ldr q0, [x4] /* q0 := chaining value */
_ALIGN_TEXT
1: ldr q1, [x1], #0x10 /* q1 := plaintext block */
eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */
mov x0, x9 /* x0 := enckey */
@ -539,6 +541,7 @@ ENTRY(aesarmv8_cbc_dec1)
ldr q0, [x1, #-0x10]! /* q0 := last ciphertext block */
str q0, [x4] /* update iv */
b 2f
_ALIGN_TEXT
1: ldr q31, [x1, #-0x10]! /* q31 := chaining value */
eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */
str q0, [x2, #-0x10]! /* store plaintext block */
@ -576,6 +579,7 @@ ENTRY(aesarmv8_cbc_dec8)
ldp q6, q7, [x1, #-0x20]! /* q6, q7 := last ciphertext blocks */
str q7, [x4] /* update iv */
b 2f
_ALIGN_TEXT
1: ldp q6, q7, [x1, #-0x20]!
eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */
stp q0, q1, [x2, #-0x20]!
@ -629,6 +633,7 @@ ENTRY(aesarmv8_xts_enc1)
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
ldr q31, [x4] /* q31 := tweak */
_ALIGN_TEXT
1: ldr q0, [x1], #0x10 /* q0 := ptxt */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
@ -661,6 +666,7 @@ ENTRY(aesarmv8_xts_enc8)
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
ldr q31, [x4] /* q31 := tweak */
_ALIGN_TEXT
1: mov v24.16b, v31.16b /* q24 := tweak[0] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v25.16b, v31.16b /* q25 := tweak[1] */
@ -729,6 +735,7 @@ ENTRY(aesarmv8_xts_dec1)
mov x9, x0 /* x9 := deckey */
mov x10, x3 /* x10 := nbytes */
ldr q31, [x4] /* q31 := tweak */
_ALIGN_TEXT
1: ldr q0, [x1], #0x10 /* q0 := ctxt */
mov x0, x9 /* x0 := deckey */
mov x3, x5 /* x3 := nrounds */
@ -761,6 +768,7 @@ ENTRY(aesarmv8_xts_dec8)
mov x9, x0 /* x9 := deckey */
mov x10, x3 /* x10 := nbytes */
ldr q31, [x4] /* q31 := tweak */
_ALIGN_TEXT
1: mov v24.16b, v31.16b /* q24 := tweak[0] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v25.16b, v31.16b /* q25 := tweak[1] */
@ -879,6 +887,7 @@ ENTRY(aesarmv8_cbcmac_update1)
ldr q0, [x3] /* q0 := initial authenticator */
mov x9, x0 /* x9 := enckey */
mov x5, x3 /* x5 := &auth (enc1 trashes x3) */
_ALIGN_TEXT
1: ldr q1, [x1], #0x10 /* q1 := plaintext block */
mov x0, x9 /* x0 := enckey */
mov x3, x4 /* x3 := nrounds */
@ -913,6 +922,7 @@ ENTRY(aesarmv8_ccm_enc1)
#if _BYTE_ORDER == _LITTLE_ENDIAN
rev32 v2.16b, v2.16b /* q2 := ctr (host-endian) */
#endif
_ALIGN_TEXT
1: ldr q3, [x1], #0x10 /* q3 := plaintext block */
add v2.4s, v2.4s, v5.4s /* increment ctr (32-bit) */
mov x0, x9 /* x0 := enckey */
@ -972,6 +982,7 @@ ENTRY(aesarmv8_ccm_dec1)
bl aesarmv8_enc1 /* q0 := pad; trash x0/x3/q16 */
b 2f
_ALIGN_TEXT
1: /*
* Authenticate the last block and decrypt the next block
* simultaneously.
@ -1031,6 +1042,7 @@ END(ctr32_inc)
aesarmv8_enc1:
ldr q16, [x0], #0x10 /* load round key */
b 2f
_ALIGN_TEXT
1: /* q0 := MixColumns(q0) */
aesmc v0.16b, v0.16b
2: subs x3, x3, #1
@ -1056,6 +1068,7 @@ END(aesarmv8_enc1)
aesarmv8_enc2:
ldr q16, [x0], #0x10 /* load round key */
b 2f
_ALIGN_TEXT
1: /* q[i] := MixColumns(q[i]) */
aesmc v0.16b, v0.16b
aesmc v1.16b, v1.16b
@ -1085,6 +1098,7 @@ END(aesarmv8_enc2)
aesarmv8_enc8:
ldr q16, [x0], #0x10 /* load round key */
b 2f
_ALIGN_TEXT
1: /* q[i] := MixColumns(q[i]) */
aesmc v0.16b, v0.16b
aesmc v1.16b, v1.16b
@ -1131,6 +1145,7 @@ END(aesarmv8_enc8)
aesarmv8_dec1:
ldr q16, [x0], #0x10 /* load round key */
b 2f
_ALIGN_TEXT
1: /* q0 := InvMixColumns(q0) */
aesimc v0.16b, v0.16b
2: subs x3, x3, #1
@ -1157,6 +1172,7 @@ END(aesarmv8_dec1)
aesarmv8_dec8:
ldr q16, [x0], #0x10 /* load round key */
b 2f
_ALIGN_TEXT
1: /* q[i] := InvMixColumns(q[i]) */
aesimc v0.16b, v0.16b
aesimc v1.16b, v1.16b

aes_neon_32.S

@ -1,4 +1,4 @@
/* $NetBSD: aes_neon_32.S,v 1.2 2020/07/27 20:52:10 riastradh Exp $ */
/* $NetBSD: aes_neon_32.S,v 1.3 2020/07/27 20:53:22 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@ -316,6 +316,7 @@ ENTRY(aes_neon_enc1)
b 2f
_ALIGN_TEXT
1: vld1.64 {d28-d29}, [r0 :128]! /* q14 = *rk++ */
/* q0 := A = rk[i] + sb1_0(io) + sb1_1(jo) */
@ -535,6 +536,7 @@ ENTRY(aes_neon_dec1)
b 2f
_ALIGN_TEXT
1: /* load dsbd */
add r4, r12, #(dsbd_0 - .Lconstants)
vld1.64 {d16-d17}, [r4 :128]! /* q8 := dsbd[0] */

aes_ni_64.S

@ -1,4 +1,4 @@
/* $NetBSD: aes_ni_64.S,v 1.4 2020/07/25 22:29:06 riastradh Exp $ */
/* $NetBSD: aes_ni_64.S,v 1.5 2020/07/27 20:53:22 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@ -523,6 +523,7 @@ ENTRY(aesni_enctodec)
movdqa (%rdi,%rdx),%xmm0 /* load last round key */
movdqa %xmm0,(%rsi) /* store last round key verbatim */
jmp 2f
_ALIGN_TEXT
1: movdqa (%rdi,%rdx),%xmm0 /* load round key */
aesimc %xmm0,%xmm0 /* convert encryption to decryption */
movdqa %xmm0,(%rsi) /* store round key */
@ -580,6 +581,7 @@ ENTRY(aesni_cbc_enc)
jz 2f
mov %rcx,%r10 /* r10 := nbytes */
movdqu (%r8),%xmm0 /* xmm0 := chaining value */
_ALIGN_TEXT
1: movdqu (%rsi),%xmm1 /* xmm1 := plaintext block */
lea 0x10(%rsi),%rsi
pxor %xmm1,%xmm0 /* xmm0 := cv ^ ptxt */
@ -615,6 +617,7 @@ ENTRY(aesni_cbc_dec1)
movdqu -0x10(%rsi,%r10),%xmm0 /* xmm0 := last ciphertext block */
movdqu %xmm0,(%r8) /* update iv */
jmp 2f
_ALIGN_TEXT
1: movdqu -0x10(%rsi,%r10),%xmm8 /* xmm8 := chaining value */
pxor %xmm8,%xmm0 /* xmm0 := ptxt */
movdqu %xmm0,(%rdx,%r10) /* store plaintext block */
@ -650,6 +653,7 @@ ENTRY(aesni_cbc_dec8)
movdqu -0x10(%rsi,%r10),%xmm7 /* xmm7 := ciphertext block[n-1] */
movdqu %xmm7,(%r8) /* update iv */
jmp 2f
_ALIGN_TEXT
1: movdqu -0x10(%rsi,%r10),%xmm7 /* xmm7 := cv[0] */
pxor %xmm7,%xmm0 /* xmm0 := ptxt[0] */
movdqu %xmm0,(%rdx,%r10) /* store plaintext block */
@ -706,6 +710,7 @@ END(aesni_cbc_dec8)
ENTRY(aesni_xts_enc1)
mov %rcx,%r10 /* r10 := nbytes */
movdqu (%r8),%xmm15 /* xmm15 := tweak */
_ALIGN_TEXT
1: movdqu (%rsi),%xmm0 /* xmm0 := ptxt */
lea 0x10(%rsi),%rsi /* advance rsi to next block */
pxor %xmm15,%xmm0 /* xmm0 := ptxt ^ tweak */
@ -738,6 +743,7 @@ ENTRY(aesni_xts_enc8)
sub $0x10,%rsp
mov %rcx,%r10 /* r10 := nbytes */
movdqu (%r8),%xmm15 /* xmm15 := tweak[0] */
_ALIGN_TEXT
1: movdqa %xmm15,%xmm8 /* xmm8 := tweak[0] */
call aesni_xts_mulx /* xmm15 := tweak[1] */
movdqa %xmm15,%xmm9 /* xmm9 := tweak[1] */
@ -812,6 +818,7 @@ END(aesni_xts_enc8)
ENTRY(aesni_xts_dec1)
mov %rcx,%r10 /* r10 := nbytes */
movdqu (%r8),%xmm15 /* xmm15 := tweak */
_ALIGN_TEXT
1: movdqu (%rsi),%xmm0 /* xmm0 := ctxt */
lea 0x10(%rsi),%rsi /* advance rsi to next block */
pxor %xmm15,%xmm0 /* xmm0 := ctxt ^ tweak */
@ -844,6 +851,7 @@ ENTRY(aesni_xts_dec8)
sub $0x10,%rsp
mov %rcx,%r10 /* r10 := nbytes */
movdqu (%r8),%xmm15 /* xmm15 := tweak[0] */
_ALIGN_TEXT
1: movdqa %xmm15,%xmm8 /* xmm8 := tweak[0] */
call aesni_xts_mulx /* xmm15 := tweak[1] */
movdqa %xmm15,%xmm9 /* xmm9 := tweak[1] */
@ -964,6 +972,7 @@ ENTRY(aesni_cbcmac_update1)
movdqu (%rcx),%xmm0 /* xmm0 := auth */
mov %rdx,%r10 /* r10 := nbytes */
mov %rcx,%rdx /* rdx := &auth */
_ALIGN_TEXT
1: pxor (%rsi),%xmm0 /* xmm0 ^= plaintext block */
lea 0x10(%rsi),%rsi
mov %r8d,%ecx /* ecx := nrounds */
@ -992,6 +1001,7 @@ ENTRY(aesni_ccm_enc1)
movdqa ctr32_inc(%rip),%xmm5 /* xmm5 := (0,0,0,1) (le) */
movdqu (%r8),%xmm0 /* xmm0 := auth */
pshufb %xmm4,%xmm2 /* xmm2 := ctr (le) */
_ALIGN_TEXT
1: movdqu (%rsi),%xmm3 /* xmm3 := plaintext block */
paddd %xmm5,%xmm2 /* increment ctr (32-bit) */
lea 0x10(%rsi),%rsi
@ -1040,6 +1050,7 @@ ENTRY(aesni_ccm_dec1)
call aesni_enc1 /* xmm0 := pad; trash rax/rcx/xmm8 */
jmp 2f
_ALIGN_TEXT
1: /*
* Authenticate the last block and decrypt the next block
* simultaneously.
@ -1103,6 +1114,7 @@ aesni_enc1:
lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */
neg %rcx /* rcx := byte offset of round key from end */
jmp 2f
_ALIGN_TEXT
1: aesenc %xmm8,%xmm0
2: movdqa (%rax,%rcx),%xmm8 /* load round key */
add $0x10,%rcx
@ -1130,6 +1142,7 @@ aesni_enc2:
pxor %xmm8,%xmm0 /* xor in first round key */
pxor %xmm8,%xmm1
jmp 2f
_ALIGN_TEXT
1: aesenc %xmm8,%xmm0
aesenc %xmm8,%xmm1
2: movdqa (%rax,%rcx),%xmm8 /* load round key */
@ -1165,6 +1178,7 @@ aesni_enc8:
lea 0x10(%rdi,%rcx),%rax /* rax := end of round key array */
neg %rcx /* rcx := byte offset of round key from end */
jmp 2f
_ALIGN_TEXT
1: aesenc %xmm8,%xmm0
aesenc %xmm8,%xmm1
aesenc %xmm8,%xmm2
@ -1204,6 +1218,7 @@ aesni_dec1:
lea 0x10(%rdi,%rcx),%rax /* rax := pointer to round key */
neg %rcx /* rcx := byte offset of round key from end */
jmp 2f
_ALIGN_TEXT
1: aesdec %xmm8,%xmm0
2: movdqa (%rax,%rcx),%xmm8 /* load round key */
add $0x10,%rcx
@ -1237,6 +1252,7 @@ aesni_dec8:
lea 0x10(%rdi,%rcx),%rax /* rax := pointer to round key */
neg %rcx /* rcx := byte offset of round key from end */
jmp 2f
_ALIGN_TEXT
1: aesdec %xmm8,%xmm0
aesdec %xmm8,%xmm1
aesdec %xmm8,%xmm2

chacha_neon_64.S

@ -1,4 +1,4 @@
/* $NetBSD: chacha_neon_64.S,v 1.2 2020/07/27 20:50:25 riastradh Exp $ */
/* $NetBSD: chacha_neon_64.S,v 1.3 2020/07/27 20:53:23 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@ -201,6 +201,7 @@ ENTRY(chacha_stream256_neon)
mov w11, v14.s[0]
mov w12, v15.s[0]
_ALIGN_TEXT
1: subs w5, w5, #2
ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
v28,v29,v30,v31, v27)
@ -339,6 +340,7 @@ ENTRY(chacha_stream_xor256_neon)
mov w11, v14.s[0]
mov w12, v15.s[0]
_ALIGN_TEXT
1: subs w6, w6, #2
ROUND(v0,v1,v2,v3, v4,v5,v6,v7, v8,v9,v10,v11, v12,v13,v14,v15,
v28,v29,v30,v31, v27)