Reallocate registers to avoid abusing callee-saves registers, v8-v15.

Forgot to consult the AAPCS before committing this earlier -- oops!

While here, take advantage of the 32 aarch64 simd registers to avoid
all stack spills.
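
For context -- not part of the commit itself -- AAPCS64 makes v0-v7 and v16-v31 caller-saved, while a callee must preserve only the low 64 bits (d8-d15) of v8-v15. A minimal C sketch (hypothetical file name, aarch64 gcc or clang at -O2) shows the cost the reallocation avoids: declaring a clobber of a callee-saved SIMD register forces a save/restore in the prologue/epilogue, whereas clobbering one of v16-v31 does not.

	/* aapcs_simd_clobber.c -- illustrative sketch only.
	 * Build on aarch64: cc -O2 -S aapcs_simd_clobber.c */

	/* Claims to clobber v8: the compiler must preserve d8 for the
	 * caller, so it typically emits a str/ldr of d8 around the body. */
	void clobbers_callee_saved(void)
	{
		__asm__ volatile("" ::: "v8");
	}

	/* Claims to clobber v16: caller-saved, so no spill is needed. */
	void clobbers_caller_saved(void)
	{
		__asm__ volatile("" ::: "v16");
	}

The hand-written routines below get the same effect by moving scratch values from q8-q15 to q16-q31, so nothing has to be saved, spilled, or restored.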
riastradh 2020-06-30 23:06:02 +00:00
parent 6d5a7eed7d
commit aac1a7e566

@ -1,4 +1,4 @@
/* $NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $ */ /* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */
/*- /*-
* Copyright (c) 2020 The NetBSD Foundation, Inc. * Copyright (c) 2020 The NetBSD Foundation, Inc.
@ -116,7 +116,7 @@ ENTRY(aesarmv8_setenckey128)
adrl x4, unshiftrows_rotword_3 adrl x4, unshiftrows_rotword_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
ldr q8, [x4] /* q8 := unshiftrows_rotword_3 table */ ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */
str q1, [x0], #0x10 /* store master key as first round key */ str q1, [x0], #0x10 /* store master key as first round key */
mov x2, #10 /* round count */ mov x2, #10 /* round count */
@ -136,7 +136,7 @@ ENTRY(aesarmv8_setenckey128)
/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
ld1r {v4.4s}, [x3], #4 ld1r {v4.4s}, [x3], #4
tbl v3.16b, {v3.16b}, v8.16b tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b eor v3.16b, v3.16b, v4.16b
/* /*
@ -175,8 +175,8 @@ ENTRY(aesarmv8_setenckey192)
adrl x4, unshiftrows_rotword_1 adrl x4, unshiftrows_rotword_1
adrl x5, unshiftrows_rotword_3 adrl x5, unshiftrows_rotword_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
ldr q8, [x4] /* q8 := unshiftrows_rotword_1 */ ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */
ldr q9, [x5] /* q9 := unshiftrows_rotword_3 */ ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */
str q1, [x0], #0x10 /* store master key[0:128) as round key */ str q1, [x0], #0x10 /* store master key[0:128) as round key */
mov x2, #12 /* round count */ mov x2, #12 /* round count */
@ -197,7 +197,7 @@ ENTRY(aesarmv8_setenckey192)
/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */ /* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
ld1r {v4.4s}, [x3], #4 ld1r {v4.4s}, [x3], #4
tbl v3.16b, {v3.16b}, v8.16b tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b eor v3.16b, v3.16b, v4.16b
/* /*
@ -269,8 +269,8 @@ ENTRY(aesarmv8_setenckey192)
* q2 = rk * q2 = rk
* q3 = nrk * q3 = nrk
* v5.4s = (rk[2], rk[3], nrk[0], nrk[1]) * v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
* q8 = unshiftrows_rotword_1 * q16 = unshiftrows_rotword_1
* q9 = unshiftrows_rotword_3 * q17 = unshiftrows_rotword_3
* *
* We have to compute, in q1: * We have to compute, in q1:
* *
@ -294,7 +294,7 @@ ENTRY(aesarmv8_setenckey192)
/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */ /* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
ld1r {v4.4s}, [x3], #4 ld1r {v4.4s}, [x3], #4
tbl v1.16b, {v1.16b}, v9.16b tbl v1.16b, {v1.16b}, v17.16b
eor v1.16b, v1.16b, v4.16b eor v1.16b, v1.16b, v4.16b
/* /*
@ -354,8 +354,8 @@ ENTRY(aesarmv8_setenckey256)
adrl x4, unshiftrows_rotword_3 adrl x4, unshiftrows_rotword_3
adrl x5, unshiftrows_3 adrl x5, unshiftrows_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */ eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
ldr q8, [x4] /* q8 := unshiftrows_rotword_3 */ ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */
ldr q9, [x5] /* q9 := unshiftrows_3 */ ldr q17, [x5] /* q17 := unshiftrows_3 */
/* store master key as first two round keys */ /* store master key as first two round keys */
stp q1, q2, [x0], #0x20 stp q1, q2, [x0], #0x20
@ -376,7 +376,7 @@ ENTRY(aesarmv8_setenckey256)
/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */ /* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
ld1r {v4.4s}, [x3], #4 ld1r {v4.4s}, [x3], #4
tbl v3.16b, {v3.16b}, v8.16b tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b eor v3.16b, v3.16b, v4.16b
/* /*
@ -402,7 +402,7 @@ ENTRY(aesarmv8_setenckey256)
aese v3.16b, v0.16b aese v3.16b, v0.16b
/* v3.4s[i] := SubBytes(rk[3]) */ /* v3.4s[i] := SubBytes(rk[3]) */
tbl v3.16b, {v3.16b}, v9.16b tbl v3.16b, {v3.16b}, v17.16b
/* /*
* v5.4s := (0,prk[0],prk[1],prk[2]) * v5.4s := (0,prk[0],prk[1],prk[2])
@ -458,9 +458,9 @@ END(aesarmv8_enctodec)
ENTRY(aesarmv8_enc) ENTRY(aesarmv8_enc)
stp fp, lr, [sp, #-16]! /* push stack frame */ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp mov fp, sp
ldr q0, [x1] /* q0 := block */ ldr q0, [x1] /* q0 := ptxt */
bl aesarmv8_enc1 bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
str q0, [x2] /* store block */ str q0, [x2] /* store ctxt */
ldp fp, lr, [sp], #16 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_enc) END(aesarmv8_enc)
@ -476,9 +476,9 @@ END(aesarmv8_enc)
ENTRY(aesarmv8_dec) ENTRY(aesarmv8_dec)
stp fp, lr, [sp, #-16]! /* push stack frame */ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp mov fp, sp
ldr q0, [x1] /* q0 := block */ ldr q0, [x1] /* q0 := ctxt */
bl aesarmv8_dec1 bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */
str q0, [x2] /* store block */ str q0, [x2] /* store ptxt */
ldp fp, lr, [sp], #16 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_dec) END(aesarmv8_dec)
@ -505,7 +505,7 @@ ENTRY(aesarmv8_cbc_enc)
eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */ eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */
mov x0, x9 /* x0 := enckey */ mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */ mov x3, x5 /* x3 := nrounds */
bl aesarmv8_enc1 /* q0 := ciphertext block */ bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
subs x10, x10, #0x10 /* count down nbytes */ subs x10, x10, #0x10 /* count down nbytes */
str q0, [x2], #0x10 /* store ciphertext block */ str q0, [x2], #0x10 /* store ciphertext block */
b.ne 1b /* repeat if x10 is nonzero */ b.ne 1b /* repeat if x10 is nonzero */
@ -527,10 +527,9 @@ END(aesarmv8_cbc_enc)
* Standard ABI calling convention. * Standard ABI calling convention.
*/ */
ENTRY(aesarmv8_cbc_dec1) ENTRY(aesarmv8_cbc_dec1)
stp fp, lr, [sp, #-32]! /* push stack frame with uint128 */ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp mov fp, sp
ldr q8, [x4] /* q8 := iv */ ldr q24, [x4] /* q24 := iv */
str q8, [sp, #16] /* save iv */
mov x9, x0 /* x9 := enckey */ mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */ mov x10, x3 /* x10 := nbytes */
add x1, x1, x3 /* x1 := pointer past end of in */ add x1, x1, x3 /* x1 := pointer past end of in */
@ -539,18 +538,17 @@ ENTRY(aesarmv8_cbc_dec1)
str q0, [x4] /* update iv */ str q0, [x4] /* update iv */
1: mov x0, x9 /* x0 := enckey */ 1: mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */ mov x3, x5 /* x3 := nrounds */
bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3 */ bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */
subs x10, x10, #0x10 /* count down nbytes */ subs x10, x10, #0x10 /* count down nbytes */
b.eq 2f /* stop if this is the first block */ b.eq 2f /* stop if this is the first block */
ldr q8, [x1, #-0x10]! /* q8 := chaining value */ ldr q31, [x1, #-0x10]! /* q31 := chaining value */
eor v0.16b, v0.16b, v8.16b /* q0 := plaintext block */ eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */
str q0, [x2, #-0x10]! /* store plaintext block */ str q0, [x2, #-0x10]! /* store plaintext block */
mov v0.16b, v8.16b /* move cv = ciphertext block */ mov v0.16b, v31.16b /* move cv = ciphertext block */
b 1b b 1b
2: ldr q8, [sp, #16] /* q8 := iv */ 2: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */
eor v0.16b, v0.16b, v8.16b /* q0 := first plaintext block */
str q0, [x2, #-0x10]! /* store first plaintext block */ str q0, [x2, #-0x10]! /* store first plaintext block */
ldp fp, lr, [sp], #32 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_cbc_dec1) END(aesarmv8_cbc_dec1)
@ -566,10 +564,9 @@ END(aesarmv8_cbc_dec1)
* Standard ABI calling convention. * Standard ABI calling convention.
*/ */
ENTRY(aesarmv8_cbc_dec8) ENTRY(aesarmv8_cbc_dec8)
stp fp, lr, [sp, #-32]! /* push stack frame with uint128 */ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp mov fp, sp
ldr q8, [x4] /* q8 := iv */ ldr q24, [x4] /* q24 := iv */
str q8, [sp, #16] /* save iv */
mov x9, x0 /* x9 := enckey */ mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */ mov x10, x3 /* x10 := nbytes */
add x1, x1, x3 /* x1 := pointer past end of in */ add x1, x1, x3 /* x1 := pointer past end of in */
@ -579,23 +576,24 @@ ENTRY(aesarmv8_cbc_dec8)
1: ldp q4, q5, [x1, #-0x20]! 1: ldp q4, q5, [x1, #-0x20]!
ldp q2, q3, [x1, #-0x20]! ldp q2, q3, [x1, #-0x20]!
ldp q0, q1, [x1, #-0x20]! ldp q0, q1, [x1, #-0x20]!
mov v15.16b, v6.16b /* q[8+i] := cv[i], 0<i<8 */ mov v31.16b, v6.16b /* q[24+i] := cv[i], 0<i<8 */
mov v14.16b, v5.16b mov v30.16b, v5.16b
mov v13.16b, v4.16b mov v29.16b, v4.16b
mov v12.16b, v3.16b mov v28.16b, v3.16b
mov v11.16b, v2.16b mov v27.16b, v2.16b
mov v10.16b, v1.16b mov v26.16b, v1.16b
mov v9.16b, v0.16b mov v25.16b, v0.16b
mov x0, x9 /* x0 := enckey */ mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */ mov x3, x5 /* x3 := nrounds */
bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i] */ bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i];
eor v7.16b, v7.16b, v15.16b /* q[i] := pt[i] */ * trash x0/x3/q16 */
eor v6.16b, v6.16b, v14.16b eor v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */
eor v5.16b, v5.16b, v13.16b eor v6.16b, v6.16b, v30.16b
eor v4.16b, v4.16b, v12.16b eor v5.16b, v5.16b, v29.16b
eor v3.16b, v3.16b, v11.16b eor v4.16b, v4.16b, v28.16b
eor v2.16b, v2.16b, v10.16b eor v3.16b, v3.16b, v27.16b
eor v1.16b, v1.16b, v9.16b eor v2.16b, v2.16b, v26.16b
eor v1.16b, v1.16b, v25.16b
subs x10, x10, #0x80 /* count down nbytes */ subs x10, x10, #0x80 /* count down nbytes */
stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */ stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */
stp q4, q5, [x2, #-0x20]! stp q4, q5, [x2, #-0x20]!
@ -605,10 +603,9 @@ ENTRY(aesarmv8_cbc_dec8)
eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */ eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */
stp q0, q1, [x2, #-0x20]! stp q0, q1, [x2, #-0x20]!
b 1b b 1b
2: ldr q8, [sp, #16] /* q8 := iv */ 2: eor v0.16b, v0.16b, v24.16b /* q0 := pt0 */
eor v0.16b, v0.16b, v8.16b /* q0 := pt0 */
stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */ stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */
ldp fp, lr, [sp], #32 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_cbc_dec8) END(aesarmv8_cbc_dec8)
@ -629,18 +626,18 @@ ENTRY(aesarmv8_xts_enc1)
mov fp, sp mov fp, sp
mov x9, x0 /* x9 := enckey */ mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */ mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */ ldr q31, [x4] /* q31 := tweak */
1: ldr q0, [x1], #0x10 /* q0 := ptxt */ 1: ldr q0, [x1], #0x10 /* q0 := ptxt */
mov x0, x9 /* x0 := enckey */ mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */ mov x3, x5 /* x3 := nrounds */
eor v0.16b, v0.16b, v9.16b /* q0 := ptxt ^ tweak */ eor v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */
bl aesarmv8_enc1 /* q0 := AES(ptxt ^ tweak) */ bl aesarmv8_enc1 /* q0 := AES(...); trash x0/x3/q16 */
eor v0.16b, v0.16b, v9.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */ eor v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
str q0, [x2], #0x10 /* store ciphertext block */ str q0, [x2], #0x10 /* store ciphertext block */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
subs x10, x10, #0x10 /* count down nbytes */ subs x10, x10, #0x10 /* count down nbytes */
b.ne 1b /* repeat if more blocks */ b.ne 1b /* repeat if more blocks */
str q9, [x4] /* update tweak */ str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_xts_enc1) END(aesarmv8_xts_enc1)
@ -657,61 +654,58 @@ END(aesarmv8_xts_enc1)
* Standard ABI calling convention. * Standard ABI calling convention.
*/ */
ENTRY(aesarmv8_xts_enc8) ENTRY(aesarmv8_xts_enc8)
stp fp, lr, [sp, #-48]! /* push stack frame uint128[2] */ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp mov fp, sp
mov x9, x0 /* x9 := enckey */ mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */ mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */ ldr q31, [x4] /* q31 := tweak */
1: str q9, [sp, #16] /* save tweak[0] */ 1: mov v24.16b, v31.16b /* q24 := tweak[0] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
str q9, [sp, #32] /* save tweak[1] */ mov v25.16b, v31.16b /* q25 := tweak[1] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v10.16b, v9.16b /* q10 := tweak[2] */ mov v26.16b, v31.16b /* q26 := tweak[2] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v11.16b, v9.16b /* q11 := tweak[3] */ mov v27.16b, v31.16b /* q27 := tweak[3] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v12.16b, v9.16b /* q11 := tweak[4] */ mov v28.16b, v31.16b /* q28 := tweak[4] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v13.16b, v9.16b /* q11 := tweak[5] */ mov v29.16b, v31.16b /* q29 := tweak[5] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v14.16b, v9.16b /* q11 := tweak[6] */ mov v30.16b, v31.16b /* q30 := tweak[6] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v15.16b, v9.16b /* q11 := tweak[7] */ /* q31 := tweak[7] */
ldp q8, q9, [sp, #16] /* q8 := tweak[0], q9 := tweak[1] */ ldp q0, q1, [x1], #0x20 /* q[i] := ptxt[i] */
ldp q0, q1, [x1], #0x20 /* q[i] := pt[i] */
ldp q2, q3, [x1], #0x20 ldp q2, q3, [x1], #0x20
ldp q4, q5, [x1], #0x20 ldp q4, q5, [x1], #0x20
ldp q6, q7, [x1], #0x20 ldp q6, q7, [x1], #0x20
eor v0.16b, v0.16b, v8.16b /* q[i] := pt[i] ^ tweak[i] */ eor v0.16b, v0.16b, v24.16b /* q[i] := ptxt[i] ^ tweak[i] */
eor v1.16b, v1.16b, v9.16b eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v10.16b eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v11.16b eor v3.16b, v3.16b, v27.16b
eor v4.16b, v4.16b, v12.16b eor v4.16b, v4.16b, v28.16b
eor v5.16b, v5.16b, v13.16b eor v5.16b, v5.16b, v29.16b
eor v6.16b, v6.16b, v14.16b eor v6.16b, v6.16b, v30.16b
eor v7.16b, v7.16b, v15.16b eor v7.16b, v7.16b, v31.16b
mov x0, x9 /* x0 := enckey */ mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */ mov x3, x5 /* x3 := nrounds */
bl aesarmv8_enc8 /* encrypt q0,...,q7; trash x0/x3/q8 */ bl aesarmv8_enc8 /* encrypt q0-q7; trash x0/x3/q16 */
ldr q8, [sp, #16] /* reload q8 := tweak[0] */ eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */
eor v1.16b, v1.16b, v9.16b /* q[i] := AES(...) ^ tweak[i] */ eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v10.16b eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v11.16b eor v3.16b, v3.16b, v27.16b
eor v0.16b, v0.16b, v8.16b eor v4.16b, v4.16b, v28.16b
eor v4.16b, v4.16b, v12.16b eor v5.16b, v5.16b, v29.16b
eor v5.16b, v5.16b, v13.16b eor v6.16b, v6.16b, v30.16b
eor v6.16b, v6.16b, v14.16b eor v7.16b, v7.16b, v31.16b
eor v7.16b, v7.16b, v15.16b
stp q0, q1, [x2], #0x20 /* store ciphertext blocks */ stp q0, q1, [x2], #0x20 /* store ciphertext blocks */
stp q2, q3, [x2], #0x20 /* store ciphertext blocks */ stp q2, q3, [x2], #0x20
stp q4, q5, [x2], #0x20 /* store ciphertext blocks */ stp q4, q5, [x2], #0x20
stp q6, q7, [x2], #0x20 /* store ciphertext blocks */ stp q6, q7, [x2], #0x20
mov v9.16b, v15.16b /* q9 := q15 = tweak[7] */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
subs x10, x10, #0x80 /* count down nbytes */ subs x10, x10, #0x80 /* count down nbytes */
b.ne 1b /* repeat if more block groups */ b.ne 1b /* repeat if more block groups */
str q9, [x4] /* update tweak */ str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #48 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_xts_enc8) END(aesarmv8_xts_enc8)
@ -720,7 +714,7 @@ END(aesarmv8_xts_enc8)
* uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
* uint32_t nrounds@x5) * uint32_t nrounds@x5)
* *
* Decrypt a contiguous sequence of blocks with AES-XTS. * Decrypt a contiguous sequence of blocks with AES-XTS.
* *
* nbytes must be a positive integral multiple of 16. This routine * nbytes must be a positive integral multiple of 16. This routine
* is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once. * is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
@ -732,18 +726,18 @@ ENTRY(aesarmv8_xts_dec1)
mov fp, sp mov fp, sp
mov x9, x0 /* x9 := deckey */ mov x9, x0 /* x9 := deckey */
mov x10, x3 /* x10 := nbytes */ mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */ ldr q31, [x4] /* q31 := tweak */
1: ldr q0, [x1], #0x10 /* q0 := ptxt */ 1: ldr q0, [x1], #0x10 /* q0 := ctxt */
mov x0, x9 /* x0 := deckey */ mov x0, x9 /* x0 := deckey */
mov x3, x5 /* x3 := nrounds */ mov x3, x5 /* x3 := nrounds */
eor v0.16b, v0.16b, v9.16b /* q0 := ptxt ^ tweak */ eor v0.16b, v0.16b, v31.16b /* q0 := ctxt ^ tweak */
bl aesarmv8_dec1 /* q0 := AES(ptxt ^ tweak) */ bl aesarmv8_dec1 /* q0 := AES(...); trash x0/x3/q16 */
eor v0.16b, v0.16b, v9.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */ eor v0.16b, v0.16b, v31.16b /* q0 := AES(ctxt ^ tweak) ^ tweak */
str q0, [x2], #0x10 /* store ciphertext block */ str q0, [x2], #0x10 /* store plaintext block */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
subs x10, x10, #0x10 /* count down nbytes */ subs x10, x10, #0x10 /* count down nbytes */
b.ne 1b /* repeat if more blocks */ b.ne 1b /* repeat if more blocks */
str q9, [x4] /* update tweak */ str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_xts_dec1) END(aesarmv8_xts_dec1)
@ -753,75 +747,72 @@ END(aesarmv8_xts_dec1)
* uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4, * uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
* uint32_t nrounds@x5) * uint32_t nrounds@x5)
* *
* Decrypt a contiguous sequence of blocks with AES-XTS. * Decrypt a contiguous sequence of blocks with AES-XTS.
* *
* nbytes must be a positive integral multiple of 128. * nbytes must be a positive integral multiple of 128.
* *
* Standard ABI calling convention. * Standard ABI calling convention.
*/ */
ENTRY(aesarmv8_xts_dec8) ENTRY(aesarmv8_xts_dec8)
stp fp, lr, [sp, #-48]! /* push stack frame uint128[2] */ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp mov fp, sp
mov x9, x0 /* x9 := deckey */ mov x9, x0 /* x9 := deckey */
mov x10, x3 /* x10 := nbytes */ mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */ ldr q31, [x4] /* q31 := tweak */
1: str q9, [sp, #16] /* save tweak[0] */ 1: mov v24.16b, v31.16b /* q24 := tweak[0] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
str q9, [sp, #32] /* save tweak[1] */ mov v25.16b, v31.16b /* q25 := tweak[1] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v10.16b, v9.16b /* q10 := tweak[2] */ mov v26.16b, v31.16b /* q26 := tweak[2] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v11.16b, v9.16b /* q11 := tweak[3] */ mov v27.16b, v31.16b /* q27 := tweak[3] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v12.16b, v9.16b /* q11 := tweak[4] */ mov v28.16b, v31.16b /* q28 := tweak[4] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v13.16b, v9.16b /* q11 := tweak[5] */ mov v29.16b, v31.16b /* q29 := tweak[5] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v14.16b, v9.16b /* q11 := tweak[6] */ mov v30.16b, v31.16b /* q30 := tweak[6] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v15.16b, v9.16b /* q11 := tweak[7] */ /* q31 := tweak[7] */
ldp q8, q9, [sp, #16] /* q8 := tweak[0], q9 := tweak[1] */ ldp q0, q1, [x1], #0x20 /* q[i] := ctxt[i] */
ldp q0, q1, [x1], #0x20 /* q[i] := pt[i] */
ldp q2, q3, [x1], #0x20 ldp q2, q3, [x1], #0x20
ldp q4, q5, [x1], #0x20 ldp q4, q5, [x1], #0x20
ldp q6, q7, [x1], #0x20 ldp q6, q7, [x1], #0x20
eor v0.16b, v0.16b, v8.16b /* q[i] := pt[i] ^ tweak[i] */ eor v0.16b, v0.16b, v24.16b /* q[i] := ctxt[i] ^ tweak[i] */
eor v1.16b, v1.16b, v9.16b eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v10.16b eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v11.16b eor v3.16b, v3.16b, v27.16b
eor v4.16b, v4.16b, v12.16b eor v4.16b, v4.16b, v28.16b
eor v5.16b, v5.16b, v13.16b eor v5.16b, v5.16b, v29.16b
eor v6.16b, v6.16b, v14.16b eor v6.16b, v6.16b, v30.16b
eor v7.16b, v7.16b, v15.16b eor v7.16b, v7.16b, v31.16b
mov x0, x9 /* x0 := deckey */ mov x0, x9 /* x0 := deckey */
mov x3, x5 /* x3 := nrounds */ mov x3, x5 /* x3 := nrounds */
bl aesarmv8_dec8 /* decrypt q0,...,q7; trash x0/x3/q8 */ bl aesarmv8_dec8 /* decrypt q0-q7; trash x0/x3/q16 */
ldr q8, [sp, #16] /* reload q8 := tweak[0] */ eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */
eor v1.16b, v1.16b, v9.16b /* q[i] := AES(...) ^ tweak[i] */ eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v10.16b eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v11.16b eor v3.16b, v3.16b, v27.16b
eor v0.16b, v0.16b, v8.16b eor v4.16b, v4.16b, v28.16b
eor v4.16b, v4.16b, v12.16b eor v5.16b, v5.16b, v29.16b
eor v5.16b, v5.16b, v13.16b eor v6.16b, v6.16b, v30.16b
eor v6.16b, v6.16b, v14.16b eor v7.16b, v7.16b, v31.16b
eor v7.16b, v7.16b, v15.16b stp q0, q1, [x2], #0x20 /* store plaintext blocks */
stp q0, q1, [x2], #0x20 /* store ciphertext blocks */ stp q2, q3, [x2], #0x20
stp q2, q3, [x2], #0x20 /* store ciphertext blocks */ stp q4, q5, [x2], #0x20
stp q4, q5, [x2], #0x20 /* store ciphertext blocks */ stp q6, q7, [x2], #0x20
stp q6, q7, [x2], #0x20 /* store ciphertext blocks */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v9.16b, v15.16b /* q9 := q15 = tweak[7] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
subs x10, x10, #0x80 /* count down nbytes */ subs x10, x10, #0x80 /* count down nbytes */
b.ne 1b /* repeat if more block groups */ b.ne 1b /* repeat if more block groups */
str q9, [x4] /* update tweak */ str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #48 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_xts_dec8) END(aesarmv8_xts_dec8)
/* /*
* aesarmv8_xts_mulx(tweak@q9) * aesarmv8_xts_mulx(tweak@q31)
* *
* Multiply q9 by x, modulo x^128 + x^7 + x^2 + x + 1, in place. * Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
* Uses x0 and q0/q1 as temporaries. * Uses x0 and q0/q1 as temporaries.
*/ */
.text .text
@ -836,12 +827,12 @@ aesarmv8_xts_mulx:
* carried into x^128 = x^7 + x^2 + x + 1. * carried into x^128 = x^7 + x^2 + x + 1.
*/ */
adrl x0, xtscarry adrl x0, xtscarry
cmlt v1.2d, v9.2d, #0 /* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */ cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
ldr q0, [x0] /* q0 := xtscarry */ ldr q0, [x0] /* q0 := xtscarry */
ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */ ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
shl v9.2d, v9.2d, #1 /* shift */ shl v31.2d, v31.2d, #1 /* shift */
and v0.16b, v0.16b, v1.16b /* copy xtscarry according to mask */ and v0.16b, v0.16b, v1.16b /* copy xtscarry according to mask */
eor v9.16b, v9.16b, v0.16b /* incorporate (a) and (b) */ eor v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
ret ret
END(aesarmv8_xts_mulx) END(aesarmv8_xts_mulx)
@ -862,9 +853,9 @@ END(xtscarry)
ENTRY(aesarmv8_xts_update) ENTRY(aesarmv8_xts_update)
stp fp, lr, [sp, #-16]! /* push stack frame */ stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp mov fp, sp
ldr q9, [x0] /* load tweak */ ldr q31, [x0] /* load tweak */
bl aesarmv8_xts_mulx /* q9 *= x */ bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
str q9, [x1] /* store tweak */ str q31, [x1] /* store tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */ ldp fp, lr, [sp], #16 /* pop stack frame */
ret ret
END(aesarmv8_xts_update) END(aesarmv8_xts_update)
@ -875,22 +866,22 @@ END(aesarmv8_xts_update)
* *
* Encrypt a single AES block in q0. * Encrypt a single AES block in q0.
* *
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3. * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/ */
.text .text
_ALIGN_TEXT _ALIGN_TEXT
.type aesarmv8_enc1,@function .type aesarmv8_enc1,@function
aesarmv8_enc1: aesarmv8_enc1:
ldr q8, [x0], #0x10 /* load round key */ ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1 1: subs x3, x3, #1
/* q0 := ShiftRows(SubBytes(AddRoundKey_q8(q0))) */ /* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
aese v0.16b, v8.16b aese v0.16b, v16.16b
ldr q8, [x0], #0x10 /* load next round key */ ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f b.eq 2f
/* q0 := MixColumns(q0) */ /* q0 := MixColumns(q0) */
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
b 1b b 1b
2: eor v0.16b, v0.16b, v8.16b 2: eor v0.16b, v0.16b, v16.16b
ret ret
END(aesarmv8_enc1) END(aesarmv8_enc1)
@ -901,24 +892,24 @@ END(aesarmv8_enc1)
* *
* Encrypt eight AES blocks in q0 through q7 in parallel. * Encrypt eight AES blocks in q0 through q7 in parallel.
* *
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3. * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/ */
.text .text
_ALIGN_TEXT _ALIGN_TEXT
.type aesarmv8_enc8,@function .type aesarmv8_enc8,@function
aesarmv8_enc8: aesarmv8_enc8:
ldr q8, [x0], #0x10 /* load round key */ ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1 1: subs x3, x3, #1
/* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */ /* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
aese v0.16b, v8.16b aese v0.16b, v16.16b
aese v1.16b, v8.16b aese v1.16b, v16.16b
aese v2.16b, v8.16b aese v2.16b, v16.16b
aese v3.16b, v8.16b aese v3.16b, v16.16b
aese v4.16b, v8.16b aese v4.16b, v16.16b
aese v5.16b, v8.16b aese v5.16b, v16.16b
aese v6.16b, v8.16b aese v6.16b, v16.16b
aese v7.16b, v8.16b aese v7.16b, v16.16b
ldr q8, [x0], #0x10 /* load next round key */ ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f b.eq 2f
/* q[i] := MixColumns(q[i]) */ /* q[i] := MixColumns(q[i]) */
aesmc v0.16b, v0.16b aesmc v0.16b, v0.16b
@ -930,14 +921,14 @@ aesarmv8_enc8:
aesmc v6.16b, v6.16b aesmc v6.16b, v6.16b
aesmc v7.16b, v7.16b aesmc v7.16b, v7.16b
b 1b b 1b
2: eor v0.16b, v0.16b, v8.16b /* AddRoundKey */ 2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */
eor v1.16b, v1.16b, v8.16b eor v1.16b, v1.16b, v16.16b
eor v2.16b, v2.16b, v8.16b eor v2.16b, v2.16b, v16.16b
eor v3.16b, v3.16b, v8.16b eor v3.16b, v3.16b, v16.16b
eor v4.16b, v4.16b, v8.16b eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v8.16b eor v5.16b, v5.16b, v16.16b
eor v6.16b, v6.16b, v8.16b eor v6.16b, v6.16b, v16.16b
eor v7.16b, v7.16b, v8.16b eor v7.16b, v7.16b, v16.16b
ret ret
END(aesarmv8_enc8) END(aesarmv8_enc8)
@ -947,22 +938,22 @@ END(aesarmv8_enc8)
* *
* Decrypt a single AES block in q0. * Decrypt a single AES block in q0.
* *
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3. * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/ */
.text .text
_ALIGN_TEXT _ALIGN_TEXT
.type aesarmv8_dec1,@function .type aesarmv8_dec1,@function
aesarmv8_dec1: aesarmv8_dec1:
ldr q8, [x0], #0x10 /* load round key */ ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1 1: subs x3, x3, #1
/* q0 := InSubBytes(InShiftRows(AddRoundKey_q8(q0))) */ /* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
aesd v0.16b, v8.16b aesd v0.16b, v16.16b
ldr q8, [x0], #0x10 /* load next round key */ ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f b.eq 2f
/* q0 := InMixColumns(q0) */ /* q0 := InMixColumns(q0) */
aesimc v0.16b, v0.16b aesimc v0.16b, v0.16b
b 1b b 1b
2: eor v0.16b, v0.16b, v8.16b 2: eor v0.16b, v0.16b, v16.16b
ret ret
END(aesarmv8_dec1) END(aesarmv8_dec1)
@ -973,24 +964,24 @@ END(aesarmv8_dec1)
* *
* Decrypt eight AES blocks in q0 through q7 in parallel. * Decrypt eight AES blocks in q0 through q7 in parallel.
* *
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3. * Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/ */
.text .text
_ALIGN_TEXT _ALIGN_TEXT
.type aesarmv8_dec8,@function .type aesarmv8_dec8,@function
aesarmv8_dec8: aesarmv8_dec8:
ldr q8, [x0], #0x10 /* load round key */ ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1 1: subs x3, x3, #1
/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */ /* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
aesd v0.16b, v8.16b aesd v0.16b, v16.16b
aesd v1.16b, v8.16b aesd v1.16b, v16.16b
aesd v2.16b, v8.16b aesd v2.16b, v16.16b
aesd v3.16b, v8.16b aesd v3.16b, v16.16b
aesd v4.16b, v8.16b aesd v4.16b, v16.16b
aesd v5.16b, v8.16b aesd v5.16b, v16.16b
aesd v6.16b, v8.16b aesd v6.16b, v16.16b
aesd v7.16b, v8.16b aesd v7.16b, v16.16b
ldr q8, [x0], #0x10 /* load next round key */ ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f b.eq 2f
/* q[i] := InMixColumns(q[i]) */ /* q[i] := InMixColumns(q[i]) */
aesimc v0.16b, v0.16b aesimc v0.16b, v0.16b
@ -1002,13 +993,13 @@ aesarmv8_dec8:
aesimc v6.16b, v6.16b aesimc v6.16b, v6.16b
aesimc v7.16b, v7.16b aesimc v7.16b, v7.16b
b 1b b 1b
2: eor v0.16b, v0.16b, v8.16b /* AddRoundKey */ 2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */
eor v1.16b, v1.16b, v8.16b eor v1.16b, v1.16b, v16.16b
eor v2.16b, v2.16b, v8.16b eor v2.16b, v2.16b, v16.16b
eor v3.16b, v3.16b, v8.16b eor v3.16b, v3.16b, v16.16b
eor v4.16b, v4.16b, v8.16b eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v8.16b eor v5.16b, v5.16b, v16.16b
eor v6.16b, v6.16b, v8.16b eor v6.16b, v6.16b, v16.16b
eor v7.16b, v7.16b, v8.16b eor v7.16b, v7.16b, v16.16b
ret ret
END(aesarmv8_dec8) END(aesarmv8_dec8)
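
As an aside, not part of the change: aesarmv8_xts_mulx above multiplies the 128-bit XTS tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. A byte-serial C sketch of the same operation, for reference only (the assembly instead works in two 64-bit lanes, using the cmlt mask and the xtscarry table to fold the carry back in):

	/* xts_mulx_ref.c -- reference sketch, not the NetBSD implementation. */
	#include <stdint.h>

	static void
	xts_mulx_ref(uint8_t tweak[16])
	{
		unsigned carry = 0;

		/* Shift the 128-bit value left by one bit, little-endian
		 * byte order as in XTS. */
		for (unsigned i = 0; i < 16; i++) {
			unsigned msb = tweak[i] >> 7;
			tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
			carry = msb;
		}

		/* If a bit was carried out of x^127, reduce by the
		 * polynomial: x^128 = x^7 + x^2 + x + 1 = 0x87. */
		if (carry)
			tweak[0] ^= 0x87;
	}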