Reallocate registers to avoid abusing callee-saves registers, v8-v15.

Forgot to consult the AAPCS before committing this -- oops!

While here, take advantage of the 32 AArch64 SIMD registers to avoid
all stack spills.
riastradh 2020-06-30 23:06:02 +00:00
parent 6d5a7eed7d
commit aac1a7e566

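For context (an aside added here, not part of the commit): under AAPCS64, v0-v7 and v16-v31 are caller-saved, while v8-v15 are callee-saved in their low 64 bits (d8-d15), so an assembly routine may not scribble on them without spilling. A minimal C sketch of the difference, using GCC/Clang extended asm; the function names and instructions are arbitrary stand-ins:

/*
 * Illustration only.  Clobbering a caller-saved vector register costs
 * the caller nothing; clobbering a callee-saved one forces the
 * compiler to keep d8 free or spill it around the statement.
 */
static inline void
clobber_caller_saved(void)
{
    __asm__ volatile("eor v16.16b, v16.16b, v16.16b" ::: "v16");
}

static inline void
clobber_callee_saved(void)
{
    __asm__ volatile("eor v8.16b, v8.16b, v8.16b" ::: "v8");
}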

@@ -1,4 +1,4 @@
/* $NetBSD: aes_armv8_64.S,v 1.3 2020/06/30 21:53:39 riastradh Exp $ */
/* $NetBSD: aes_armv8_64.S,v 1.4 2020/06/30 23:06:02 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -116,7 +116,7 @@ ENTRY(aesarmv8_setenckey128)
adrl x4, unshiftrows_rotword_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
ldr q8, [x4] /* q8 := unshiftrows_rotword_3 table */
ldr q16, [x4] /* q16 := unshiftrows_rotword_3 table */
str q1, [x0], #0x10 /* store master key as first round key */
mov x2, #10 /* round count */
@@ -136,7 +136,7 @@ ENTRY(aesarmv8_setenckey128)
/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
ld1r {v4.4s}, [x3], #4
tbl v3.16b, {v3.16b}, v8.16b
tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b
/*
@@ -175,8 +175,8 @@ ENTRY(aesarmv8_setenckey192)
adrl x4, unshiftrows_rotword_1
adrl x5, unshiftrows_rotword_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
ldr q8, [x4] /* q8 := unshiftrows_rotword_1 */
ldr q9, [x5] /* q9 := unshiftrows_rotword_3 */
ldr q16, [x4] /* q16 := unshiftrows_rotword_1 */
ldr q17, [x5] /* q17 := unshiftrows_rotword_3 */
str q1, [x0], #0x10 /* store master key[0:128) as round key */
mov x2, #12 /* round count */
@@ -197,7 +197,7 @@ ENTRY(aesarmv8_setenckey192)
/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
ld1r {v4.4s}, [x3], #4
tbl v3.16b, {v3.16b}, v8.16b
tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b
/*
@@ -269,8 +269,8 @@ ENTRY(aesarmv8_setenckey192)
* q2 = rk
* q3 = nrk
* v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
* q8 = unshiftrows_rotword_1
* q9 = unshiftrows_rotword_3
* q16 = unshiftrows_rotword_1
* q17 = unshiftrows_rotword_3
*
* We have to compute, in q1:
*
@@ -294,7 +294,7 @@ ENTRY(aesarmv8_setenckey192)
/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
ld1r {v4.4s}, [x3], #4
tbl v1.16b, {v1.16b}, v9.16b
tbl v1.16b, {v1.16b}, v17.16b
eor v1.16b, v1.16b, v4.16b
/*
@@ -354,8 +354,8 @@ ENTRY(aesarmv8_setenckey256)
adrl x4, unshiftrows_rotword_3
adrl x5, unshiftrows_3
eor v0.16b, v0.16b, v0.16b /* q0 := 0 */
ldr q8, [x4] /* q8 := unshiftrows_rotword_3 */
ldr q9, [x5] /* q9 := unshiftrows_3 */
ldr q16, [x4] /* q16 := unshiftrows_rotword_3 */
ldr q17, [x5] /* q17 := unshiftrows_3 */
/* store master key as first two round keys */
stp q1, q2, [x0], #0x20
@@ -376,7 +376,7 @@ ENTRY(aesarmv8_setenckey256)
/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
ld1r {v4.4s}, [x3], #4
tbl v3.16b, {v3.16b}, v8.16b
tbl v3.16b, {v3.16b}, v16.16b
eor v3.16b, v3.16b, v4.16b
/*
@@ -402,7 +402,7 @@ ENTRY(aesarmv8_setenckey256)
aese v3.16b, v0.16b
/* v3.4s[i] := SubBytes(rk[3]) */
tbl v3.16b, {v3.16b}, v9.16b
tbl v3.16b, {v3.16b}, v17.16b
/*
* v5.4s := (0,prk[0],prk[1],prk[2])
@@ -458,9 +458,9 @@ END(aesarmv8_enctodec)
ENTRY(aesarmv8_enc)
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
ldr q0, [x1] /* q0 := block */
bl aesarmv8_enc1
str q0, [x2] /* store block */
ldr q0, [x1] /* q0 := ptxt */
bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
str q0, [x2] /* store ctxt */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_enc)
@@ -476,9 +476,9 @@ END(aesarmv8_enc)
ENTRY(aesarmv8_dec)
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
ldr q0, [x1] /* q0 := block */
bl aesarmv8_dec1
str q0, [x2] /* store block */
ldr q0, [x1] /* q0 := ctxt */
bl aesarmv8_dec1 /* q0 := ptxt; trash x0/x3/q16 */
str q0, [x2] /* store ptxt */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_dec)
@@ -505,7 +505,7 @@ ENTRY(aesarmv8_cbc_enc)
eor v0.16b, v0.16b, v1.16b /* q0 := cv ^ ptxt */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
bl aesarmv8_enc1 /* q0 := ciphertext block */
bl aesarmv8_enc1 /* q0 := ctxt; trash x0/x3/q16 */
subs x10, x10, #0x10 /* count down nbytes */
str q0, [x2], #0x10 /* store ciphertext block */
b.ne 1b /* repeat if x10 is nonzero */
@@ -527,10 +527,9 @@ END(aesarmv8_cbc_enc)
* Standard ABI calling convention.
*/
ENTRY(aesarmv8_cbc_dec1)
stp fp, lr, [sp, #-32]! /* push stack frame with uint128 */
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
ldr q8, [x4] /* q8 := iv */
str q8, [sp, #16] /* save iv */
ldr q24, [x4] /* q24 := iv */
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
add x1, x1, x3 /* x1 := pointer past end of in */
@@ -539,18 +538,17 @@ ENTRY(aesarmv8_cbc_dec1)
str q0, [x4] /* update iv */
1: mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3 */
bl aesarmv8_dec1 /* q0 := cv ^ ptxt; trash x0/x3/q16 */
subs x10, x10, #0x10 /* count down nbytes */
b.eq 2f /* stop if this is the first block */
ldr q8, [x1, #-0x10]! /* q8 := chaining value */
eor v0.16b, v0.16b, v8.16b /* q0 := plaintext block */
ldr q31, [x1, #-0x10]! /* q31 := chaining value */
eor v0.16b, v0.16b, v31.16b /* q0 := plaintext block */
str q0, [x2, #-0x10]! /* store plaintext block */
mov v0.16b, v8.16b /* move cv = ciphertext block */
mov v0.16b, v31.16b /* move cv = ciphertext block */
b 1b
2: ldr q8, [sp, #16] /* q8 := iv */
eor v0.16b, v0.16b, v8.16b /* q0 := first plaintext block */
2: eor v0.16b, v0.16b, v24.16b /* q0 := first plaintext block */
str q0, [x2, #-0x10]! /* store first plaintext block */
ldp fp, lr, [sp], #32 /* pop stack frame */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_cbc_dec1)
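
aesarmv8_cbc_dec1 (and the eight-block variant below) walks the buffer backwards, so the chaining value for every block except the first is simply the previous ciphertext block, still intact in the input buffer; only block 0 needs the caller's IV, which the routine now keeps live in q24 for the whole loop instead of spilling it. A portable sketch of that structure (illustration only; aes_dec_block() is a hypothetical single-block decrypt, not a NetBSD API):

#include <stdint.h>
#include <string.h>

/* Hypothetical single-block AES decryption primitive. */
void aes_dec_block(const void *deckey, const uint8_t in[16],
    uint8_t out[16], uint32_t nrounds);

/* nbytes must be a positive multiple of 16, as in the assembly. */
static void
cbc_dec_backwards(const void *deckey, const uint8_t *in, uint8_t *out,
    size_t nbytes, uint8_t iv[16], uint32_t nrounds)
{
    uint8_t iv0[16], tmp[16];
    const uint8_t *cv;
    size_t i;
    unsigned j;

    memcpy(iv0, iv, 16);                  /* original IV, for block 0 */
    memcpy(iv, in + nbytes - 16, 16);     /* last ctxt block is the next IV */

    for (i = nbytes; i > 0; i -= 16) {
        aes_dec_block(deckey, in + i - 16, tmp, nrounds);
        cv = (i == 16) ? iv0 : in + i - 32;   /* chaining value */
        for (j = 0; j < 16; j++)
            out[i - 16 + j] = tmp[j] ^ cv[j];
    }
}
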
@@ -566,10 +564,9 @@ END(aesarmv8_cbc_dec1)
* Standard ABI calling convention.
*/
ENTRY(aesarmv8_cbc_dec8)
stp fp, lr, [sp, #-32]! /* push stack frame with uint128 */
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
ldr q8, [x4] /* q8 := iv */
str q8, [sp, #16] /* save iv */
ldr q24, [x4] /* q24 := iv */
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
add x1, x1, x3 /* x1 := pointer past end of in */
@@ -579,23 +576,24 @@ ENTRY(aesarmv8_cbc_dec8)
1: ldp q4, q5, [x1, #-0x20]!
ldp q2, q3, [x1, #-0x20]!
ldp q0, q1, [x1, #-0x20]!
mov v15.16b, v6.16b /* q[8+i] := cv[i], 0<i<8 */
mov v14.16b, v5.16b
mov v13.16b, v4.16b
mov v12.16b, v3.16b
mov v11.16b, v2.16b
mov v10.16b, v1.16b
mov v9.16b, v0.16b
mov v31.16b, v6.16b /* q[24+i] := cv[i], 0<i<8 */
mov v30.16b, v5.16b
mov v29.16b, v4.16b
mov v28.16b, v3.16b
mov v27.16b, v2.16b
mov v26.16b, v1.16b
mov v25.16b, v0.16b
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i] */
eor v7.16b, v7.16b, v15.16b /* q[i] := pt[i] */
eor v6.16b, v6.16b, v14.16b
eor v5.16b, v5.16b, v13.16b
eor v4.16b, v4.16b, v12.16b
eor v3.16b, v3.16b, v11.16b
eor v2.16b, v2.16b, v10.16b
eor v1.16b, v1.16b, v9.16b
bl aesarmv8_dec8 /* q[i] := cv[i] ^ pt[i];
* trash x0/x3/q16 */
eor v7.16b, v7.16b, v31.16b /* q[i] := pt[i] */
eor v6.16b, v6.16b, v30.16b
eor v5.16b, v5.16b, v29.16b
eor v4.16b, v4.16b, v28.16b
eor v3.16b, v3.16b, v27.16b
eor v2.16b, v2.16b, v26.16b
eor v1.16b, v1.16b, v25.16b
subs x10, x10, #0x80 /* count down nbytes */
stp q6, q7, [x2, #-0x20]! /* store plaintext blocks */
stp q4, q5, [x2, #-0x20]!
@@ -605,10 +603,9 @@ ENTRY(aesarmv8_cbc_dec8)
eor v0.16b, v0.16b, v7.16b /* q0 := pt0 */
stp q0, q1, [x2, #-0x20]!
b 1b
2: ldr q8, [sp, #16] /* q8 := iv */
eor v0.16b, v0.16b, v8.16b /* q0 := pt0 */
2: eor v0.16b, v0.16b, v24.16b /* q0 := pt0 */
stp q0, q1, [x2, #-0x20]! /* store first two plaintext blocks */
ldp fp, lr, [sp], #32 /* pop stack frame */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_cbc_dec8)
@@ -629,18 +626,18 @@ ENTRY(aesarmv8_xts_enc1)
mov fp, sp
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */
ldr q31, [x4] /* q31 := tweak */
1: ldr q0, [x1], #0x10 /* q0 := ptxt */
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
eor v0.16b, v0.16b, v9.16b /* q0 := ptxt ^ tweak */
bl aesarmv8_enc1 /* q0 := AES(ptxt ^ tweak) */
eor v0.16b, v0.16b, v9.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
eor v0.16b, v0.16b, v31.16b /* q0 := ptxt ^ tweak */
bl aesarmv8_enc1 /* q0 := AES(...); trash x0/x3/q16 */
eor v0.16b, v0.16b, v31.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
str q0, [x2], #0x10 /* store ciphertext block */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
subs x10, x10, #0x10 /* count down nbytes */
b.ne 1b /* repeat if more blocks */
str q9, [x4] /* update tweak */
str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_xts_enc1)
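
Per block, aesarmv8_xts_enc1 computes ctxt = AES(ptxt ^ tweak) ^ tweak, then advances the tweak by one multiplication by x in GF(2^128), and finally stores the last tweak back for the caller. A portable sketch of that flow (illustration only; aes_enc_block() and xts_mulx() are hypothetical helpers, the latter corresponding to aesarmv8_xts_mulx further down):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helpers: single-block AES and the XTS tweak update. */
void aes_enc_block(const void *enckey, const uint8_t in[16],
    uint8_t out[16], uint32_t nrounds);
void xts_mulx(uint8_t tweak[16]);

/* nbytes must be a positive multiple of 16, as in the assembly. */
static void
xts_enc(const void *enckey, const uint8_t *in, uint8_t *out,
    size_t nbytes, uint8_t tweak[16], uint32_t nrounds)
{
    uint8_t tmp[16];
    unsigned j;

    for (; nbytes; nbytes -= 16, in += 16, out += 16) {
        for (j = 0; j < 16; j++)
            tmp[j] = in[j] ^ tweak[j];        /* ptxt ^ tweak */
        aes_enc_block(enckey, tmp, tmp, nrounds);
        for (j = 0; j < 16; j++)
            out[j] = tmp[j] ^ tweak[j];       /* AES(...) ^ tweak */
        xts_mulx(tweak);                      /* tweak *= x */
    }
}
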
@@ -657,61 +654,58 @@ END(aesarmv8_xts_enc1)
* Standard ABI calling convention.
*/
ENTRY(aesarmv8_xts_enc8)
stp fp, lr, [sp, #-48]! /* push stack frame uint128[2] */
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
mov x9, x0 /* x9 := enckey */
mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */
1: str q9, [sp, #16] /* save tweak[0] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
str q9, [sp, #32] /* save tweak[1] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v10.16b, v9.16b /* q10 := tweak[2] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v11.16b, v9.16b /* q11 := tweak[3] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v12.16b, v9.16b /* q11 := tweak[4] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v13.16b, v9.16b /* q11 := tweak[5] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v14.16b, v9.16b /* q11 := tweak[6] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v15.16b, v9.16b /* q11 := tweak[7] */
ldp q8, q9, [sp, #16] /* q8 := tweak[0], q9 := tweak[1] */
ldp q0, q1, [x1], #0x20 /* q[i] := pt[i] */
ldr q31, [x4] /* q31 := tweak */
1: mov v24.16b, v31.16b /* q24 := tweak[0] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v25.16b, v31.16b /* q25 := tweak[1] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v26.16b, v31.16b /* q26 := tweak[2] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v27.16b, v31.16b /* q27 := tweak[3] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v28.16b, v31.16b /* q28 := tweak[4] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v29.16b, v31.16b /* q29 := tweak[5] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v30.16b, v31.16b /* q30 := tweak[6] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
/* q31 := tweak[7] */
ldp q0, q1, [x1], #0x20 /* q[i] := ptxt[i] */
ldp q2, q3, [x1], #0x20
ldp q4, q5, [x1], #0x20
ldp q6, q7, [x1], #0x20
eor v0.16b, v0.16b, v8.16b /* q[i] := pt[i] ^ tweak[i] */
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
eor v0.16b, v0.16b, v24.16b /* q[i] := ptxt[i] ^ tweak[i] */
eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v27.16b
eor v4.16b, v4.16b, v28.16b
eor v5.16b, v5.16b, v29.16b
eor v6.16b, v6.16b, v30.16b
eor v7.16b, v7.16b, v31.16b
mov x0, x9 /* x0 := enckey */
mov x3, x5 /* x3 := nrounds */
bl aesarmv8_enc8 /* encrypt q0,...,q7; trash x0/x3/q8 */
ldr q8, [sp, #16] /* reload q8 := tweak[0] */
eor v1.16b, v1.16b, v9.16b /* q[i] := AES(...) ^ tweak[i] */
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v0.16b, v0.16b, v8.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
bl aesarmv8_enc8 /* encrypt q0-q7; trash x0/x3/q16 */
eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */
eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v27.16b
eor v4.16b, v4.16b, v28.16b
eor v5.16b, v5.16b, v29.16b
eor v6.16b, v6.16b, v30.16b
eor v7.16b, v7.16b, v31.16b
stp q0, q1, [x2], #0x20 /* store ciphertext blocks */
stp q2, q3, [x2], #0x20 /* store ciphertext blocks */
stp q4, q5, [x2], #0x20 /* store ciphertext blocks */
stp q6, q7, [x2], #0x20 /* store ciphertext blocks */
mov v9.16b, v15.16b /* q9 := q15 = tweak[7] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
stp q2, q3, [x2], #0x20
stp q4, q5, [x2], #0x20
stp q6, q7, [x2], #0x20
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
subs x10, x10, #0x80 /* count down nbytes */
b.ne 1b /* repeat if more block groups */
str q9, [x4] /* update tweak */
ldp fp, lr, [sp], #48 /* pop stack frame */
str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_xts_enc8)
@@ -720,7 +714,7 @@ END(aesarmv8_xts_enc8)
* uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
* uint32_t nrounds@x5)
*
* Decrypt a contiguous sequence of blocks with AES-XTS.
* Decrypt a contiguous sequence of blocks with AES-XTS.
*
* nbytes must be a positive integral multiple of 16. This routine
* is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
@@ -732,18 +726,18 @@ ENTRY(aesarmv8_xts_dec1)
mov fp, sp
mov x9, x0 /* x9 := deckey */
mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */
1: ldr q0, [x1], #0x10 /* q0 := ptxt */
ldr q31, [x4] /* q31 := tweak */
1: ldr q0, [x1], #0x10 /* q0 := ctxt */
mov x0, x9 /* x0 := deckey */
mov x3, x5 /* x3 := nrounds */
eor v0.16b, v0.16b, v9.16b /* q0 := ptxt ^ tweak */
bl aesarmv8_dec1 /* q0 := AES(ptxt ^ tweak) */
eor v0.16b, v0.16b, v9.16b /* q0 := AES(ptxt ^ tweak) ^ tweak */
str q0, [x2], #0x10 /* store ciphertext block */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
eor v0.16b, v0.16b, v31.16b /* q0 := ctxt ^ tweak */
bl aesarmv8_dec1 /* q0 := AES(...); trash x0/x3/q16 */
eor v0.16b, v0.16b, v31.16b /* q0 := AES(ctxt ^ tweak) ^ tweak */
str q0, [x2], #0x10 /* store plaintext block */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
subs x10, x10, #0x10 /* count down nbytes */
b.ne 1b /* repeat if more blocks */
str q9, [x4] /* update tweak */
str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_xts_dec1)
@@ -753,75 +747,72 @@ END(aesarmv8_xts_dec1)
* uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
* uint32_t nrounds@x5)
*
* Decrypt a contiguous sequence of blocks with AES-XTS.
* Decrypt a contiguous sequence of blocks with AES-XTS.
*
* nbytes must be a positive integral multiple of 128.
*
* Standard ABI calling convention.
*/
ENTRY(aesarmv8_xts_dec8)
stp fp, lr, [sp, #-48]! /* push stack frame uint128[2] */
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
mov x9, x0 /* x9 := deckey */
mov x10, x3 /* x10 := nbytes */
ldr q9, [x4] /* q9 := tweak */
1: str q9, [sp, #16] /* save tweak[0] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
str q9, [sp, #32] /* save tweak[1] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v10.16b, v9.16b /* q10 := tweak[2] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v11.16b, v9.16b /* q11 := tweak[3] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v12.16b, v9.16b /* q11 := tweak[4] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v13.16b, v9.16b /* q11 := tweak[5] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v14.16b, v9.16b /* q11 := tweak[6] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
mov v15.16b, v9.16b /* q11 := tweak[7] */
ldp q8, q9, [sp, #16] /* q8 := tweak[0], q9 := tweak[1] */
ldp q0, q1, [x1], #0x20 /* q[i] := pt[i] */
ldr q31, [x4] /* q31 := tweak */
1: mov v24.16b, v31.16b /* q24 := tweak[0] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v25.16b, v31.16b /* q25 := tweak[1] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v26.16b, v31.16b /* q26 := tweak[2] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v27.16b, v31.16b /* q27 := tweak[3] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v28.16b, v31.16b /* q28 := tweak[4] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v29.16b, v31.16b /* q29 := tweak[5] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
mov v30.16b, v31.16b /* q30 := tweak[6] */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
/* q31 := tweak[7] */
ldp q0, q1, [x1], #0x20 /* q[i] := ctxt[i] */
ldp q2, q3, [x1], #0x20
ldp q4, q5, [x1], #0x20
ldp q6, q7, [x1], #0x20
eor v0.16b, v0.16b, v8.16b /* q[i] := pt[i] ^ tweak[i] */
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
eor v0.16b, v0.16b, v24.16b /* q[i] := ctxt[i] ^ tweak[i] */
eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v27.16b
eor v4.16b, v4.16b, v28.16b
eor v5.16b, v5.16b, v29.16b
eor v6.16b, v6.16b, v30.16b
eor v7.16b, v7.16b, v31.16b
mov x0, x9 /* x0 := deckey */
mov x3, x5 /* x3 := nrounds */
bl aesarmv8_dec8 /* decrypt q0,...,q7; trash x0/x3/q8 */
ldr q8, [sp, #16] /* reload q8 := tweak[0] */
eor v1.16b, v1.16b, v9.16b /* q[i] := AES(...) ^ tweak[i] */
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v0.16b, v0.16b, v8.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
stp q0, q1, [x2], #0x20 /* store ciphertext blocks */
stp q2, q3, [x2], #0x20 /* store ciphertext blocks */
stp q4, q5, [x2], #0x20 /* store ciphertext blocks */
stp q6, q7, [x2], #0x20 /* store ciphertext blocks */
mov v9.16b, v15.16b /* q9 := q15 = tweak[7] */
bl aesarmv8_xts_mulx /* q9 *= x; trash x0/q0/q1 */
bl aesarmv8_dec8 /* decrypt q0-q7; trash x0/x3/q16 */
eor v0.16b, v0.16b, v24.16b /* q[i] := AES(...) ^ tweak[i] */
eor v1.16b, v1.16b, v25.16b
eor v2.16b, v2.16b, v26.16b
eor v3.16b, v3.16b, v27.16b
eor v4.16b, v4.16b, v28.16b
eor v5.16b, v5.16b, v29.16b
eor v6.16b, v6.16b, v30.16b
eor v7.16b, v7.16b, v31.16b
stp q0, q1, [x2], #0x20 /* store plaintext blocks */
stp q2, q3, [x2], #0x20
stp q4, q5, [x2], #0x20
stp q6, q7, [x2], #0x20
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
subs x10, x10, #0x80 /* count down nbytes */
b.ne 1b /* repeat if more block groups */
str q9, [x4] /* update tweak */
ldp fp, lr, [sp], #48 /* pop stack frame */
str q31, [x4] /* update tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_xts_dec8)
/*
* aesarmv8_xts_mulx(tweak@q9)
* aesarmv8_xts_mulx(tweak@q31)
*
* Multiply q9 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
* Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
* Uses x0 and q0/q1 as temporaries.
*/
.text
@@ -836,12 +827,12 @@ aesarmv8_xts_mulx:
* carried into x^128 = x^7 + x^2 + x + 1.
*/
adrl x0, xtscarry
cmlt v1.2d, v9.2d, #0 /* v1.2d[i] := -1 if v9.2d[i] < 0, else 0 */
cmlt v1.2d, v31.2d, #0 /* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
ldr q0, [x0] /* q0 := xtscarry */
ext v1.16b, v1.16b, v1.16b, #8 /* swap halves of q1 */
shl v9.2d, v9.2d, #1 /* shift */
shl v31.2d, v31.2d, #1 /* shift */
and v0.16b, v0.16b, v1.16b /* copy xtscarry according to mask */
eor v9.16b, v9.16b, v0.16b /* incorporate (a) and (b) */
eor v31.16b, v31.16b, v0.16b /* incorporate (a) and (b) */
ret
END(aesarmv8_xts_mulx)
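
For reference, a portable sketch of the same multiplication (illustration only, not the committed code): the 16 tweak bytes are read as a little-endian 128-bit integer per IEEE P1619, shifted left one bit, and bit 127 is folded back in as x^7 + x^2 + x + 1 (0x87). The vector code above does the same thing lane-wise: cmlt builds sign-bit masks, ext swaps the halves, the and instruction selects the xtscarry constants, and shl/eor apply the shift and the carries.

#include <stdint.h>
#include <string.h>

/* Multiply the tweak by x modulo x^128 + x^7 + x^2 + x + 1, in place.
 * Assumes a little-endian host, as on aarch64 NetBSD. */
static void
xts_mulx(uint8_t tweak[16])
{
    uint64_t lo, hi, carry;

    memcpy(&lo, tweak + 0, 8);
    memcpy(&hi, tweak + 8, 8);

    carry = hi >> 63;                     /* bit 127 falls off the top... */
    hi = (hi << 1) | (lo >> 63);          /* bit 63 carries into bit 64 */
    lo = (lo << 1) ^ (carry ? 0x87 : 0);  /* ...and reenters as x^7+x^2+x+1 */

    memcpy(tweak + 0, &lo, 8);
    memcpy(tweak + 8, &hi, 8);
}
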
@@ -862,9 +853,9 @@ END(xtscarry)
ENTRY(aesarmv8_xts_update)
stp fp, lr, [sp, #-16]! /* push stack frame */
mov fp, sp
ldr q9, [x0] /* load tweak */
bl aesarmv8_xts_mulx /* q9 *= x */
str q9, [x1] /* store tweak */
ldr q31, [x0] /* load tweak */
bl aesarmv8_xts_mulx /* q31 *= x; trash x0/q0/q1 */
str q31, [x1] /* store tweak */
ldp fp, lr, [sp], #16 /* pop stack frame */
ret
END(aesarmv8_xts_update)
@@ -875,22 +866,22 @@ END(aesarmv8_xts_update)
*
* Encrypt a single AES block in q0.
*
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3.
* Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/
.text
_ALIGN_TEXT
.type aesarmv8_enc1,@function
aesarmv8_enc1:
ldr q8, [x0], #0x10 /* load round key */
ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1
/* q0 := ShiftRows(SubBytes(AddRoundKey_q8(q0))) */
aese v0.16b, v8.16b
ldr q8, [x0], #0x10 /* load next round key */
/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
aese v0.16b, v16.16b
ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f
/* q0 := MixColumns(q0) */
aesmc v0.16b, v0.16b
b 1b
2: eor v0.16b, v0.16b, v8.16b
2: eor v0.16b, v0.16b, v16.16b
ret
END(aesarmv8_enc1)
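
The loop above leans on the ARMv8 Crypto Extensions: AESE performs AddRoundKey, SubBytes, and ShiftRows in one instruction, AESMC performs MixColumns, and the final round key is applied with a plain XOR, so nrounds+1 round keys are consumed in total. An equivalent sketch with the ACLE intrinsics (illustration only; the function name is hypothetical, and rk is assumed to point at nrounds+1 16-byte round keys):

#include <arm_neon.h>   /* build with the crypto extension, e.g. -march=armv8-a+crypto */

static uint8x16_t
aes_enc_block_neon(const uint8_t *rk, uint8x16_t block, uint32_t nrounds)
{
    uint32_t i;

    for (i = 0; i < nrounds - 1; i++) {
        block = vaeseq_u8(block, vld1q_u8(rk + 16*i)); /* AddRoundKey+SubBytes+ShiftRows */
        block = vaesmcq_u8(block);                     /* MixColumns */
    }
    block = vaeseq_u8(block, vld1q_u8(rk + 16*(nrounds - 1))); /* last round: no MixColumns */
    return veorq_u8(block, vld1q_u8(rk + 16*nrounds));         /* final AddRoundKey */
}
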
@@ -901,24 +892,24 @@ END(aesarmv8_enc1)
*
* Encrypt eight AES blocks in q0 through q7 in parallel.
*
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3.
* Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/
.text
_ALIGN_TEXT
.type aesarmv8_enc8,@function
aesarmv8_enc8:
ldr q8, [x0], #0x10 /* load round key */
ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1
/* q[i] := ShiftRows(SubBytes(AddRoundKey_q8(q[i]))) */
aese v0.16b, v8.16b
aese v1.16b, v8.16b
aese v2.16b, v8.16b
aese v3.16b, v8.16b
aese v4.16b, v8.16b
aese v5.16b, v8.16b
aese v6.16b, v8.16b
aese v7.16b, v8.16b
ldr q8, [x0], #0x10 /* load next round key */
/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
aese v0.16b, v16.16b
aese v1.16b, v16.16b
aese v2.16b, v16.16b
aese v3.16b, v16.16b
aese v4.16b, v16.16b
aese v5.16b, v16.16b
aese v6.16b, v16.16b
aese v7.16b, v16.16b
ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f
/* q[i] := MixColumns(q[i]) */
aesmc v0.16b, v0.16b
@@ -930,14 +921,14 @@ aesarmv8_enc8:
aesmc v6.16b, v6.16b
aesmc v7.16b, v7.16b
b 1b
2: eor v0.16b, v0.16b, v8.16b /* AddRoundKey */
eor v1.16b, v1.16b, v8.16b
eor v2.16b, v2.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */
eor v1.16b, v1.16b, v16.16b
eor v2.16b, v2.16b, v16.16b
eor v3.16b, v3.16b, v16.16b
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v16.16b
eor v6.16b, v6.16b, v16.16b
eor v7.16b, v7.16b, v16.16b
ret
END(aesarmv8_enc8)
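
The eight-block variant runs the same round structure over eight independent blocks with one shared round key per round, which lets the AESE/AESMC operations for different blocks overlap in the pipeline rather than serialize on a single dependency chain. A sketch with intrinsics (illustration only; the function name is hypothetical):

#include <arm_neon.h>

static void
aes_enc_8blocks_neon(const uint8_t *rk, uint8x16_t b[8], uint32_t nrounds)
{
    uint8x16_t k;
    uint32_t i;
    unsigned j;

    for (i = 0; i < nrounds - 1; i++) {
        k = vld1q_u8(rk + 16*i);
        for (j = 0; j < 8; j++)           /* eight independent dependency chains */
            b[j] = vaesmcq_u8(vaeseq_u8(b[j], k));
    }
    k = vld1q_u8(rk + 16*(nrounds - 1));
    for (j = 0; j < 8; j++)
        b[j] = vaeseq_u8(b[j], k);        /* last round: no MixColumns */
    k = vld1q_u8(rk + 16*nrounds);
    for (j = 0; j < 8; j++)
        b[j] = veorq_u8(b[j], k);         /* final AddRoundKey */
}
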
@@ -947,22 +938,22 @@ END(aesarmv8_enc8)
*
* Decrypt a single AES block in q0.
*
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3.
* Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/
.text
_ALIGN_TEXT
.type aesarmv8_dec1,@function
aesarmv8_dec1:
ldr q8, [x0], #0x10 /* load round key */
ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1
/* q0 := InSubBytes(InShiftRows(AddRoundKey_q8(q0))) */
aesd v0.16b, v8.16b
ldr q8, [x0], #0x10 /* load next round key */
/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
aesd v0.16b, v16.16b
ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f
/* q0 := InMixColumns(q0) */
aesimc v0.16b, v0.16b
b 1b
2: eor v0.16b, v0.16b, v8.16b
2: eor v0.16b, v0.16b, v16.16b
ret
END(aesarmv8_dec1)
@@ -973,24 +964,24 @@ END(aesarmv8_dec1)
*
* Decrypt eight AES blocks in q0 through q7 in parallel.
*
* Internal ABI. Uses q8 as temporary. Destroys x0 and x3.
* Internal ABI. Uses q16 as temporary. Destroys x0 and x3.
*/
.text
_ALIGN_TEXT
.type aesarmv8_dec8,@function
aesarmv8_dec8:
ldr q8, [x0], #0x10 /* load round key */
ldr q16, [x0], #0x10 /* load round key */
1: subs x3, x3, #1
/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q8(q[i]))) */
aesd v0.16b, v8.16b
aesd v1.16b, v8.16b
aesd v2.16b, v8.16b
aesd v3.16b, v8.16b
aesd v4.16b, v8.16b
aesd v5.16b, v8.16b
aesd v6.16b, v8.16b
aesd v7.16b, v8.16b
ldr q8, [x0], #0x10 /* load next round key */
/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
aesd v0.16b, v16.16b
aesd v1.16b, v16.16b
aesd v2.16b, v16.16b
aesd v3.16b, v16.16b
aesd v4.16b, v16.16b
aesd v5.16b, v16.16b
aesd v6.16b, v16.16b
aesd v7.16b, v16.16b
ldr q16, [x0], #0x10 /* load next round key */
b.eq 2f
/* q[i] := InMixColumns(q[i]) */
aesimc v0.16b, v0.16b
@@ -1002,13 +993,13 @@ aesarmv8_dec8:
aesimc v6.16b, v6.16b
aesimc v7.16b, v7.16b
b 1b
2: eor v0.16b, v0.16b, v8.16b /* AddRoundKey */
eor v1.16b, v1.16b, v8.16b
eor v2.16b, v2.16b, v8.16b
eor v3.16b, v3.16b, v8.16b
eor v4.16b, v4.16b, v8.16b
eor v5.16b, v5.16b, v8.16b
eor v6.16b, v6.16b, v8.16b
eor v7.16b, v7.16b, v8.16b
2: eor v0.16b, v0.16b, v16.16b /* AddRoundKey */
eor v1.16b, v1.16b, v16.16b
eor v2.16b, v2.16b, v16.16b
eor v3.16b, v3.16b, v16.16b
eor v4.16b, v4.16b, v16.16b
eor v5.16b, v5.16b, v16.16b
eor v6.16b, v6.16b, v16.16b
eor v7.16b, v7.16b, v16.16b
ret
END(aesarmv8_dec8)