Issue aese/aesmc and aesd/aesimc in pairs.

Advised by the aarch64 optimization guide; increases cgd throughput
by about 10%.
This commit is contained in:
riastradh 2020-07-27 20:54:11 +00:00
parent 57324de2aa
commit 0b26be56cd
1 changed file with 60 additions and 28 deletions
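
Why pairing helps: Arm's software optimization guides (e.g. for the
Cortex-A57 and later cores) note that an aese/aesmc or aesd/aesimc pair
can be fused into a single operation when the second instruction
immediately follows the first and consumes its result, so separating
the pair with other work defeats the fusion. A minimal sketch of the
scheduling difference (illustrative only, registers chosen arbitrarily;
not taken from the commit):

	/* Unfused: an intervening instruction splits the pair. */
	aese	v0.16b, v16.16b		/* AddRoundKey, SubBytes, ShiftRows */
	subs	x3, x3, #1		/* unrelated work between the pair */
	aesmc	v0.16b, v0.16b		/* MixColumns */

	/* Fused: aesmc immediately consumes the aese result, so the
	 * pair can issue as one operation on cores that fuse them. */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	subs	x3, x3, #1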


@@ -1,4 +1,4 @@
-/*	$NetBSD: aes_armv8_64.S,v 1.9 2020/07/27 20:53:22 riastradh Exp $	*/
+/*	$NetBSD: aes_armv8_64.S,v 1.10 2020/07/27 20:54:11 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -1041,15 +1041,18 @@ END(ctr32_inc)
 	.type	aesarmv8_enc1,@function
 aesarmv8_enc1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := MixColumns(q0) */
+1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
 	aese	v0.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
 	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_enc1)
@@ -1067,17 +1070,21 @@ END(aesarmv8_enc1)
 	.type	aesarmv8_enc2,@function
 aesarmv8_enc2:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
 	/* q[i] := AddRoundKey_q16(q[i]) */
 	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	ret
@@ -1097,18 +1104,28 @@ END(aesarmv8_enc2)
 	.type	aesarmv8_enc8,@function
 aesarmv8_enc8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := MixColumns(q[i]) */
+1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
+	aese	v0.16b, v16.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v16.16b
 	aesmc	v1.16b, v1.16b
+	aese	v2.16b, v16.16b
 	aesmc	v2.16b, v2.16b
+	aese	v3.16b, v16.16b
 	aesmc	v3.16b, v3.16b
+	aese	v4.16b, v16.16b
 	aesmc	v4.16b, v4.16b
+	aese	v5.16b, v16.16b
 	aesmc	v5.16b, v5.16b
+	aese	v6.16b, v16.16b
 	aesmc	v6.16b, v6.16b
+	aese	v7.16b, v16.16b
 	aesmc	v7.16b, v7.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
 	aese	v0.16b, v16.16b
 	aese	v1.16b, v16.16b
@@ -1118,9 +1135,9 @@ aesarmv8_enc8:
 	aese	v5.16b, v16.16b
 	aese	v6.16b, v16.16b
 	aese	v7.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
-	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
+	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	eor	v2.16b, v2.16b, v16.16b
 	eor	v3.16b, v3.16b, v16.16b
@@ -1144,15 +1161,19 @@ END(aesarmv8_enc8)
 	.type	aesarmv8_dec1,@function
 aesarmv8_dec1:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q0 := InMixColumns(q0) */
+1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
+	aesd	v0.16b, v16.16b
+	/* q0 := InMixColumns(q0) */
 	aesimc	v0.16b, v0.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
 	aesd	v0.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
+	ldr	q16, [x0]		/* load last round key */
 	/* q0 := AddRoundKey_q16(q0) */
 	eor	v0.16b, v0.16b, v16.16b
 	ret
 END(aesarmv8_dec1)
@@ -1171,18 +1192,29 @@ END(aesarmv8_dec1)
 	.type	aesarmv8_dec8,@function
 aesarmv8_dec8:
 	ldr	q16, [x0], #0x10	/* load round key */
-	b	2f
+	sub	x3, x3, #1
 	_ALIGN_TEXT
-1:	/* q[i] := InMixColumns(q[i]) */
+1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
+	aesd	v0.16b, v16.16b
+	/* q[i] := InMixColumns(q[i]) */
 	aesimc	v0.16b, v0.16b
+	aesd	v1.16b, v16.16b
 	aesimc	v1.16b, v1.16b
+	aesd	v2.16b, v16.16b
 	aesimc	v2.16b, v2.16b
+	aesd	v3.16b, v16.16b
 	aesimc	v3.16b, v3.16b
+	aesd	v4.16b, v16.16b
 	aesimc	v4.16b, v4.16b
+	aesd	v5.16b, v16.16b
 	aesimc	v5.16b, v5.16b
+	aesd	v6.16b, v16.16b
 	aesimc	v6.16b, v6.16b
+	aesd	v7.16b, v16.16b
 	aesimc	v7.16b, v7.16b
-2:	subs	x3, x3, #1
+	ldr	q16, [x0], #0x10	/* load next round key */
+	subs	x3, x3, #1
+	b.ne	1b
 	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
 	aesd	v0.16b, v16.16b
 	aesd	v1.16b, v16.16b
@@ -1192,9 +1224,9 @@ aesarmv8_dec8:
 	aesd	v5.16b, v16.16b
 	aesd	v6.16b, v16.16b
 	aesd	v7.16b, v16.16b
-	ldr	q16, [x0], #0x10	/* load next round key */
-	b.ne	1b
-	eor	v0.16b, v0.16b, v16.16b	/* AddRoundKey */
+	ldr	q16, [x0]		/* load last round key */
+	/* q[i] := AddRoundKey_q16(q[i]) */
+	eor	v0.16b, v0.16b, v16.16b
 	eor	v1.16b, v1.16b, v16.16b
 	eor	v2.16b, v2.16b, v16.16b
 	eor	v3.16b, v3.16b, v16.16b