aes neon: Gather mc_forward/backward so we can load 256 bits at once.

This commit is contained in:
riastradh 2020-09-10 11:31:03 +00:00
parent 3e1dd6a02d
commit ea2d112d7c

View File

@ -1,4 +1,4 @@
/* $NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $ */
/* $NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@ -28,7 +28,7 @@
#include <arm/asm.h>
RCSID("$NetBSD: aes_neon_32.S,v 1.10 2020/09/10 11:30:28 riastradh Exp $")
RCSID("$NetBSD: aes_neon_32.S,v 1.11 2020/09/10 11:31:03 riastradh Exp $")
.fpu neon
@ -54,36 +54,26 @@ inva:
.byte 0x09,0x08,0x05,0x02,0x0C,0x0E,0x0D,0x03
END(inva)
.type mc_forward,_ASM_TYPE_OBJECT
mc_forward:
.byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 /* 0 */
.type mc,_ASM_TYPE_OBJECT
mc:
.byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04 /* 0 forward */
.byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C
.byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 /* 1 */
.byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
.byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C /* 2 */
.byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
.Lmc_forward_3:
.byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 /* 3 */
.byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
END(mc_forward)
.type mc_backward,_ASM_TYPE_OBJECT
mc_backward:
.byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 /* 0 */
.byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06 /* 0 backward */
.byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E
.byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 /* 1 */
.byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08 /* 1 forward */
.byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00
.byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02 /* 1 backward */
.byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A
.byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E /* 2 */
.byte 0x09,0x0A,0x0B,0x08,0x0D,0x0E,0x0F,0x0C /* 2 forward */
.byte 0x01,0x02,0x03,0x00,0x05,0x06,0x07,0x04
.byte 0x0B,0x08,0x09,0x0A,0x0F,0x0C,0x0D,0x0E /* 2 backward */
.byte 0x03,0x00,0x01,0x02,0x07,0x04,0x05,0x06
.byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A /* 3 */
.Lmc_forward_3:
.byte 0x0D,0x0E,0x0F,0x0C,0x01,0x02,0x03,0x00 /* 3 forward */
.byte 0x05,0x06,0x07,0x04,0x09,0x0A,0x0B,0x08
.byte 0x07,0x04,0x05,0x06,0x0B,0x08,0x09,0x0A /* 3 backward */
.byte 0x0F,0x0C,0x0D,0x0E,0x03,0x00,0x01,0x02
END(mc_backward)
END(mc)
.type sr,_ASM_TYPE_OBJECT
sr:
@ -210,8 +200,7 @@ ENTRY(aes_neon_enc1)
/*
* r3: rmod4
* r4: mc_forward
* r5: mc_backward
* r4: mc
* r6,r8,r10,ip: temporaries
* q0={d0-d1}: x/ak/A
* q1={d2-d3}: 0x0f0f...
@ -225,8 +214,8 @@ ENTRY(aes_neon_enc1)
* q9={d18-d19}: sb2[1]
* q10={d20-d21}: inv
* q11={d22-d23}: inva
* q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc_backward[rmod4]
* q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc_forward[rmod4]
* q12={d24-d25}: ir/iak/iakr/sb1_0(io)/mc[rmod4].forward
* q13={d26-d27}: jr/jak/jakr/sb1_1(jo)/mc[rmod4].backward
* q14={d28-d29}: rk/A2/A2_B_D
* q15={d30-d31}: A2_B/sr[rmod4]
*/
@ -254,9 +243,8 @@ ENTRY(aes_neon_enc1)
vld1.8 {q8-q9}, [r6 :256] /* q8 = sb2[0], q9 = sb2[1] */
vld1.8 {q10-q11}, [r8 :256] /* q10 = inv, q11 = inva */
/* (r4, r5) := (&mc_forward[0], &mc_backward[0]) */
add r4, ip, #(mc_forward - .Lconstants)
add r5, ip, #(mc_backward - .Lconstants)
/* r4 := mc */
add r4, ip, #(mc - .Lconstants)
/* (q2, q3) := (lo, hi) */
vshr.u8 q3, q0, #4
@ -291,13 +279,11 @@ ENTRY(aes_neon_enc1)
vtbl.8 d25, {q8}, d5
vtbl.8 d26, {q9}, d6
vtbl.8 d27, {q9}, d7
add r6, r4, r3, lsl #5 /* r6 := &mc[rmod4] */
veor q14, q12, q13
/* (q12, q13) := (mc_forward[rmod4], mc_backward[rmod4]) */
add r6, r4, r3, lsl #4
add r8, r5, r3, lsl #4
vld1.8 {q12}, [r6 :128]
vld1.8 {q13}, [r8 :128]
/* (q12, q13) := (mc[rmod4].forward, mc[rmod4].backward) */
vld1.8 {q12-q13}, [r6 :256]
/* q15 := A2_B = A2 + A(mcf) */
vtbl.8 d30, {q0}, d24
@ -474,7 +460,7 @@ ENTRY(aes_neon_dec1)
add r8, ip, #(.Lmc_forward_3 - .Lconstants)
vld1.8 {q6-q7}, [r4 :256] /* q6 := dsbb[0], q7 := dsbb[1] */
vld1.8 {q10-q11}, [r6 :256] /* q10 := inv, q11 := inva */
vld1.8 {q15}, [r8 :128] /* q15 := mc_forward[3] */
vld1.8 {q15}, [r8 :128] /* q15 := mc[3].forward */
/* (q2, q3) := (lo, hi) */
vshr.u8 q3, q0, #4