Update the ARM asm support code from version 1.0.2 with some local

modifications to make it compile and work on armv4.
This commit is contained in:
martin 2015-03-10 13:28:47 +00:00
parent f7c3fbfac0
commit e6ba3d7169
17 changed files with 12052 additions and 12 deletions

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
.PATH.S: ${.PARSEDIR}
AES_SRCS = aes-armv4.S aesv8-armx.S bsaes-armv7.S aes_cbc.c
AESCPPFLAGS = -DAES_ASM -DBSAES_ASM
.include "../../aes.inc"

View File

@ -0,0 +1,733 @@
#include "arm_arch.h"
#include "arm_asm.h"
#if __ARM_MAX_ARCH__>=7
.text
.arch armv7-a
.fpu neon
.code 32
.align 5
rcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl aes_v8_set_encrypt_key
.type aes_v8_set_encrypt_key,%function
.align 5
aes_v8_set_encrypt_key:
.Lenc_key:
mov r3,#-1
cmp r0,#0
beq .Lenc_key_abort
cmp r2,#0
beq .Lenc_key_abort
mov r3,#-2
cmp r1,#128
blt .Lenc_key_abort
cmp r1,#256
bgt .Lenc_key_abort
tst r1,#0x3f
bne .Lenc_key_abort
adr r3,rcon
cmp r1,#192
veor q0,q0,q0
vld1.8 {q3},[r0]!
mov r1,#8 @ reuse r1
vld1.32 {q1,q2},[r3]!
blt .Loop128
beq .L192
b .L256
.align 4
.Loop128:
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
bne .Loop128
vld1.32 {q1},[r3]
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
veor q3,q3,q10
vst1.32 {q3},[r2]
add r2,r2,#0x50
mov r12,#10
b .Ldone
.align 4
.L192:
vld1.8 {d16},[r0]!
vmov.i8 q10,#8 @ borrow q10
vst1.32 {q3},[r2]!
vsub.i8 q2,q2,q10 @ adjust the mask
.Loop192:
vtbl.8 d20,{q8},d4
vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12
vst1.32 {d16},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vdup.32 q9,d7[1]
veor q9,q9,q8
veor q10,q10,q1
vext.8 q8,q0,q8,#12
vshl.u8 q1,q1,#1
veor q8,q8,q9
veor q3,q3,q10
veor q8,q8,q10
vst1.32 {q3},[r2]!
bne .Loop192
mov r12,#12
add r2,r2,#0x20
b .Ldone
.align 4
.L256:
vld1.8 {q8},[r0]
mov r1,#7
mov r12,#14
vst1.32 {q3},[r2]!
.Loop256:
vtbl.8 d20,{q8},d4
vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12
vst1.32 {q8},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
vst1.32 {q3},[r2]!
beq .Ldone
vdup.32 q10,d7[1]
vext.8 q9,q0,q8,#12
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q8,q8,q9
vext.8 q9,q0,q9,#12
veor q8,q8,q9
vext.8 q9,q0,q9,#12
veor q8,q8,q9
veor q8,q8,q10
b .Loop256
.Ldone:
str r12,[r2]
mov r3,#0
.Lenc_key_abort:
mov r0,r3 @ return value
RET
.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
.globl aes_v8_set_decrypt_key
.type aes_v8_set_decrypt_key,%function
.align 5
aes_v8_set_decrypt_key:
stmdb sp!,{r4,lr}
bl .Lenc_key
cmp r0,#0
bne .Ldec_key_abort
sub r2,r2,#240 @ restore original r2
mov r4,#-16
add r0,r2,r12,lsl#4 @ end of key schedule
vld1.32 {q0},[r2]
vld1.32 {q1},[r0]
vst1.32 {q0},[r0],r4
vst1.32 {q1},[r2]!
.Loop_imc:
vld1.32 {q0},[r2]
vld1.32 {q1},[r0]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vst1.32 {q0},[r0],r4
vst1.32 {q1},[r2]!
cmp r0,r2
bhi .Loop_imc
vld1.32 {q0},[r2]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
vst1.32 {q0},[r0]
eor r0,r0,r0 @ return value
.Ldec_key_abort:
ldmia sp!,{r4,pc}
.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl aes_v8_encrypt
.type aes_v8_encrypt,%function
.align 5
aes_v8_encrypt:
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
sub r3,r3,#2
vld1.32 {q1},[r2]!
.Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
vld1.32 {q1},[r2]!
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
bgt .Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
vld1.32 {q0},[r2]
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0
vst1.8 {q2},[r1]
RET
.size aes_v8_encrypt,.-aes_v8_encrypt
.globl aes_v8_decrypt
.type aes_v8_decrypt,%function
.align 5
aes_v8_decrypt:
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
sub r3,r3,#2
vld1.32 {q1},[r2]!
.Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
vld1.32 {q1},[r2]!
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
bgt .Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
vld1.32 {q0},[r2]
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0
vst1.8 {q2},[r1]
RET
.size aes_v8_decrypt,.-aes_v8_decrypt
.globl aes_v8_cbc_encrypt
.type aes_v8_cbc_encrypt,%function
.align 5
aes_v8_cbc_encrypt:
mov ip,sp
stmdb sp!,{r4-r8,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load remaining args
subs r2,r2,#16
mov r8,#16
blo .Lcbc_abort
moveq r8,#0
cmp r5,#0 @ en- or decrypting?
ldr r5,[r3,#240]
and r2,r2,#-16
vld1.8 {q6},[r4]
vld1.8 {q0},[r0],r8
vld1.32 {q8-q9},[r3] @ load key schedule...
sub r5,r5,#6
add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
sub r5,r5,#2
vld1.32 {q10-q11},[r7]!
vld1.32 {q12-q13},[r7]!
vld1.32 {q14-q15},[r7]!
vld1.32 {q7},[r7]
add r7,r3,#32
mov r6,r5
beq .Lcbc_dec
cmp r5,#2
veor q0,q0,q6
veor q5,q8,q7
beq .Lcbc_enc128
.Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
bgt .Loop_cbc_enc
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r7,r3,#16
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
mov r6,r5
veor q6,q0,q7
vst1.8 {q6},[r1]!
bhs .Loop_cbc_enc
b .Lcbc_done
.align 5
.Lcbc_enc128:
vld1.32 {q2-q3},[r7]
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vst1.8 {q6},[r1]!
.Lenter_cbc_enc128:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q6,q0,q7
bhs .Loop_cbc_enc128
vst1.8 {q6},[r1]!
b .Lcbc_done
.align 5
.Lcbc_dec:
vld1.8 {q10},[r0]!
subs r2,r2,#32 @ bias
add r6,r5,#2
vorr q3,q0,q0
vorr q1,q0,q0
vorr q11,q10,q10
blo .Lcbc_dec_tail
vorr q1,q10,q10
vld1.8 {q10},[r0]!
vorr q2,q0,q0
vorr q3,q1,q1
vorr q11,q10,q10
.Loop3x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
vld1.32 {q8},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
vld1.32 {q9},[r7]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
bgt .Loop3x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
veor q4,q6,q7
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q2,q7
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
veor q9,q3,q7
subs r2,r2,#0x30
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vorr q6,q11,q11
movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
add r0,r0,r6 @ r0 is adjusted in such way that
@ at exit from the loop q1-q10
@ are loaded with last "words"
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
mov r7,r3
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
vld1.8 {q2},[r0]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q3},[r0]!
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
vld1.8 {q11},[r0]!
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
add r6,r5,#2
veor q4,q4,q0
veor q5,q5,q1
veor q10,q10,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vorr q0,q2,q2
vst1.8 {q4},[r1]!
vorr q1,q3,q3
vst1.8 {q5},[r1]!
vst1.8 {q10},[r1]!
vorr q10,q11,q11
bhs .Loop3x_cbc_dec
cmn r2,#0x30
beq .Lcbc_done
nop
.Lcbc_dec_tail:
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
vld1.32 {q8},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
subs r6,r6,#2
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
vld1.32 {q9},[r7]!
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
bgt .Lcbc_dec_tail
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
cmn r2,#0x20
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q6,q7
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
beq .Lcbc_dec_one
veor q5,q5,q1
veor q9,q9,q10
vorr q6,q11,q11
vst1.8 {q5},[r1]!
vst1.8 {q9},[r1]!
b .Lcbc_done
.Lcbc_dec_one:
veor q5,q5,q10
vorr q6,q11,q11
vst1.8 {q5},[r1]!
.Lcbc_done:
vst1.8 {q6},[r4]
.Lcbc_abort:
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl aes_v8_ctr32_encrypt_blocks
.type aes_v8_ctr32_encrypt_blocks,%function
.align 5
aes_v8_ctr32_encrypt_blocks:
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
ldr r5,[r3,#240]
ldr r8, [r4, #12]
vld1.32 {q0},[r4]
vld1.32 {q8-q9},[r3] @ load key schedule...
sub r5,r5,#4
mov r12,#16
cmp r2,#2
add r7,r3,r5,lsl#4 @ pointer to last 5 round keys
sub r5,r5,#2
vld1.32 {q12-q13},[r7]!
vld1.32 {q14-q15},[r7]!
vld1.32 {q7},[r7]
add r7,r3,#32
mov r6,r5
movlo r12,#0
#ifndef __ARMEB__
rev r8, r8
#endif
vorr q1,q0,q0
add r10, r8, #1
vorr q10,q0,q0
add r8, r8, #2
vorr q6,q0,q0
rev r10, r10
vmov.32 d3[1],r10
bls .Lctr32_tail
rev r12, r8
sub r2,r2,#3 @ bias
vmov.32 d21[1],r12
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
bgt .Loop3x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
mov r7,r3
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
vld1.8 {q2},[r0]!
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vorr q0,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
vld1.8 {q3},[r0]!
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
vorr q1,q6,q6
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vld1.8 {q11},[r0]!
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
veor q2,q2,q7
add r10,r8,#2
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
veor q3,q3,q7
add r8,r8,#3
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
veor q11,q11,q7
rev r9,r9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
vmov.32 d3[1], r10
rev r12,r8
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
subs r2,r2,#3
.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
mov r6,r5
veor q2,q2,q4
veor q3,q3,q5
veor q11,q11,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vst1.8 {q2},[r1]!
vst1.8 {q3},[r1]!
vst1.8 {q11},[r1]!
bhs .Loop3x_ctr32
adds r2,r2,#3
beq .Lctr32_done
cmp r2,#1
mov r12,#16
moveq r12,#0
.Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
vld1.32 {q8},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
vld1.32 {q9},[r7]!
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
bgt .Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q2},[r0],r12
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
vld1.8 {q3},[r0]
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
veor q2,q2,q7
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q3,q3,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
cmp r2,#1
veor q2,q2,q0
veor q3,q3,q1
vst1.8 {q2},[r1]!
beq .Lctr32_done
vst1.8 {q3},[r1]
.Lctr32_done:
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif

View File

@ -0,0 +1,5 @@
#if defined (_ARM_ARCH_4T)
# define RET bx lr
#else
# define RET mov pc, lr
#endif

View File

@ -0,0 +1,204 @@
#include "arm_arch.h"
#include "arm_asm.h"
.text
.code 32
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov r4,#0
bic r5,r1,#3<<30 @ a1=a&0x3fffffff
str r4,[sp,#0] @ tab[0]=0
add r6,r5,r5 @ a2=a1<<1
str r5,[sp,#4] @ tab[1]=a1
eor r7,r5,r6 @ a1^a2
str r6,[sp,#8] @ tab[2]=a2
mov r8,r5,lsl#2 @ a4=a1<<2
str r7,[sp,#12] @ tab[3]=a1^a2
eor r9,r5,r8 @ a1^a4
str r8,[sp,#16] @ tab[4]=a4
eor r4,r6,r8 @ a2^a4
str r9,[sp,#20] @ tab[5]=a1^a4
eor r7,r7,r8 @ a1^a2^a4
str r4,[sp,#24] @ tab[6]=a2^a4
and r8,r12,r0,lsl#2
str r7,[sp,#28] @ tab[7]=a1^a2^a4
and r9,r12,r0,lsr#1
ldr r5,[sp,r8] @ tab[b & 0x7]
and r8,r12,r0,lsr#4
ldr r7,[sp,r9] @ tab[b >> 3 & 0x7]
and r9,r12,r0,lsr#7
ldr r6,[sp,r8] @ tab[b >> 6 & 0x7]
eor r5,r5,r7,lsl#3 @ stall
mov r4,r7,lsr#29
ldr r7,[sp,r9] @ tab[b >> 9 & 0x7]
and r8,r12,r0,lsr#10
eor r5,r5,r6,lsl#6
eor r4,r4,r6,lsr#26
ldr r6,[sp,r8] @ tab[b >> 12 & 0x7]
and r9,r12,r0,lsr#13
eor r5,r5,r7,lsl#9
eor r4,r4,r7,lsr#23
ldr r7,[sp,r9] @ tab[b >> 15 & 0x7]
and r8,r12,r0,lsr#16
eor r5,r5,r6,lsl#12
eor r4,r4,r6,lsr#20
ldr r6,[sp,r8] @ tab[b >> 18 & 0x7]
and r9,r12,r0,lsr#19
eor r5,r5,r7,lsl#15
eor r4,r4,r7,lsr#17
ldr r7,[sp,r9] @ tab[b >> 21 & 0x7]
and r8,r12,r0,lsr#22
eor r5,r5,r6,lsl#18
eor r4,r4,r6,lsr#14
ldr r6,[sp,r8] @ tab[b >> 24 & 0x7]
and r9,r12,r0,lsr#25
eor r5,r5,r7,lsl#21
eor r4,r4,r7,lsr#11
ldr r7,[sp,r9] @ tab[b >> 27 & 0x7]
tst r1,#1<<30
and r8,r12,r0,lsr#28
eor r5,r5,r6,lsl#24
eor r4,r4,r6,lsr#8
ldr r6,[sp,r8] @ tab[b >> 30 ]
eorne r5,r5,r0,lsl#30
eorne r4,r4,r0,lsr#2
tst r1,#1<<31
eor r5,r5,r7,lsl#27
eor r4,r4,r7,lsr#5
eorne r5,r5,r0,lsl#31
eorne r4,r4,r0,lsr#1
eor r5,r5,r6,lsl#30
eor r4,r4,r6,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r10,lr}
mov r10,r0 @ reassign 1st argument
mov r0,r3 @ r0=b1
ldr r3,[sp,#32] @ load b0
mov r12,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str r5,[r10,#8]
str r4,[r10,#12]
eor r0,r0,r3 @ flip b0 and b1
eor r1,r1,r2 @ flip a0 and a1
eor r3,r3,r0
eor r2,r2,r1
eor r0,r0,r3
eor r1,r1,r2
bl mul_1x1_ialu @ a0·b0
str r5,[r10]
str r4,[r10,#4]
eor r1,r1,r2
eor r0,r0,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
ldmia r10,{r6-r9}
eor r5,r5,r4
eor r4,r4,r7
eor r5,r5,r6
eor r4,r4,r8
eor r5,r5,r9
eor r4,r4,r9
str r4,[r10,#8]
eor r5,r5,r4
add sp,sp,#32 @ destroy tab[8]
str r5,[r10,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
vmov.32 d26, r2, r1
vmov.32 d27, r12, r3
vmov.i64 d28, #0x0000ffffffffffff
vmov.i64 d29, #0x00000000ffffffff
vmov.i64 d30, #0x000000000000ffff
vext.8 d2, d26, d26, #1 @ A1
vmull.p8 q1, d2, d27 @ F = A1*B
vext.8 d0, d27, d27, #1 @ B1
vmull.p8 q0, d26, d0 @ E = A*B1
vext.8 d4, d26, d26, #2 @ A2
vmull.p8 q2, d4, d27 @ H = A2*B
vext.8 d16, d27, d27, #2 @ B2
vmull.p8 q8, d26, d16 @ G = A*B2
vext.8 d6, d26, d26, #3 @ A3
veor q1, q1, q0 @ L = E + F
vmull.p8 q3, d6, d27 @ J = A3*B
vext.8 d0, d27, d27, #3 @ B3
veor q2, q2, q8 @ M = G + H
vmull.p8 q0, d26, d0 @ I = A*B3
veor d2, d2, d3 @ t0 = (L) (P0 + P1) << 8
vand d3, d3, d28
vext.8 d16, d27, d27, #4 @ B4
veor d4, d4, d5 @ t1 = (M) (P2 + P3) << 16
vand d5, d5, d29
vmull.p8 q8, d26, d16 @ K = A*B4
veor q3, q3, q0 @ N = I + J
veor d2, d2, d3
veor d4, d4, d5
veor d6, d6, d7 @ t2 = (N) (P4 + P5) << 24
vand d7, d7, d30
vext.8 q1, q1, q1, #15
veor d16, d16, d17 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d17, #0
vext.8 q2, q2, q2, #14
veor d6, d6, d7
vmull.p8 q0, d26, d27 @ D = A*B
vext.8 q8, q8, q8, #12
vext.8 q3, q3, q3, #13
veor q1, q1, q2
veor q3, q3, q8
veor q0, q0, q1
veor q0, q0, q3
vst1.32 {q0}, [r0]
RET @ bx lr
#endif
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 5
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif

View File

@ -0,0 +1,581 @@
#include "arm_arch.h"
#include "arm_asm.h"
.text
.code 32
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-bn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov r0,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov r0,r0,lsl#2 @ rescale r0 for byte count
sub sp,sp,r0 @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub r0,r0,#4 @ "num=num-1"
add r4,r2,r0 @ &bp[num-1]
add r0,sp,r0 @ r0 to point at &tp[num-1]
ldr r8,[r0,#14*4] @ &n0
ldr r2,[r2] @ bp[0]
ldr r5,[r1],#4 @ ap[0],ap++
ldr r6,[r3],#4 @ np[0],np++
ldr r8,[r8] @ *n0
str r4,[r0,#15*4] @ save &bp[num]
umull r10,r11,r5,r2 @ ap[0]*bp[0]
str r8,[r0,#14*4] @ save n0 value
mul r8,r10,r8 @ "tp[0]"*n0
mov r12,#0
umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
mov r4,sp
.L1st:
ldr r5,[r1],#4 @ ap[j],ap++
mov r10,r11
ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[0]
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
cmp r4,r0
bne .L1st
adds r12,r12,r11
ldr r4,[r0,#13*4] @ restore bp
mov r14,#0
ldr r8,[r0,#14*4] @ restore n0
adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
str r14,[r0,#4] @ tp[num]=
.Louter:
sub r7,r0,sp @ "original" r0-1 value
sub r1,r1,r7 @ "rewind" ap to &ap[1]
ldr r2,[r4,#4]! @ *(++bp)
sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r5,[r1,#-4] @ ap[0]
ldr r10,[sp] @ tp[0]
ldr r6,[r3,#-4] @ np[0]
ldr r7,[sp,#4] @ tp[1]
mov r11,#0
umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
str r4,[r0,#13*4] @ save bp
mul r8,r10,r8
mov r12,#0
umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
mov r4,sp
.Linner:
ldr r5,[r1],#4 @ ap[j],ap++
adds r10,r11,r7 @ +=tp[j]
ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[i]
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adc r11,r11,#0
ldr r7,[r4,#8] @ tp[j+1]
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
cmp r4,r0
bne .Linner
adds r12,r12,r11
mov r14,#0
ldr r4,[r0,#13*4] @ restore bp
adc r14,r14,#0
ldr r8,[r0,#14*4] @ restore n0
adds r12,r12,r7
ldr r7,[r0,#15*4] @ restore &bp[num]
adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
str r14,[r0,#4] @ tp[num]=
cmp r4,r7
bne .Louter
ldr r2,[r0,#12*4] @ pull rp
add r0,r0,#4 @ r0 to point at &tp[num]
sub r5,r0,sp @ "original" num value
mov r4,sp @ "rewind" r4
mov r1,r4 @ "borrow" r1
sub r3,r3,r5 @ "rewind" r3 to &np[0]
subs r7,r7,r7 @ "clear" carry flag
.Lsub: ldr r7,[r4],#4
ldr r6,[r3],#4
sbcs r7,r7,r6 @ tp[j]-np[j]
str r7,[r2],#4 @ rp[j]=
teq r4,r0 @ preserve carry
bne .Lsub
sbcs r14,r14,#0 @ upmost carry
mov r4,sp @ "rewind" r4
sub r2,r2,r5 @ "rewind" r2
and r1,r4,r14
bic r3,r2,r14
orr r1,r1,r3 @ ap=borrow?tp:rp
.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh
str sp,[r4],#4 @ zap tp
str r7,[r2],#4
cmp r4,r0
bne .Lcopy
add sp,r0,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
RET @ .word 0xe12fff1e
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
sub r7,sp,#16
vld1.32 {d28[0]}, [r2,:32]!
sub r7,r7,r5,lsl#4
vld1.32 {d0-d3}, [r1]! @ can't specify :32 :-(
and r7,r7,#-64
vld1.32 {d30[0]}, [r4,:32]
mov sp,r7 @ alloca
veor d8,d8,d8
subs r8,r5,#8
vzip.16 d28,d8
vmull.u32 q6,d28,d0[0]
vmull.u32 q7,d28,d0[1]
vmull.u32 q8,d28,d1[0]
vshl.i64 d10,d13,#16
vmull.u32 q9,d28,d1[1]
vadd.u64 d10,d10,d12
veor d8,d8,d8
vmul.u32 d29,d10,d30
vmull.u32 q10,d28,d2[0]
vld1.32 {d4-d7}, [r3]!
vmull.u32 q11,d28,d2[1]
vmull.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmull.u32 q13,d28,d3[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank...
vmlal.u32 q6,d29,d4[0]
sub r9,r5,#1
vmlal.u32 q7,d29,d4[1]
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vmov q5,q6
vmlal.u32 q11,d29,d6[1]
vmov q6,q7
vmlal.u32 q12,d29,d7[0]
vmov q7,q8
vmlal.u32 q13,d29,d7[1]
vmov q8,q9
vmov q9,q10
vshr.u64 d10,d10,#16
vmov q10,q11
vmov q11,q12
vadd.u64 d10,d10,d11
vmov q12,q13
veor q13,q13
vshr.u64 d10,d10,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {d28[0]}, [r2,:32]!
veor d8,d8,d8
vzip.16 d28,d8
vadd.u64 d12,d12,d10
vmlal.u32 q6,d28,d0[0]
vmlal.u32 q7,d28,d0[1]
vmlal.u32 q8,d28,d1[0]
vshl.i64 d10,d13,#16
vmlal.u32 q9,d28,d1[1]
vadd.u64 d10,d10,d12
veor d8,d8,d8
subs r9,r9,#1
vmul.u32 d29,d10,d30
vmlal.u32 q10,d28,d2[0]
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q13,d28,d3[1]
vmlal.u32 q6,d29,d4[0]
vmlal.u32 q7,d29,d4[1]
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vmov q5,q6
vmlal.u32 q11,d29,d6[1]
vmov q6,q7
vmlal.u32 q12,d29,d7[0]
vmov q7,q8
vmlal.u32 q13,d29,d7[1]
vmov q8,q9
vmov q9,q10
vshr.u64 d10,d10,#16
vmov q10,q11
vmov q11,q12
vadd.u64 d10,d10,d11
vmov q12,q13
veor q13,q13
vshr.u64 d10,d10,#16
bne .LNEON_outer8
vadd.u64 d12,d12,d10
mov r7,sp
vshr.u64 d10,d12,#16
mov r8,r5
vadd.u64 d13,d13,d10
add r6,sp,#16
vshr.u64 d10,d13,#16
vzip.16 d12,d13
b .LNEON_tail2
.align 4
.LNEON_1st:
vmlal.u32 q6,d29,d4[0]
vld1.32 {d0-d3}, [r1]!
vmlal.u32 q7,d29,d4[1]
subs r8,r8,#8
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vld1.32 {d4-d5}, [r3]!
vmlal.u32 q11,d29,d6[1]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q12,d29,d7[0]
vmlal.u32 q13,d29,d7[1]
vst1.64 {q8-q9}, [r7,:256]!
vmull.u32 q6,d28,d0[0]
vld1.32 {d6-d7}, [r3]!
vmull.u32 q7,d28,d0[1]
vst1.64 {q10-q11}, [r7,:256]!
vmull.u32 q8,d28,d1[0]
vmull.u32 q9,d28,d1[1]
vst1.64 {q12-q13}, [r7,:256]!
vmull.u32 q10,d28,d2[0]
vmull.u32 q11,d28,d2[1]
vmull.u32 q12,d28,d3[0]
vmull.u32 q13,d28,d3[1]
bne .LNEON_1st
vmlal.u32 q6,d29,d4[0]
add r6,sp,#16
vmlal.u32 q7,d29,d4[1]
sub r1,r1,r5,lsl#2 @ rewind r1
vmlal.u32 q8,d29,d5[0]
vld1.64 {q5}, [sp,:128]
vmlal.u32 q9,d29,d5[1]
sub r9,r5,#1
vmlal.u32 q10,d29,d6[0]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q11,d29,d6[1]
vshr.u64 d10,d10,#16
vld1.64 {q6}, [r6, :128]!
vmlal.u32 q12,d29,d7[0]
vst1.64 {q8-q9}, [r7,:256]!
vmlal.u32 q13,d29,d7[1]
vst1.64 {q10-q11}, [r7,:256]!
vadd.u64 d10,d10,d11
veor q4,q4,q4
vst1.64 {q12-q13}, [r7,:256]!
vld1.64 {q7-q8}, [r6, :256]!
vst1.64 {q4}, [r7,:128]
vshr.u64 d10,d10,#16
b .LNEON_outer
.align 4
.LNEON_outer:
vld1.32 {d28[0]}, [r2,:32]!
sub r3,r3,r5,lsl#2 @ rewind r3
vld1.32 {d0-d3}, [r1]!
veor d8,d8,d8
mov r7,sp
vzip.16 d28,d8
sub r8,r5,#8
vadd.u64 d12,d12,d10
vmlal.u32 q6,d28,d0[0]
vld1.64 {q9-q10},[r6,:256]!
vmlal.u32 q7,d28,d0[1]
vmlal.u32 q8,d28,d1[0]
vld1.64 {q11-q12},[r6,:256]!
vmlal.u32 q9,d28,d1[1]
vshl.i64 d10,d13,#16
veor d8,d8,d8
vadd.u64 d10,d10,d12
vld1.64 {q13},[r6,:128]!
vmul.u32 d29,d10,d30
vmlal.u32 q10,d28,d2[0]
vld1.32 {d4-d7}, [r3]!
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q13,d28,d3[1]
.LNEON_inner:
vmlal.u32 q6,d29,d4[0]
vld1.32 {d0-d3}, [r1]!
vmlal.u32 q7,d29,d4[1]
subs r8,r8,#8
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q10,d29,d6[0]
vld1.64 {q6}, [r6, :128]!
vmlal.u32 q11,d29,d6[1]
vst1.64 {q8-q9}, [r7,:256]!
vmlal.u32 q12,d29,d7[0]
vld1.64 {q7-q8}, [r6, :256]!
vmlal.u32 q13,d29,d7[1]
vst1.64 {q10-q11}, [r7,:256]!
vmlal.u32 q6,d28,d0[0]
vld1.64 {q9-q10}, [r6, :256]!
vmlal.u32 q7,d28,d0[1]
vst1.64 {q12-q13}, [r7,:256]!
vmlal.u32 q8,d28,d1[0]
vld1.64 {q11-q12}, [r6, :256]!
vmlal.u32 q9,d28,d1[1]
vld1.32 {d4-d7}, [r3]!
vmlal.u32 q10,d28,d2[0]
vld1.64 {q13}, [r6, :128]!
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vmlal.u32 q13,d28,d3[1]
bne .LNEON_inner
vmlal.u32 q6,d29,d4[0]
add r6,sp,#16
vmlal.u32 q7,d29,d4[1]
sub r1,r1,r5,lsl#2 @ rewind r1
vmlal.u32 q8,d29,d5[0]
vld1.64 {q5}, [sp,:128]
vmlal.u32 q9,d29,d5[1]
subs r9,r9,#1
vmlal.u32 q10,d29,d6[0]
vst1.64 {q6-q7}, [r7,:256]!
vmlal.u32 q11,d29,d6[1]
vld1.64 {q6}, [r6, :128]!
vshr.u64 d10,d10,#16
vst1.64 {q8-q9}, [r7,:256]!
vmlal.u32 q12,d29,d7[0]
vld1.64 {q7-q8}, [r6, :256]!
vmlal.u32 q13,d29,d7[1]
vst1.64 {q10-q11}, [r7,:256]!
vadd.u64 d10,d10,d11
vst1.64 {q12-q13}, [r7,:256]!
vshr.u64 d10,d10,#16
bne .LNEON_outer
mov r7,sp
mov r8,r5
.LNEON_tail:
vadd.u64 d12,d12,d10
vld1.64 {q9-q10}, [r6, :256]!
vshr.u64 d10,d12,#16
vadd.u64 d13,d13,d10
vld1.64 {q11-q12}, [r6, :256]!
vshr.u64 d10,d13,#16
vld1.64 {q13}, [r6, :128]!
vzip.16 d12,d13
.LNEON_tail2:
vadd.u64 d14,d14,d10
vst1.32 {d12[0]}, [r7, :32]!
vshr.u64 d10,d14,#16
vadd.u64 d15,d15,d10
vshr.u64 d10,d15,#16
vzip.16 d14,d15
vadd.u64 d16,d16,d10
vst1.32 {d14[0]}, [r7, :32]!
vshr.u64 d10,d16,#16
vadd.u64 d17,d17,d10
vshr.u64 d10,d17,#16
vzip.16 d16,d17
vadd.u64 d18,d18,d10
vst1.32 {d16[0]}, [r7, :32]!
vshr.u64 d10,d18,#16
vadd.u64 d19,d19,d10
vshr.u64 d10,d19,#16
vzip.16 d18,d19
vadd.u64 d20,d20,d10
vst1.32 {d18[0]}, [r7, :32]!
vshr.u64 d10,d20,#16
vadd.u64 d21,d21,d10
vshr.u64 d10,d21,#16
vzip.16 d20,d21
vadd.u64 d22,d22,d10
vst1.32 {d20[0]}, [r7, :32]!
vshr.u64 d10,d22,#16
vadd.u64 d23,d23,d10
vshr.u64 d10,d23,#16
vzip.16 d22,d23
vadd.u64 d24,d24,d10
vst1.32 {d22[0]}, [r7, :32]!
vshr.u64 d10,d24,#16
vadd.u64 d25,d25,d10
vld1.64 {q6}, [r6, :128]!
vshr.u64 d10,d25,#16
vzip.16 d24,d25
vadd.u64 d26,d26,d10
vst1.32 {d24[0]}, [r7, :32]!
vshr.u64 d10,d26,#16
vadd.u64 d27,d27,d10
vld1.64 {q7-q8}, [r6, :256]!
vshr.u64 d10,d27,#16
vzip.16 d26,d27
subs r8,r8,#8
vst1.32 {d26[0]}, [r7, :32]!
bne .LNEON_tail
vst1.32 {d10[0]}, [r7, :32] @ top-most bit
sub r3,r3,r5,lsl#2 @ rewind r3
subs r1,sp,#0 @ clear carry flag
add r2,sp,r5,lsl#2
.LNEON_sub:
ldmia r1!, {r4-r7}
ldmia r3!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq r1,r2 @ preserves carry
stmia r0!, {r8-r11}
bne .LNEON_sub
ldr r10, [r1] @ load top-most bit
veor q0,q0,q0
sub r11,r2,sp @ this is num*4
veor q1,q1,q1
mov r1,sp
sub r0,r0,r11 @ rewind r0
mov r3,r2 @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia r1!, {r4-r7}
ldmia r0, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [r3,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [r3,:256]! @ wipe
movcc r11,r7
ldmia r1, {r4-r7}
stmia r0!, {r8-r11}
sub r1,r1,#16
ldmia r0, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [r1,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [r3,:256]! @ wipe
movcc r11,r7
teq r1,r2 @ preserves carry
stmia r0!, {r8-r11}
bne .LNEON_copy_n_zap
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
RET @ .word 0xe12fff1e
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif

View File

@ -0,0 +1,210 @@
#include "arm_arch.h"
#include "arm_asm.h"
.text
.code 32
.align 5
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,%function
OPENSSL_atomic_add:
#if __ARM_ARCH__>=6
.Ladd: ldrex r2,[r0]
add r3,r2,r1
strex r2,r3,[r0]
cmp r2,#0
bne .Ladd
mov r0,r3
RET
#else
stmdb sp!,{r4-r6,lr}
ldr r2,.Lspinlock
adr r3,.Lspinlock
mov r4,r0
mov r5,r1
add r6,r3,r2 @ &spinlock
b .+8
.Lspin: bl sched_yield
mov r0,#-1
swp r0,r0,[r6]
cmp r0,#0
bne .Lspin
ldr r2,[r4]
add r2,r2,r5
str r2,[r4]
str r0,[r6] @ release spinlock
ldmia sp!,{r4-r6,lr}
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
#endif
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
.global OPENSSL_cleanse
.type OPENSSL_cleanse,%function
OPENSSL_cleanse:
eor ip,ip,ip
cmp r1,#7
subhs r1,r1,#4
bhs .Lot
cmp r1,#0
beq .Lcleanse_done
.Little:
strb ip,[r0],#1
subs r1,r1,#1
bhi .Little
b .Lcleanse_done
.Lot: tst r0,#3
beq .Laligned
strb ip,[r0],#1
sub r1,r1,#1
b .Lot
.Laligned:
str ip,[r0],#4
subs r1,r1,#4
bhs .Laligned
adds r1,r1,#4
bne .Little
.Lcleanse_done:
#if __ARM_ARCH__>=5
RET
#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
#endif
.size OPENSSL_cleanse,.-OPENSSL_cleanse
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.global _armv7_neon_probe
.type _armv7_neon_probe,%function
_armv7_neon_probe:
vorr q0,q0,q0
RET
.size _armv7_neon_probe,.-_armv7_neon_probe
.global _armv7_tick
.type _armv7_tick,%function
_armv7_tick:
mrrc p15,1,r0,r1,c14 @ CNTVCT
RET
.size _armv7_tick,.-_armv7_tick
.global _armv8_aes_probe
.type _armv8_aes_probe,%function
_armv8_aes_probe:
.byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
RET
.size _armv8_aes_probe,.-_armv8_aes_probe
.global _armv8_sha1_probe
.type _armv8_sha1_probe,%function
_armv8_sha1_probe:
.byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
RET
.size _armv8_sha1_probe,.-_armv8_sha1_probe
.global _armv8_sha256_probe
.type _armv8_sha256_probe,%function
_armv8_sha256_probe:
.byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
RET
.size _armv8_sha256_probe,.-_armv8_sha256_probe
.global _armv8_pmull_probe
.type _armv8_pmull_probe,%function
_armv8_pmull_probe:
.byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
RET
.size _armv8_pmull_probe,.-_armv8_pmull_probe
#endif
.global OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,%function
OPENSSL_wipe_cpu:
#if __ARM_MAX_ARCH__>=7
ldr r0,.LOPENSSL_armcap
adr r1,.LOPENSSL_armcap
ldr r0,[r1,r0]
#endif
eor r2,r2,r2
eor r3,r3,r3
eor ip,ip,ip
#if __ARM_MAX_ARCH__>=7
tst r0,#1
beq .Lwipe_done
veor q0, q0, q0
veor q1, q1, q1
veor q2, q2, q2
veor q3, q3, q3
veor q8, q8, q8
veor q9, q9, q9
veor q10, q10, q10
veor q11, q11, q11
veor q12, q12, q12
veor q13, q13, q13
veor q14, q14, q14
veor q15, q15, q15
.Lwipe_done:
#endif
mov r0,sp
#if __ARM_ARCH__>=5
RET
#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
#endif
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.global OPENSSL_instrument_bus
.type OPENSSL_instrument_bus,%function
OPENSSL_instrument_bus:
eor r0,r0,r0
#if __ARM_ARCH__>=5
RET
#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
#endif
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
.global OPENSSL_instrument_bus2
.type OPENSSL_instrument_bus2,%function
OPENSSL_instrument_bus2:
eor r0,r0,r0
#if __ARM_ARCH__>=5
RET
#else
tst lr,#1
moveq pc,lr
.word 0xe12fff1e @ bx lr
#endif
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
.align 5
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.LOPENSSL_armcap
#endif
#if __ARM_ARCH__>=6
.align 5
#else
.Lspinlock:
.word atomic_add_spinlock-.Lspinlock
.align 5
.data
.align 2
atomic_add_spinlock:
.word 0
#endif
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,10 @@
.PATH.S: ${.PARSEDIR}
CPUID_SRCS = armv4cpuid.S armcap.c armv4-mont.S armv4-gf2m.S
CPUID = yes
CPPFLAGS += -DOPENSSL_BN_ASM_MONT -DOPENSSL_BN_ASM_GF2m
.if ${MACHINE_ARCH:M*armv4*} == ""
CPPFLAGS += -D__ARM_MAX_ARCH__=8
.else
CPPFLAGS += -D__ARM_MAX_ARCH__=4
.endif
.include "../../crypto.inc"

View File

@ -1,12 +0,0 @@
# $NetBSD: des.inc,v 1.1 2009/07/19 23:30:45 christos Exp $
# XXX WE NEED A WAY TO DETERMINE THESE AT COMPILE-TIME BASED ON
# XXX COMPILER OPTIONS.
# These are good for StrongARM with -mcpu=strongarm
#CPPFLAGS+= -DDES_RISC1
# These are good for XScale with -mcpu=xscale
#CPPFLAGS+= -DDES_RISC1 -DDES_PTR
.include "${.CURDIR}/des.inc"

View File

@ -0,0 +1,524 @@
#include "arm_arch.h"
#include "arm_asm.h"
.text
.code 32
.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
.type rem_4bit_get,%function
rem_4bit_get:
sub r2,pc,#8
sub r2,r2,#32 @ &rem_4bit
b .Lrem_4bit_got
nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
gcm_ghash_4bit:
sub r12,pc,#8
add r3,r2,r3 @ r3 to point at the end
stmdb sp!,{r3-r11,lr} @ save r3/end too
sub r12,r12,#48 @ &rem_4bit
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
ldrb r12,[r2,#15]
ldrb r14,[r0,#15]
.Louter:
eor r12,r12,r14
and r14,r12,#0xf0
and r12,r12,#0x0f
mov r3,#14
add r7,r1,r12,lsl#4
ldmia r7,{r4-r7} @ load Htbl[nlo]
add r11,r1,r14
ldrb r12,[r2,#14]
and r14,r4,#0xf @ rem
ldmia r11,{r8-r11} @ load Htbl[nhi]
add r14,r14,r14
eor r4,r8,r4,lsr#4
ldrh r8,[sp,r14] @ rem_4bit[rem]
eor r4,r4,r5,lsl#28
ldrb r14,[r0,#14]
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
eor r12,r12,r14
and r14,r12,#0xf0
and r12,r12,#0x0f
eor r7,r7,r8,lsl#16
.Linner:
add r11,r1,r12,lsl#4
and r12,r4,#0xf @ rem
subs r3,r3,#1
add r12,r12,r12
ldmia r11,{r8-r11} @ load Htbl[nlo]
eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
ldrh r8,[sp,r12] @ rem_4bit[rem]
eor r6,r10,r6,lsr#4
ldrplb r12,[r2,r3]
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
add r11,r1,r14
and r14,r4,#0xf @ rem
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
add r14,r14,r14
ldmia r11,{r8-r11} @ load Htbl[nhi]
eor r4,r8,r4,lsr#4
ldrplb r8,[r0,r3]
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
ldrh r9,[sp,r14]
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
eorpl r12,r12,r8
eor r7,r11,r7,lsr#4
andpl r14,r12,#0xf0
andpl r12,r12,#0x0f
eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
bpl .Linner
ldr r3,[sp,#32] @ re-load r3/end
add r2,r2,#16
mov r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r4,r4
str r4,[r0,#12]
#elif defined(__ARMEB__)
str r4,[r0,#12]
#else
mov r9,r4,lsr#8
strb r4,[r0,#12+3]
mov r10,r4,lsr#16
strb r9,[r0,#12+2]
mov r11,r4,lsr#24
strb r10,[r0,#12+1]
strb r11,[r0,#12]
#endif
cmp r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r5,r5
str r5,[r0,#8]
#elif defined(__ARMEB__)
str r5,[r0,#8]
#else
mov r9,r5,lsr#8
strb r5,[r0,#8+3]
mov r10,r5,lsr#16
strb r9,[r0,#8+2]
mov r11,r5,lsr#24
strb r10,[r0,#8+1]
strb r11,[r0,#8]
#endif
ldrneb r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r6,r6
str r6,[r0,#4]
#elif defined(__ARMEB__)
str r6,[r0,#4]
#else
mov r9,r6,lsr#8
strb r6,[r0,#4+3]
mov r10,r6,lsr#16
strb r9,[r0,#4+2]
mov r11,r6,lsr#24
strb r10,[r0,#4+1]
strb r11,[r0,#4]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r7,r7
str r7,[r0,#0]
#elif defined(__ARMEB__)
str r7,[r0,#0]
#else
mov r9,r7,lsr#8
strb r7,[r0,#0+3]
mov r10,r7,lsr#16
strb r9,[r0,#0+2]
mov r11,r7,lsr#24
strb r10,[r0,#0+1]
strb r11,[r0,#0]
#endif
bne .Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
stmdb sp!,{r4-r11,lr}
ldrb r12,[r0,#15]
b rem_4bit_get
.Lrem_4bit_got:
and r14,r12,#0xf0
and r12,r12,#0x0f
mov r3,#14
add r7,r1,r12,lsl#4
ldmia r7,{r4-r7} @ load Htbl[nlo]
ldrb r12,[r0,#14]
add r11,r1,r14
and r14,r4,#0xf @ rem
ldmia r11,{r8-r11} @ load Htbl[nhi]
add r14,r14,r14
eor r4,r8,r4,lsr#4
ldrh r8,[r2,r14] @ rem_4bit[rem]
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
and r14,r12,#0xf0
eor r7,r7,r8,lsl#16
and r12,r12,#0x0f
.Loop:
add r11,r1,r12,lsl#4
and r12,r4,#0xf @ rem
subs r3,r3,#1
add r12,r12,r12
ldmia r11,{r8-r11} @ load Htbl[nlo]
eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
ldrh r8,[r2,r12] @ rem_4bit[rem]
eor r6,r10,r6,lsr#4
ldrplb r12,[r0,r3]
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
add r11,r1,r14
and r14,r4,#0xf @ rem
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
add r14,r14,r14
ldmia r11,{r8-r11} @ load Htbl[nhi]
eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
ldrh r8,[r2,r14] @ rem_4bit[rem]
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
andpl r14,r12,#0xf0
andpl r12,r12,#0x0f
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r4,r4
str r4,[r0,#12]
#elif defined(__ARMEB__)
str r4,[r0,#12]
#else
mov r9,r4,lsr#8
strb r4,[r0,#12+3]
mov r10,r4,lsr#16
strb r9,[r0,#12+2]
mov r11,r4,lsr#24
strb r10,[r0,#12+1]
strb r11,[r0,#12]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r5,r5
str r5,[r0,#8]
#elif defined(__ARMEB__)
str r5,[r0,#8]
#else
mov r9,r5,lsr#8
strb r5,[r0,#8+3]
mov r10,r5,lsr#16
strb r9,[r0,#8+2]
mov r11,r5,lsr#24
strb r10,[r0,#8+1]
strb r11,[r0,#8]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r6,r6
str r6,[r0,#4]
#elif defined(__ARMEB__)
str r6,[r0,#4]
#else
mov r9,r6,lsr#8
strb r6,[r0,#4+3]
mov r10,r6,lsr#16
strb r9,[r0,#4+2]
mov r11,r6,lsr#24
strb r10,[r0,#4+1]
strb r11,[r0,#4]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r7,r7
str r7,[r0,#0]
#elif defined(__ARMEB__)
str r7,[r0,#0]
#else
mov r9,r7,lsr#8
strb r7,[r0,#0+3]
mov r10,r7,lsr#16
strb r9,[r0,#0+2]
mov r11,r7,lsr#24
strb r10,[r0,#0+1]
strb r11,[r0,#0]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 d7,[r1,:64]! @ load H
vmov.i8 q8,#0xe1
vld1.64 d6,[r1,:64]
vshl.i64 d17,#57
vshr.u64 d16,#63 @ t0=0xc2....01
vdup.8 q9,d7[7]
vshr.u64 d26,d6,#63
vshr.s8 q9,#7 @ broadcast carry bit
vshl.i64 q3,q3,#1
vand q8,q8,q9
vorr d7,d26 @ H<<<=1
veor q3,q3,q8 @ twisted H
vstmia r0,{q3}
RET @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 d7,[r0,:64]! @ load Xi
vld1.64 d6,[r0,:64]!
vmov.i64 d29,#0x0000ffffffffffff
vldmia r1,{d26-d27} @ load twisted H
vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 q3,q3
#endif
vmov.i64 d31,#0x000000000000ffff
veor d28,d26,d27 @ Karatsuba pre-processing
mov r3,#16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 d1,[r0,:64]! @ load Xi
vld1.64 d0,[r0,:64]!
vmov.i64 d29,#0x0000ffffffffffff
vldmia r1,{d26-d27} @ load twisted H
vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 q0,q0
#endif
vmov.i64 d31,#0x000000000000ffff
veor d28,d26,d27 @ Karatsuba pre-processing
.Loop_neon:
vld1.64 d7,[r2]! @ load inp
vld1.64 d6,[r2]!
#ifdef __ARMEL__
vrev64.8 q3,q3
#endif
veor q3,q0 @ inp^=Xi
.Lgmult_neon:
vext.8 d16, d26, d26, #1 @ A1
vmull.p8 q8, d16, d6 @ F = A1*B
vext.8 d0, d6, d6, #1 @ B1
vmull.p8 q0, d26, d0 @ E = A*B1
vext.8 d18, d26, d26, #2 @ A2
vmull.p8 q9, d18, d6 @ H = A2*B
vext.8 d22, d6, d6, #2 @ B2
vmull.p8 q11, d26, d22 @ G = A*B2
vext.8 d20, d26, d26, #3 @ A3
veor q8, q8, q0 @ L = E + F
vmull.p8 q10, d20, d6 @ J = A3*B
vext.8 d0, d6, d6, #3 @ B3
veor q9, q9, q11 @ M = G + H
vmull.p8 q0, d26, d0 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29
vext.8 d22, d6, d6, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30
vmull.p8 q11, d26, d22 @ K = A*B4
veor q10, q10, q0 @ N = I + J
veor d16, d16, d17
veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31
vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0
vext.8 q9, q9, q9, #14
veor d20, d20, d21
vmull.p8 q0, d26, d6 @ D = A*B
vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13
veor q8, q8, q9
veor q10, q10, q11
veor q0, q0, q8
veor q0, q0, q10
veor d6,d6,d7 @ Karatsuba pre-processing
vext.8 d16, d28, d28, #1 @ A1
vmull.p8 q8, d16, d6 @ F = A1*B
vext.8 d2, d6, d6, #1 @ B1
vmull.p8 q1, d28, d2 @ E = A*B1
vext.8 d18, d28, d28, #2 @ A2
vmull.p8 q9, d18, d6 @ H = A2*B
vext.8 d22, d6, d6, #2 @ B2
vmull.p8 q11, d28, d22 @ G = A*B2
vext.8 d20, d28, d28, #3 @ A3
veor q8, q8, q1 @ L = E + F
vmull.p8 q10, d20, d6 @ J = A3*B
vext.8 d2, d6, d6, #3 @ B3
veor q9, q9, q11 @ M = G + H
vmull.p8 q1, d28, d2 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29
vext.8 d22, d6, d6, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30
vmull.p8 q11, d28, d22 @ K = A*B4
veor q10, q10, q1 @ N = I + J
veor d16, d16, d17
veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31
vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0
vext.8 q9, q9, q9, #14
veor d20, d20, d21
vmull.p8 q1, d28, d6 @ D = A*B
vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13
veor q8, q8, q9
veor q10, q10, q11
veor q1, q1, q8
veor q1, q1, q10
vext.8 d16, d27, d27, #1 @ A1
vmull.p8 q8, d16, d7 @ F = A1*B
vext.8 d4, d7, d7, #1 @ B1
vmull.p8 q2, d27, d4 @ E = A*B1
vext.8 d18, d27, d27, #2 @ A2
vmull.p8 q9, d18, d7 @ H = A2*B
vext.8 d22, d7, d7, #2 @ B2
vmull.p8 q11, d27, d22 @ G = A*B2
vext.8 d20, d27, d27, #3 @ A3
veor q8, q8, q2 @ L = E + F
vmull.p8 q10, d20, d7 @ J = A3*B
vext.8 d4, d7, d7, #3 @ B3
veor q9, q9, q11 @ M = G + H
vmull.p8 q2, d27, d4 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29
vext.8 d22, d7, d7, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30
vmull.p8 q11, d27, d22 @ K = A*B4
veor q10, q10, q2 @ N = I + J
veor d16, d16, d17
veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31
vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0
vext.8 q9, q9, q9, #14
veor d20, d20, d21
vmull.p8 q2, d27, d7 @ D = A*B
vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13
veor q8, q8, q9
veor q10, q10, q11
veor q2, q2, q8
veor q2, q2, q10
veor q1,q1,q0 @ Karatsuba post-processing
veor q1,q1,q2
veor d1,d1,d2
veor d4,d4,d3 @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 q9,q0,#57 @ 1st phase
vshl.i64 q10,q0,#62
veor q10,q10,q9 @
vshl.i64 q9,q0,#63
veor q10, q10, q9 @
veor d1,d1,d20 @
veor d4,d4,d21
vshr.u64 q10,q0,#1 @ 2nd phase
veor q2,q2,q0
veor q0,q0,q10 @
vshr.u64 q10,q10,#6
vshr.u64 q0,q0,#1 @
veor q0,q0,q2 @
veor q0,q0,q10 @
subs r3,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 q0,q0
#endif
sub r0,#16
vst1.64 d1,[r0,:64]! @ write out Xi
vst1.64 d0,[r0,:64]
RET @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align 2

View File

@ -0,0 +1,117 @@
#include "arm_arch.h"
#include "arm_asm.h"
.text
.fpu neon
.code 32
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
vld1.64 {q9},[r1] @ load H
vmov.i8 q8,#0xe1
vext.8 q3,q9,q9,#8
vshl.i64 q8,q8,#57
vshr.u64 q10,q8,#63
vext.8 q8,q10,q8,#8 @ t0=0xc2....01
vdup.32 q9,d18[1]
vshr.u64 q11,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit
vand q11,q11,q8
vshl.i64 q3,q3,#1
vext.8 q11,q11,q11,#8
vand q8,q8,q9
vorr q3,q3,q11 @ H<<<=1
veor q3,q3,q8 @ twisted H
vst1.64 {q3},[r0]
RET
.size gcm_init_v8,.-gcm_init_v8
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
vld1.64 {q12},[r1] @ load twisted H
vshl.u64 q11,q11,#57
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vext.8 q13,q12,q12,#8
mov r3,#0
vext.8 q3,q9,q9,#8
mov r12,#0
veor q13,q13,q12 @ Karatsuba pre-processing
mov r2,r0
b .Lgmult_v8
.size gcm_gmult_v8,.-gcm_gmult_v8
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
vld1.64 {q0},[r0] @ load [rotated] Xi
subs r3,r3,#16
vmov.i8 q11,#0xe1
mov r12,#16
vld1.64 {q12},[r1] @ load twisted H
moveq r12,#0
vext.8 q0,q0,q0,#8
vshl.u64 q11,q11,#57
vld1.64 {q9},[r2],r12 @ load [rotated] inp
vext.8 q13,q12,q12,#8
#ifndef __ARMEB__
vrev64.8 q0,q0
vrev64.8 q9,q9
#endif
veor q13,q13,q12 @ Karatsuba pre-processing
vext.8 q3,q9,q9,#8
b .Loop_v8
.align 4
.Loop_v8:
vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi
veor q9,q9,q10 @ q9 is rotated inp^Xi
.Lgmult_v8:
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
subs r3,r3,#16
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
moveq r12,#0
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] inp
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
veor q0,q1,q10
vext.8 q3,q9,q9,#8
vext.8 q10,q0,q0,#8 @ 2nd phase
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
bhs .Loop_v8
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
RET
.size gcm_ghash_v8,.-gcm_ghash_v8
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2

View File

@ -0,0 +1,4 @@
.PATH.S: ${.PARSEDIR}
MODES_SRCS = ghash-armv4.S ghashv8-armx.S gcm128.c
MODESCPPFLAGS = -DGHASH_ASM
.include "../../modes.inc"

View File

@ -0,0 +1,7 @@
.if ${MACHINE_ARCH:M*armv4*} == ""
.PATH.S: ${.PARSEDIR}
SHA_SRCS = sha1-armv4-large.S sha256-armv4.S sha512-armv4.S
SHACPPFLAGS = -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
.endif
.include "../../sha.inc"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff