This commit is contained in:
christos 2017-01-08 01:59:43 +00:00
parent 5a016eb166
commit 631c5ab455
18 changed files with 27386 additions and 90 deletions

View File

@ -1318,7 +1318,7 @@ AES_cbc_encrypt:
leaq .LAES_Td(%rip),%r14 leaq .LAES_Td(%rip),%r14
.Lcbc_picked_te: .Lcbc_picked_te:
movl OPENSSL_ia32cap_P@GOTPCREL(%rip),%r10d movl OPENSSL_ia32cap_P(%rip),%r10d
cmpq $512,%rdx cmpq $512,%rdx
jb .Lcbc_slow_prologue jb .Lcbc_slow_prologue
testq $15,%rdx testq $15,%rdx

View File

@ -1,16 +1,754 @@
#include <machine/asm.h> #include <machine/asm.h>
.text .text
.globl aesni_gcm_encrypt .type _aesni_ctr32_ghash_6x,@function
.type aesni_gcm_encrypt,@function .align 32
aesni_gcm_encrypt: _aesni_ctr32_ghash_6x:
xorl %eax,%eax vmovdqu 32(%r11),%xmm2
.byte 0xf3,0xc3 subq $6,%rdx
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt vpxor %xmm4,%xmm4,%xmm4
vmovdqu 0-128(%rcx),%xmm15
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpaddb %xmm2,%xmm11,%xmm12
vpaddb %xmm2,%xmm12,%xmm13
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm15,%xmm1,%xmm9
vmovdqu %xmm4,16+8(%rsp)
jmp .Loop6x
.align 32
.Loop6x:
addl $100663296,%ebx
jc .Lhandle_ctr32
vmovdqu 0-32(%r9),%xmm3
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm15,%xmm10,%xmm10
vpxor %xmm15,%xmm11,%xmm11
.Lresume_ctr32:
vmovdqu %xmm1,(%r8)
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
xorq %r12,%r12
cmpq %r14,%r15
vaesenc %xmm2,%xmm9,%xmm9
vmovdqu 48+8(%rsp),%xmm0
vpxor %xmm15,%xmm13,%xmm13
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
vaesenc %xmm2,%xmm10,%xmm10
vpxor %xmm15,%xmm14,%xmm14
setnc %r12b
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vaesenc %xmm2,%xmm11,%xmm11
vmovdqu 16-32(%r9),%xmm3
negq %r12
vaesenc %xmm2,%xmm12,%xmm12
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
vpxor %xmm4,%xmm8,%xmm8
vaesenc %xmm2,%xmm13,%xmm13
vpxor %xmm5,%xmm1,%xmm4
andq $0x60,%r12
vmovups 32-128(%rcx),%xmm15
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
vaesenc %xmm2,%xmm14,%xmm14
vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
leaq (%r14,%r12,1),%r14
vaesenc %xmm15,%xmm9,%xmm9
vpxor 16+8(%rsp),%xmm8,%xmm8
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
vmovdqu 64+8(%rsp),%xmm0
vaesenc %xmm15,%xmm10,%xmm10
movbeq 88(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 80(%r14),%r12
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,32+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,40+8(%rsp)
vmovdqu 48-32(%r9),%xmm5
vaesenc %xmm15,%xmm14,%xmm14
vmovups 48-128(%rcx),%xmm15
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm3,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
vaesenc %xmm15,%xmm11,%xmm11
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
vmovdqu 80+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vpxor %xmm1,%xmm4,%xmm4
vmovdqu 64-32(%r9),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vmovups 64-128(%rcx),%xmm15
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
vaesenc %xmm15,%xmm10,%xmm10
movbeq 72(%r14),%r13
vpxor %xmm5,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
movbeq 64(%r14),%r12
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
vmovdqu 96+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,48+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,56+8(%rsp)
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 96-32(%r9),%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vmovups 80-128(%rcx),%xmm15
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
vaesenc %xmm15,%xmm10,%xmm10
movbeq 56(%r14),%r13
vpxor %xmm1,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
vpxor 112+8(%rsp),%xmm8,%xmm8
vaesenc %xmm15,%xmm11,%xmm11
movbeq 48(%r14),%r12
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,64+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,72+8(%rsp)
vpxor %xmm3,%xmm4,%xmm4
vmovdqu 112-32(%r9),%xmm3
vaesenc %xmm15,%xmm14,%xmm14
vmovups 96-128(%rcx),%xmm15
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
vaesenc %xmm15,%xmm10,%xmm10
movbeq 40(%r14),%r13
vpxor %xmm2,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
movbeq 32(%r14),%r12
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,80+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,88+8(%rsp)
vpxor %xmm5,%xmm6,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor %xmm1,%xmm6,%xmm6
vmovups 112-128(%rcx),%xmm15
vpslldq $8,%xmm6,%xmm5
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 16(%r11),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm8,%xmm7,%xmm7
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm5,%xmm4,%xmm4
movbeq 24(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 16(%r14),%r12
vpalignr $8,%xmm4,%xmm4,%xmm0
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
movq %r13,96+8(%rsp)
vaesenc %xmm15,%xmm12,%xmm12
movq %r12,104+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
vmovups 128-128(%rcx),%xmm1
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vmovups 144-128(%rcx),%xmm15
vaesenc %xmm1,%xmm10,%xmm10
vpsrldq $8,%xmm6,%xmm6
vaesenc %xmm1,%xmm11,%xmm11
vpxor %xmm6,%xmm7,%xmm7
vaesenc %xmm1,%xmm12,%xmm12
vpxor %xmm0,%xmm4,%xmm4
movbeq 8(%r14),%r13
vaesenc %xmm1,%xmm13,%xmm13
movbeq 0(%r14),%r12
vaesenc %xmm1,%xmm14,%xmm14
vmovups 160-128(%rcx),%xmm1
cmpl $11,%ebp
jb .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 176-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 192-128(%rcx),%xmm1
je .Lenc_tail
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 208-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 224-128(%rcx),%xmm1
jmp .Lenc_tail
.align 32
.Lhandle_ctr32:
vmovdqu (%r11),%xmm0
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vmovdqu 0-32(%r9),%xmm3
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm15,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm15,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpshufb %xmm0,%xmm14,%xmm14
vpshufb %xmm0,%xmm1,%xmm1
jmp .Lresume_ctr32
.align 32
.Lenc_tail:
vaesenc %xmm15,%xmm9,%xmm9
vmovdqu %xmm7,16+8(%rsp)
vpalignr $8,%xmm4,%xmm4,%xmm8
vaesenc %xmm15,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
vpxor 0(%rdi),%xmm1,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
vpxor 16(%rdi),%xmm1,%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vpxor 32(%rdi),%xmm1,%xmm5
vaesenc %xmm15,%xmm13,%xmm13
vpxor 48(%rdi),%xmm1,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor 64(%rdi),%xmm1,%xmm7
vpxor 80(%rdi),%xmm1,%xmm3
vmovdqu (%r8),%xmm1
vaesenclast %xmm2,%xmm9,%xmm9
vmovdqu 32(%r11),%xmm2
vaesenclast %xmm0,%xmm10,%xmm10
vpaddb %xmm2,%xmm1,%xmm0
movq %r13,112+8(%rsp)
leaq 96(%rdi),%rdi
vaesenclast %xmm5,%xmm11,%xmm11
vpaddb %xmm2,%xmm0,%xmm5
movq %r12,120+8(%rsp)
leaq 96(%rsi),%rsi
vmovdqu 0-128(%rcx),%xmm15
vaesenclast %xmm6,%xmm12,%xmm12
vpaddb %xmm2,%xmm5,%xmm6
vaesenclast %xmm7,%xmm13,%xmm13
vpaddb %xmm2,%xmm6,%xmm7
vaesenclast %xmm3,%xmm14,%xmm14
vpaddb %xmm2,%xmm7,%xmm3
addq $0x60,%r10
subq $0x6,%rdx
jc .L6x_done
vmovups %xmm9,-96(%rsi)
vpxor %xmm15,%xmm1,%xmm9
vmovups %xmm10,-80(%rsi)
vmovdqa %xmm0,%xmm10
vmovups %xmm11,-64(%rsi)
vmovdqa %xmm5,%xmm11
vmovups %xmm12,-48(%rsi)
vmovdqa %xmm6,%xmm12
vmovups %xmm13,-32(%rsi)
vmovdqa %xmm7,%xmm13
vmovups %xmm14,-16(%rsi)
vmovdqa %xmm3,%xmm14
vmovdqu 32+8(%rsp),%xmm7
jmp .Loop6x
.L6x_done:
vpxor 16+8(%rsp),%xmm8,%xmm8
vpxor %xmm4,%xmm8,%xmm8
.byte 0xf3,0xc3
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
.globl aesni_gcm_decrypt .globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function .type aesni_gcm_decrypt,@function
.align 32
aesni_gcm_decrypt: aesni_gcm_decrypt:
xorl %eax,%eax xorq %r10,%r10
cmpq $0x60,%rdx
jb .Lgcm_dec_abort
leaq (%rsp),%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
vmovdqu (%r9),%xmm8
andq $-128,%rsp
vmovdqu (%r11),%xmm0
leaq 128(%rcx),%rcx
leaq 32+32(%r9),%r9
movl 240-128(%rcx),%ebp
vpshufb %xmm0,%xmm8,%xmm8
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Ldec_no_key_aliasing
cmpq $768,%r15
jnc .Ldec_no_key_aliasing
subq %r15,%rsp
.Ldec_no_key_aliasing:
vmovdqu 80(%rdi),%xmm7
leaq (%rdi),%r14
vmovdqu 64(%rdi),%xmm4
leaq -192(%rdi,%rdx,1),%r15
vmovdqu 48(%rdi),%xmm5
shrq $4,%rdx
xorq %r10,%r10
vmovdqu 32(%rdi),%xmm6
vpshufb %xmm0,%xmm7,%xmm7
vmovdqu 16(%rdi),%xmm2
vpshufb %xmm0,%xmm4,%xmm4
vmovdqu (%rdi),%xmm3
vpshufb %xmm0,%xmm5,%xmm5
vmovdqu %xmm4,48(%rsp)
vpshufb %xmm0,%xmm6,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm2,%xmm2
vmovdqu %xmm6,80(%rsp)
vpshufb %xmm0,%xmm3,%xmm3
vmovdqu %xmm2,96(%rsp)
vmovdqu %xmm3,112(%rsp)
call _aesni_ctr32_ghash_6x
vmovups %xmm9,-96(%rsi)
vmovups %xmm10,-80(%rsi)
vmovups %xmm11,-64(%rsi)
vmovups %xmm12,-48(%rsi)
vmovups %xmm13,-32(%rsi)
vmovups %xmm14,-16(%rsi)
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lgcm_dec_abort:
movq %r10,%rax
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt .size aesni_gcm_decrypt,.-aesni_gcm_decrypt
.type _aesni_ctr32_6x,@function
.align 32
_aesni_ctr32_6x:
vmovdqu 0-128(%rcx),%xmm4
vmovdqu 32(%r11),%xmm2
leaq -1(%rbp),%r13
vmovups 16-128(%rcx),%xmm15
leaq 32-128(%rcx),%r12
vpxor %xmm4,%xmm1,%xmm9
addl $100663296,%ebx
jc .Lhandle_ctr32_2
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddb %xmm2,%xmm11,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddb %xmm2,%xmm12,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.align 16
.Loop_ctr32:
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vmovups (%r12),%xmm15
leaq 16(%r12),%r12
decl %r13d
jnz .Loop_ctr32
vmovdqu (%r12),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor 0(%rdi),%xmm3,%xmm4
vaesenc %xmm15,%xmm10,%xmm10
vpxor 16(%rdi),%xmm3,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
vpxor 32(%rdi),%xmm3,%xmm6
vaesenc %xmm15,%xmm12,%xmm12
vpxor 48(%rdi),%xmm3,%xmm8
vaesenc %xmm15,%xmm13,%xmm13
vpxor 64(%rdi),%xmm3,%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vpxor 80(%rdi),%xmm3,%xmm3
leaq 96(%rdi),%rdi
vaesenclast %xmm4,%xmm9,%xmm9
vaesenclast %xmm5,%xmm10,%xmm10
vaesenclast %xmm6,%xmm11,%xmm11
vaesenclast %xmm8,%xmm12,%xmm12
vaesenclast %xmm2,%xmm13,%xmm13
vaesenclast %xmm3,%xmm14,%xmm14
vmovups %xmm9,0(%rsi)
vmovups %xmm10,16(%rsi)
vmovups %xmm11,32(%rsi)
vmovups %xmm12,48(%rsi)
vmovups %xmm13,64(%rsi)
vmovups %xmm14,80(%rsi)
leaq 96(%rsi),%rsi
.byte 0xf3,0xc3
.align 32
.Lhandle_ctr32_2:
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpshufb %xmm0,%xmm14,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpshufb %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,@function
.align 32
aesni_gcm_encrypt:
xorq %r10,%r10
cmpq $288,%rdx
jb .Lgcm_enc_abort
leaq (%rsp),%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
vzeroupper
vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
leaq 128(%rcx),%rcx
vmovdqu (%r11),%xmm0
andq $-128,%rsp
movl 240-128(%rcx),%ebp
andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Lenc_no_key_aliasing
cmpq $768,%r15
jnc .Lenc_no_key_aliasing
subq %r15,%rsp
.Lenc_no_key_aliasing:
leaq (%rsi),%r14
leaq -192(%rsi,%rdx,1),%r15
shrq $4,%rdx
call _aesni_ctr32_6x
vpshufb %xmm0,%xmm9,%xmm8
vpshufb %xmm0,%xmm10,%xmm2
vmovdqu %xmm8,112(%rsp)
vpshufb %xmm0,%xmm11,%xmm4
vmovdqu %xmm2,96(%rsp)
vpshufb %xmm0,%xmm12,%xmm5
vmovdqu %xmm4,80(%rsp)
vpshufb %xmm0,%xmm13,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm14,%xmm7
vmovdqu %xmm6,48(%rsp)
call _aesni_ctr32_6x
vmovdqu (%r9),%xmm8
leaq 32+32(%r9),%r9
subq $12,%rdx
movq $192,%r10
vpshufb %xmm0,%xmm8,%xmm8
call _aesni_ctr32_ghash_6x
vmovdqu 32(%rsp),%xmm7
vmovdqu (%r11),%xmm0
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm7,%xmm7,%xmm1
vmovdqu 32-32(%r9),%xmm15
vmovups %xmm9,-96(%rsi)
vpshufb %xmm0,%xmm9,%xmm9
vpxor %xmm7,%xmm1,%xmm1
vmovups %xmm10,-80(%rsi)
vpshufb %xmm0,%xmm10,%xmm10
vmovups %xmm11,-64(%rsi)
vpshufb %xmm0,%xmm11,%xmm11
vmovups %xmm12,-48(%rsi)
vpshufb %xmm0,%xmm12,%xmm12
vmovups %xmm13,-32(%rsi)
vpshufb %xmm0,%xmm13,%xmm13
vmovups %xmm14,-16(%rsi)
vpshufb %xmm0,%xmm14,%xmm14
vmovdqu %xmm9,16(%rsp)
vmovdqu 48(%rsp),%xmm6
vmovdqu 16-32(%r9),%xmm0
vpunpckhqdq %xmm6,%xmm6,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
vpxor %xmm6,%xmm2,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vmovdqu 64(%rsp),%xmm9
vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm9,%xmm9,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
vpxor %xmm9,%xmm5,%xmm5
vpxor %xmm7,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vmovdqu 80(%rsp),%xmm1
vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm4,%xmm7,%xmm7
vpunpckhqdq %xmm1,%xmm1,%xmm4
vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpxor %xmm6,%xmm9,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 96(%rsp),%xmm2
vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm7,%xmm6,%xmm6
vpunpckhqdq %xmm2,%xmm2,%xmm7
vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
vpxor %xmm2,%xmm7,%xmm7
vpxor %xmm9,%xmm1,%xmm1
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm5,%xmm4,%xmm4
vpxor 112(%rsp),%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
vmovdqu 112-32(%r9),%xmm0
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpxor %xmm6,%xmm5,%xmm5
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
vpxor %xmm4,%xmm7,%xmm4
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm1
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
vpxor %xmm14,%xmm1,%xmm1
vpxor %xmm5,%xmm6,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
vmovdqu 32-32(%r9),%xmm15
vpxor %xmm2,%xmm8,%xmm7
vpxor %xmm4,%xmm9,%xmm6
vmovdqu 16-32(%r9),%xmm0
vpxor %xmm5,%xmm7,%xmm9
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
vpxor %xmm9,%xmm6,%xmm6
vpunpckhqdq %xmm13,%xmm13,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
vpxor %xmm13,%xmm2,%xmm2
vpslldq $8,%xmm6,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vpxor %xmm9,%xmm5,%xmm8
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm6,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm12,%xmm12,%xmm9
vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
vpxor %xmm12,%xmm9,%xmm9
vpxor %xmm14,%xmm13,%xmm13
vpalignr $8,%xmm8,%xmm8,%xmm14
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm11,%xmm11,%xmm1
vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm13,%xmm12,%xmm12
vxorps 16(%rsp),%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm9,%xmm9
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm10,%xmm10,%xmm2
vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
vpxor %xmm10,%xmm2,%xmm2
vpalignr $8,%xmm8,%xmm8,%xmm14
vpxor %xmm12,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm9,%xmm1,%xmm1
vxorps %xmm7,%xmm14,%xmm14
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
vmovdqu 112-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm11,%xmm10,%xmm10
vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
vpxor %xmm4,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
vpxor %xmm10,%xmm7,%xmm7
vpxor %xmm2,%xmm6,%xmm6
vpxor %xmm5,%xmm7,%xmm4
vpxor %xmm4,%xmm6,%xmm6
vpslldq $8,%xmm6,%xmm1
vmovdqu 16(%r11),%xmm3
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm1,%xmm5,%xmm8
vpxor %xmm6,%xmm7,%xmm7
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm2,%xmm8,%xmm8
vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm7,%xmm2,%xmm2
vpxor %xmm2,%xmm8,%xmm8
vpshufb (%r11),%xmm8,%xmm8
vmovdqu %xmm8,-64(%r9)
vzeroupper
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lgcm_enc_abort:
movq %r10,%rax
.byte 0xf3,0xc3
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64

View File

@ -7,6 +7,14 @@
.type aesni_multi_cbc_encrypt,@function .type aesni_multi_cbc_encrypt,@function
.align 32 .align 32
aesni_multi_cbc_encrypt: aesni_multi_cbc_encrypt:
cmpl $2,%edx
jb .Lenc_non_avx
movl OPENSSL_ia32cap_P+4(%rip),%ecx
testl $268435456,%ecx
jnz _avx_cbc_enc_shortcut
jmp .Lenc_non_avx
.align 16
.Lenc_non_avx:
movq %rsp,%rax movq %rsp,%rax
pushq %rbx pushq %rbx
pushq %rbp pushq %rbp
@ -263,6 +271,14 @@ aesni_multi_cbc_encrypt:
.type aesni_multi_cbc_decrypt,@function .type aesni_multi_cbc_decrypt,@function
.align 32 .align 32
aesni_multi_cbc_decrypt: aesni_multi_cbc_decrypt:
cmpl $2,%edx
jb .Ldec_non_avx
movl OPENSSL_ia32cap_P+4(%rip),%ecx
testl $268435456,%ecx
jnz _avx_cbc_dec_shortcut
jmp .Ldec_non_avx
.align 16
.Ldec_non_avx:
movq %rsp,%rax movq %rsp,%rax
pushq %rbx pushq %rbx
pushq %rbp pushq %rbp
@ -505,3 +521,916 @@ aesni_multi_cbc_decrypt:
.Ldec4x_epilogue: .Ldec4x_epilogue:
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
.type aesni_multi_cbc_encrypt_avx,@function
.align 32
aesni_multi_cbc_encrypt_avx:
_avx_cbc_enc_shortcut:
movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $192,%rsp
andq $-128,%rsp
movq %rax,16(%rsp)
.Lenc8x_body:
vzeroupper
vmovdqu (%rsi),%xmm15
leaq 120(%rsi),%rsi
leaq 160(%rdi),%rdi
shrl $1,%edx
.Lenc8x_loop_grande:
xorl %edx,%edx
movl -144(%rdi),%ecx
movq -160(%rdi),%r8
cmpl %edx,%ecx
movq -152(%rdi),%rbx
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -136(%rdi),%xmm2
movl %ecx,32(%rsp)
cmovleq %rsp,%r8
subq %r8,%rbx
movq %rbx,64(%rsp)
movl -104(%rdi),%ecx
movq -120(%rdi),%r9
cmpl %edx,%ecx
movq -112(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -96(%rdi),%xmm3
movl %ecx,36(%rsp)
cmovleq %rsp,%r9
subq %r9,%rbp
movq %rbp,72(%rsp)
movl -64(%rdi),%ecx
movq -80(%rdi),%r10
cmpl %edx,%ecx
movq -72(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -56(%rdi),%xmm4
movl %ecx,40(%rsp)
cmovleq %rsp,%r10
subq %r10,%rbp
movq %rbp,80(%rsp)
movl -24(%rdi),%ecx
movq -40(%rdi),%r11
cmpl %edx,%ecx
movq -32(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -16(%rdi),%xmm5
movl %ecx,44(%rsp)
cmovleq %rsp,%r11
subq %r11,%rbp
movq %rbp,88(%rsp)
movl 16(%rdi),%ecx
movq 0(%rdi),%r12
cmpl %edx,%ecx
movq 8(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 24(%rdi),%xmm6
movl %ecx,48(%rsp)
cmovleq %rsp,%r12
subq %r12,%rbp
movq %rbp,96(%rsp)
movl 56(%rdi),%ecx
movq 40(%rdi),%r13
cmpl %edx,%ecx
movq 48(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 64(%rdi),%xmm7
movl %ecx,52(%rsp)
cmovleq %rsp,%r13
subq %r13,%rbp
movq %rbp,104(%rsp)
movl 96(%rdi),%ecx
movq 80(%rdi),%r14
cmpl %edx,%ecx
movq 88(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 104(%rdi),%xmm8
movl %ecx,56(%rsp)
cmovleq %rsp,%r14
subq %r14,%rbp
movq %rbp,112(%rsp)
movl 136(%rdi),%ecx
movq 120(%rdi),%r15
cmpl %edx,%ecx
movq 128(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 144(%rdi),%xmm9
movl %ecx,60(%rsp)
cmovleq %rsp,%r15
subq %r15,%rbp
movq %rbp,120(%rsp)
testl %edx,%edx
jz .Lenc8x_done
vmovups 16-120(%rsi),%xmm1
vmovups 32-120(%rsi),%xmm0
movl 240-120(%rsi),%eax
vpxor (%r8),%xmm15,%xmm10
leaq 128(%rsp),%rbp
vpxor (%r9),%xmm15,%xmm11
vpxor (%r10),%xmm15,%xmm12
vpxor (%r11),%xmm15,%xmm13
vpxor %xmm10,%xmm2,%xmm2
vpxor (%r12),%xmm15,%xmm10
vpxor %xmm11,%xmm3,%xmm3
vpxor (%r13),%xmm15,%xmm11
vpxor %xmm12,%xmm4,%xmm4
vpxor (%r14),%xmm15,%xmm12
vpxor %xmm13,%xmm5,%xmm5
vpxor (%r15),%xmm15,%xmm13
vpxor %xmm10,%xmm6,%xmm6
movl $1,%ecx
vpxor %xmm11,%xmm7,%xmm7
vpxor %xmm12,%xmm8,%xmm8
vpxor %xmm13,%xmm9,%xmm9
jmp .Loop_enc8x
.align 32
.Loop_enc8x:
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+0(%rsp),%ecx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r8)
vaesenc %xmm1,%xmm4,%xmm4
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r8,%rbx,1),%rbx
cmovgeq %rsp,%r8
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r8,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r8),%xmm15,%xmm10
movq %rbx,64+0(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups -72(%rsi),%xmm1
leaq 16(%r8,%rbx,1),%r8
vmovdqu %xmm10,0(%rbp)
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+4(%rsp),%ecx
movq 64+8(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r9)
vaesenc %xmm0,%xmm4,%xmm4
vaesenc %xmm0,%xmm5,%xmm5
leaq (%r9,%rbx,1),%rbx
cmovgeq %rsp,%r9
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r9,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r9),%xmm15,%xmm11
movq %rbx,64+8(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups -56(%rsi),%xmm0
leaq 16(%r9,%rbx,1),%r9
vmovdqu %xmm11,16(%rbp)
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+8(%rsp),%ecx
movq 64+16(%rsp),%rbx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r10)
vaesenc %xmm1,%xmm4,%xmm4
prefetcht0 15(%r8)
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r10,%rbx,1),%rbx
cmovgeq %rsp,%r10
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r10,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r10),%xmm15,%xmm12
movq %rbx,64+16(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups -40(%rsi),%xmm1
leaq 16(%r10,%rbx,1),%r10
vmovdqu %xmm12,32(%rbp)
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+12(%rsp),%ecx
movq 64+24(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r11)
vaesenc %xmm0,%xmm4,%xmm4
prefetcht0 15(%r9)
vaesenc %xmm0,%xmm5,%xmm5
leaq (%r11,%rbx,1),%rbx
cmovgeq %rsp,%r11
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r11,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r11),%xmm15,%xmm13
movq %rbx,64+24(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups -24(%rsi),%xmm0
leaq 16(%r11,%rbx,1),%r11
vmovdqu %xmm13,48(%rbp)
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+16(%rsp),%ecx
movq 64+32(%rsp),%rbx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r12)
vaesenc %xmm1,%xmm4,%xmm4
prefetcht0 15(%r10)
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r12,%rbx,1),%rbx
cmovgeq %rsp,%r12
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r12,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r12),%xmm15,%xmm10
movq %rbx,64+32(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups -8(%rsi),%xmm1
leaq 16(%r12,%rbx,1),%r12
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+20(%rsp),%ecx
movq 64+40(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r13)
vaesenc %xmm0,%xmm4,%xmm4
prefetcht0 15(%r11)
vaesenc %xmm0,%xmm5,%xmm5
leaq (%rbx,%r13,1),%rbx
cmovgeq %rsp,%r13
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r13,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r13),%xmm15,%xmm11
movq %rbx,64+40(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups 8(%rsi),%xmm0
leaq 16(%r13,%rbx,1),%r13
vaesenc %xmm1,%xmm2,%xmm2
cmpl 32+24(%rsp),%ecx
movq 64+48(%rsp),%rbx
vaesenc %xmm1,%xmm3,%xmm3
prefetcht0 31(%r14)
vaesenc %xmm1,%xmm4,%xmm4
prefetcht0 15(%r12)
vaesenc %xmm1,%xmm5,%xmm5
leaq (%r14,%rbx,1),%rbx
cmovgeq %rsp,%r14
vaesenc %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm1,%xmm7,%xmm7
subq %r14,%rbx
vaesenc %xmm1,%xmm8,%xmm8
vpxor 16(%r14),%xmm15,%xmm12
movq %rbx,64+48(%rsp)
vaesenc %xmm1,%xmm9,%xmm9
vmovups 24(%rsi),%xmm1
leaq 16(%r14,%rbx,1),%r14
vaesenc %xmm0,%xmm2,%xmm2
cmpl 32+28(%rsp),%ecx
movq 64+56(%rsp),%rbx
vaesenc %xmm0,%xmm3,%xmm3
prefetcht0 31(%r15)
vaesenc %xmm0,%xmm4,%xmm4
prefetcht0 15(%r13)
vaesenc %xmm0,%xmm5,%xmm5
leaq (%r15,%rbx,1),%rbx
cmovgeq %rsp,%r15
vaesenc %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesenc %xmm0,%xmm7,%xmm7
subq %r15,%rbx
vaesenc %xmm0,%xmm8,%xmm8
vpxor 16(%r15),%xmm15,%xmm13
movq %rbx,64+56(%rsp)
vaesenc %xmm0,%xmm9,%xmm9
vmovups 40(%rsi),%xmm0
leaq 16(%r15,%rbx,1),%r15
vmovdqu 32(%rsp),%xmm14
prefetcht0 15(%r14)
prefetcht0 15(%r15)
cmpl $11,%eax
jb .Lenc8x_tail
vaesenc %xmm1,%xmm2,%xmm2
vaesenc %xmm1,%xmm3,%xmm3
vaesenc %xmm1,%xmm4,%xmm4
vaesenc %xmm1,%xmm5,%xmm5
vaesenc %xmm1,%xmm6,%xmm6
vaesenc %xmm1,%xmm7,%xmm7
vaesenc %xmm1,%xmm8,%xmm8
vaesenc %xmm1,%xmm9,%xmm9
vmovups 176-120(%rsi),%xmm1
vaesenc %xmm0,%xmm2,%xmm2
vaesenc %xmm0,%xmm3,%xmm3
vaesenc %xmm0,%xmm4,%xmm4
vaesenc %xmm0,%xmm5,%xmm5
vaesenc %xmm0,%xmm6,%xmm6
vaesenc %xmm0,%xmm7,%xmm7
vaesenc %xmm0,%xmm8,%xmm8
vaesenc %xmm0,%xmm9,%xmm9
vmovups 192-120(%rsi),%xmm0
je .Lenc8x_tail
vaesenc %xmm1,%xmm2,%xmm2
vaesenc %xmm1,%xmm3,%xmm3
vaesenc %xmm1,%xmm4,%xmm4
vaesenc %xmm1,%xmm5,%xmm5
vaesenc %xmm1,%xmm6,%xmm6
vaesenc %xmm1,%xmm7,%xmm7
vaesenc %xmm1,%xmm8,%xmm8
vaesenc %xmm1,%xmm9,%xmm9
vmovups 208-120(%rsi),%xmm1
vaesenc %xmm0,%xmm2,%xmm2
vaesenc %xmm0,%xmm3,%xmm3
vaesenc %xmm0,%xmm4,%xmm4
vaesenc %xmm0,%xmm5,%xmm5
vaesenc %xmm0,%xmm6,%xmm6
vaesenc %xmm0,%xmm7,%xmm7
vaesenc %xmm0,%xmm8,%xmm8
vaesenc %xmm0,%xmm9,%xmm9
vmovups 224-120(%rsi),%xmm0
.Lenc8x_tail:
vaesenc %xmm1,%xmm2,%xmm2
vpxor %xmm15,%xmm15,%xmm15
vaesenc %xmm1,%xmm3,%xmm3
vaesenc %xmm1,%xmm4,%xmm4
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesenc %xmm1,%xmm5,%xmm5
vaesenc %xmm1,%xmm6,%xmm6
vpaddd %xmm14,%xmm15,%xmm15
vmovdqu 48(%rsp),%xmm14
vaesenc %xmm1,%xmm7,%xmm7
movq 64(%rsp),%rbx
vaesenc %xmm1,%xmm8,%xmm8
vaesenc %xmm1,%xmm9,%xmm9
vmovups 16-120(%rsi),%xmm1
vaesenclast %xmm0,%xmm2,%xmm2
vmovdqa %xmm15,32(%rsp)
vpxor %xmm15,%xmm15,%xmm15
vaesenclast %xmm0,%xmm3,%xmm3
vaesenclast %xmm0,%xmm4,%xmm4
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesenclast %xmm0,%xmm5,%xmm5
vaesenclast %xmm0,%xmm6,%xmm6
vpaddd %xmm15,%xmm14,%xmm14
vmovdqu -120(%rsi),%xmm15
vaesenclast %xmm0,%xmm7,%xmm7
vaesenclast %xmm0,%xmm8,%xmm8
vmovdqa %xmm14,48(%rsp)
vaesenclast %xmm0,%xmm9,%xmm9
vmovups 32-120(%rsi),%xmm0
vmovups %xmm2,-16(%r8)
subq %rbx,%r8
vpxor 0(%rbp),%xmm2,%xmm2
vmovups %xmm3,-16(%r9)
subq 72(%rsp),%r9
vpxor 16(%rbp),%xmm3,%xmm3
vmovups %xmm4,-16(%r10)
subq 80(%rsp),%r10
vpxor 32(%rbp),%xmm4,%xmm4
vmovups %xmm5,-16(%r11)
subq 88(%rsp),%r11
vpxor 48(%rbp),%xmm5,%xmm5
vmovups %xmm6,-16(%r12)
subq 96(%rsp),%r12
vpxor %xmm10,%xmm6,%xmm6
vmovups %xmm7,-16(%r13)
subq 104(%rsp),%r13
vpxor %xmm11,%xmm7,%xmm7
vmovups %xmm8,-16(%r14)
subq 112(%rsp),%r14
vpxor %xmm12,%xmm8,%xmm8
vmovups %xmm9,-16(%r15)
subq 120(%rsp),%r15
vpxor %xmm13,%xmm9,%xmm9
decl %edx
jnz .Loop_enc8x
movq 16(%rsp),%rax
.Lenc8x_done:
vzeroupper
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Lenc8x_epilogue:
.byte 0xf3,0xc3
.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
.type aesni_multi_cbc_decrypt_avx,@function
.align 32
aesni_multi_cbc_decrypt_avx:
_avx_cbc_dec_shortcut:
movq %rsp,%rax
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $256,%rsp
andq $-256,%rsp
subq $192,%rsp
movq %rax,16(%rsp)
.Ldec8x_body:
vzeroupper
vmovdqu (%rsi),%xmm15
leaq 120(%rsi),%rsi
leaq 160(%rdi),%rdi
shrl $1,%edx
.Ldec8x_loop_grande:
xorl %edx,%edx
movl -144(%rdi),%ecx
movq -160(%rdi),%r8
cmpl %edx,%ecx
movq -152(%rdi),%rbx
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -136(%rdi),%xmm2
movl %ecx,32(%rsp)
cmovleq %rsp,%r8
subq %r8,%rbx
movq %rbx,64(%rsp)
vmovdqu %xmm2,192(%rsp)
movl -104(%rdi),%ecx
movq -120(%rdi),%r9
cmpl %edx,%ecx
movq -112(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -96(%rdi),%xmm3
movl %ecx,36(%rsp)
cmovleq %rsp,%r9
subq %r9,%rbp
movq %rbp,72(%rsp)
vmovdqu %xmm3,208(%rsp)
movl -64(%rdi),%ecx
movq -80(%rdi),%r10
cmpl %edx,%ecx
movq -72(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -56(%rdi),%xmm4
movl %ecx,40(%rsp)
cmovleq %rsp,%r10
subq %r10,%rbp
movq %rbp,80(%rsp)
vmovdqu %xmm4,224(%rsp)
movl -24(%rdi),%ecx
movq -40(%rdi),%r11
cmpl %edx,%ecx
movq -32(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu -16(%rdi),%xmm5
movl %ecx,44(%rsp)
cmovleq %rsp,%r11
subq %r11,%rbp
movq %rbp,88(%rsp)
vmovdqu %xmm5,240(%rsp)
movl 16(%rdi),%ecx
movq 0(%rdi),%r12
cmpl %edx,%ecx
movq 8(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 24(%rdi),%xmm6
movl %ecx,48(%rsp)
cmovleq %rsp,%r12
subq %r12,%rbp
movq %rbp,96(%rsp)
vmovdqu %xmm6,256(%rsp)
movl 56(%rdi),%ecx
movq 40(%rdi),%r13
cmpl %edx,%ecx
movq 48(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 64(%rdi),%xmm7
movl %ecx,52(%rsp)
cmovleq %rsp,%r13
subq %r13,%rbp
movq %rbp,104(%rsp)
vmovdqu %xmm7,272(%rsp)
movl 96(%rdi),%ecx
movq 80(%rdi),%r14
cmpl %edx,%ecx
movq 88(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 104(%rdi),%xmm8
movl %ecx,56(%rsp)
cmovleq %rsp,%r14
subq %r14,%rbp
movq %rbp,112(%rsp)
vmovdqu %xmm8,288(%rsp)
movl 136(%rdi),%ecx
movq 120(%rdi),%r15
cmpl %edx,%ecx
movq 128(%rdi),%rbp
cmovgl %ecx,%edx
testl %ecx,%ecx
vmovdqu 144(%rdi),%xmm9
movl %ecx,60(%rsp)
cmovleq %rsp,%r15
subq %r15,%rbp
movq %rbp,120(%rsp)
vmovdqu %xmm9,304(%rsp)
testl %edx,%edx
jz .Ldec8x_done
vmovups 16-120(%rsi),%xmm1
vmovups 32-120(%rsi),%xmm0
movl 240-120(%rsi),%eax
leaq 192+128(%rsp),%rbp
vmovdqu (%r8),%xmm2
vmovdqu (%r9),%xmm3
vmovdqu (%r10),%xmm4
vmovdqu (%r11),%xmm5
vmovdqu (%r12),%xmm6
vmovdqu (%r13),%xmm7
vmovdqu (%r14),%xmm8
vmovdqu (%r15),%xmm9
vmovdqu %xmm2,0(%rbp)
vpxor %xmm15,%xmm2,%xmm2
vmovdqu %xmm3,16(%rbp)
vpxor %xmm15,%xmm3,%xmm3
vmovdqu %xmm4,32(%rbp)
vpxor %xmm15,%xmm4,%xmm4
vmovdqu %xmm5,48(%rbp)
vpxor %xmm15,%xmm5,%xmm5
vmovdqu %xmm6,64(%rbp)
vpxor %xmm15,%xmm6,%xmm6
vmovdqu %xmm7,80(%rbp)
vpxor %xmm15,%xmm7,%xmm7
vmovdqu %xmm8,96(%rbp)
vpxor %xmm15,%xmm8,%xmm8
vmovdqu %xmm9,112(%rbp)
vpxor %xmm15,%xmm9,%xmm9
xorq $0x80,%rbp
movl $1,%ecx
jmp .Loop_dec8x
.align 32
.Loop_dec8x:
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+0(%rsp),%ecx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r8)
vaesdec %xmm1,%xmm4,%xmm4
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r8,%rbx,1),%rbx
cmovgeq %rsp,%r8
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r8,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r8),%xmm10
movq %rbx,64+0(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups -72(%rsi),%xmm1
leaq 16(%r8,%rbx,1),%r8
vmovdqu %xmm10,128(%rsp)
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+4(%rsp),%ecx
movq 64+8(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r9)
vaesdec %xmm0,%xmm4,%xmm4
vaesdec %xmm0,%xmm5,%xmm5
leaq (%r9,%rbx,1),%rbx
cmovgeq %rsp,%r9
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r9,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r9),%xmm11
movq %rbx,64+8(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups -56(%rsi),%xmm0
leaq 16(%r9,%rbx,1),%r9
vmovdqu %xmm11,144(%rsp)
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+8(%rsp),%ecx
movq 64+16(%rsp),%rbx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r10)
vaesdec %xmm1,%xmm4,%xmm4
prefetcht0 15(%r8)
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r10,%rbx,1),%rbx
cmovgeq %rsp,%r10
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r10,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r10),%xmm12
movq %rbx,64+16(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups -40(%rsi),%xmm1
leaq 16(%r10,%rbx,1),%r10
vmovdqu %xmm12,160(%rsp)
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+12(%rsp),%ecx
movq 64+24(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r11)
vaesdec %xmm0,%xmm4,%xmm4
prefetcht0 15(%r9)
vaesdec %xmm0,%xmm5,%xmm5
leaq (%r11,%rbx,1),%rbx
cmovgeq %rsp,%r11
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r11,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r11),%xmm13
movq %rbx,64+24(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups -24(%rsi),%xmm0
leaq 16(%r11,%rbx,1),%r11
vmovdqu %xmm13,176(%rsp)
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+16(%rsp),%ecx
movq 64+32(%rsp),%rbx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r12)
vaesdec %xmm1,%xmm4,%xmm4
prefetcht0 15(%r10)
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r12,%rbx,1),%rbx
cmovgeq %rsp,%r12
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r12,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r12),%xmm10
movq %rbx,64+32(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups -8(%rsi),%xmm1
leaq 16(%r12,%rbx,1),%r12
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+20(%rsp),%ecx
movq 64+40(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r13)
vaesdec %xmm0,%xmm4,%xmm4
prefetcht0 15(%r11)
vaesdec %xmm0,%xmm5,%xmm5
leaq (%rbx,%r13,1),%rbx
cmovgeq %rsp,%r13
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r13,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r13),%xmm11
movq %rbx,64+40(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups 8(%rsi),%xmm0
leaq 16(%r13,%rbx,1),%r13
vaesdec %xmm1,%xmm2,%xmm2
cmpl 32+24(%rsp),%ecx
movq 64+48(%rsp),%rbx
vaesdec %xmm1,%xmm3,%xmm3
prefetcht0 31(%r14)
vaesdec %xmm1,%xmm4,%xmm4
prefetcht0 15(%r12)
vaesdec %xmm1,%xmm5,%xmm5
leaq (%r14,%rbx,1),%rbx
cmovgeq %rsp,%r14
vaesdec %xmm1,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm1,%xmm7,%xmm7
subq %r14,%rbx
vaesdec %xmm1,%xmm8,%xmm8
vmovdqu 16(%r14),%xmm12
movq %rbx,64+48(%rsp)
vaesdec %xmm1,%xmm9,%xmm9
vmovups 24(%rsi),%xmm1
leaq 16(%r14,%rbx,1),%r14
vaesdec %xmm0,%xmm2,%xmm2
cmpl 32+28(%rsp),%ecx
movq 64+56(%rsp),%rbx
vaesdec %xmm0,%xmm3,%xmm3
prefetcht0 31(%r15)
vaesdec %xmm0,%xmm4,%xmm4
prefetcht0 15(%r13)
vaesdec %xmm0,%xmm5,%xmm5
leaq (%r15,%rbx,1),%rbx
cmovgeq %rsp,%r15
vaesdec %xmm0,%xmm6,%xmm6
cmovgq %rsp,%rbx
vaesdec %xmm0,%xmm7,%xmm7
subq %r15,%rbx
vaesdec %xmm0,%xmm8,%xmm8
vmovdqu 16(%r15),%xmm13
movq %rbx,64+56(%rsp)
vaesdec %xmm0,%xmm9,%xmm9
vmovups 40(%rsi),%xmm0
leaq 16(%r15,%rbx,1),%r15
vmovdqu 32(%rsp),%xmm14
prefetcht0 15(%r14)
prefetcht0 15(%r15)
cmpl $11,%eax
jb .Ldec8x_tail
vaesdec %xmm1,%xmm2,%xmm2
vaesdec %xmm1,%xmm3,%xmm3
vaesdec %xmm1,%xmm4,%xmm4
vaesdec %xmm1,%xmm5,%xmm5
vaesdec %xmm1,%xmm6,%xmm6
vaesdec %xmm1,%xmm7,%xmm7
vaesdec %xmm1,%xmm8,%xmm8
vaesdec %xmm1,%xmm9,%xmm9
vmovups 176-120(%rsi),%xmm1
vaesdec %xmm0,%xmm2,%xmm2
vaesdec %xmm0,%xmm3,%xmm3
vaesdec %xmm0,%xmm4,%xmm4
vaesdec %xmm0,%xmm5,%xmm5
vaesdec %xmm0,%xmm6,%xmm6
vaesdec %xmm0,%xmm7,%xmm7
vaesdec %xmm0,%xmm8,%xmm8
vaesdec %xmm0,%xmm9,%xmm9
vmovups 192-120(%rsi),%xmm0
je .Ldec8x_tail
vaesdec %xmm1,%xmm2,%xmm2
vaesdec %xmm1,%xmm3,%xmm3
vaesdec %xmm1,%xmm4,%xmm4
vaesdec %xmm1,%xmm5,%xmm5
vaesdec %xmm1,%xmm6,%xmm6
vaesdec %xmm1,%xmm7,%xmm7
vaesdec %xmm1,%xmm8,%xmm8
vaesdec %xmm1,%xmm9,%xmm9
vmovups 208-120(%rsi),%xmm1
vaesdec %xmm0,%xmm2,%xmm2
vaesdec %xmm0,%xmm3,%xmm3
vaesdec %xmm0,%xmm4,%xmm4
vaesdec %xmm0,%xmm5,%xmm5
vaesdec %xmm0,%xmm6,%xmm6
vaesdec %xmm0,%xmm7,%xmm7
vaesdec %xmm0,%xmm8,%xmm8
vaesdec %xmm0,%xmm9,%xmm9
vmovups 224-120(%rsi),%xmm0
.Ldec8x_tail:
vaesdec %xmm1,%xmm2,%xmm2
vpxor %xmm15,%xmm15,%xmm15
vaesdec %xmm1,%xmm3,%xmm3
vaesdec %xmm1,%xmm4,%xmm4
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesdec %xmm1,%xmm5,%xmm5
vaesdec %xmm1,%xmm6,%xmm6
vpaddd %xmm14,%xmm15,%xmm15
vmovdqu 48(%rsp),%xmm14
vaesdec %xmm1,%xmm7,%xmm7
movq 64(%rsp),%rbx
vaesdec %xmm1,%xmm8,%xmm8
vaesdec %xmm1,%xmm9,%xmm9
vmovups 16-120(%rsi),%xmm1
vaesdeclast %xmm0,%xmm2,%xmm2
vmovdqa %xmm15,32(%rsp)
vpxor %xmm15,%xmm15,%xmm15
vaesdeclast %xmm0,%xmm3,%xmm3
vpxor 0(%rbp),%xmm2,%xmm2
vaesdeclast %xmm0,%xmm4,%xmm4
vpxor 16(%rbp),%xmm3,%xmm3
vpcmpgtd %xmm15,%xmm14,%xmm15
vaesdeclast %xmm0,%xmm5,%xmm5
vpxor 32(%rbp),%xmm4,%xmm4
vaesdeclast %xmm0,%xmm6,%xmm6
vpxor 48(%rbp),%xmm5,%xmm5
vpaddd %xmm15,%xmm14,%xmm14
vmovdqu -120(%rsi),%xmm15
vaesdeclast %xmm0,%xmm7,%xmm7
vpxor 64(%rbp),%xmm6,%xmm6
vaesdeclast %xmm0,%xmm8,%xmm8
vpxor 80(%rbp),%xmm7,%xmm7
vmovdqa %xmm14,48(%rsp)
vaesdeclast %xmm0,%xmm9,%xmm9
vpxor 96(%rbp),%xmm8,%xmm8
vmovups 32-120(%rsi),%xmm0
vmovups %xmm2,-16(%r8)
subq %rbx,%r8
vmovdqu 128+0(%rsp),%xmm2
vpxor 112(%rbp),%xmm9,%xmm9
vmovups %xmm3,-16(%r9)
subq 72(%rsp),%r9
vmovdqu %xmm2,0(%rbp)
vpxor %xmm15,%xmm2,%xmm2
vmovdqu 128+16(%rsp),%xmm3
vmovups %xmm4,-16(%r10)
subq 80(%rsp),%r10
vmovdqu %xmm3,16(%rbp)
vpxor %xmm15,%xmm3,%xmm3
vmovdqu 128+32(%rsp),%xmm4
vmovups %xmm5,-16(%r11)
subq 88(%rsp),%r11
vmovdqu %xmm4,32(%rbp)
vpxor %xmm15,%xmm4,%xmm4
vmovdqu 128+48(%rsp),%xmm5
vmovups %xmm6,-16(%r12)
subq 96(%rsp),%r12
vmovdqu %xmm5,48(%rbp)
vpxor %xmm15,%xmm5,%xmm5
vmovdqu %xmm10,64(%rbp)
vpxor %xmm10,%xmm15,%xmm6
vmovups %xmm7,-16(%r13)
subq 104(%rsp),%r13
vmovdqu %xmm11,80(%rbp)
vpxor %xmm11,%xmm15,%xmm7
vmovups %xmm8,-16(%r14)
subq 112(%rsp),%r14
vmovdqu %xmm12,96(%rbp)
vpxor %xmm12,%xmm15,%xmm8
vmovups %xmm9,-16(%r15)
subq 120(%rsp),%r15
vmovdqu %xmm13,112(%rbp)
vpxor %xmm13,%xmm15,%xmm9
xorq $128,%rbp
decl %edx
jnz .Loop_dec8x
movq 16(%rsp),%rax
.Ldec8x_done:
vzeroupper
movq -48(%rax),%r15
movq -40(%rax),%r14
movq -32(%rax),%r13
movq -24(%rax),%r12
movq -16(%rax),%rbp
movq -8(%rax),%rbx
leaq (%rax),%rsp
.Ldec8x_epilogue:
.byte 0xf3,0xc3
.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1250,7 +1250,108 @@ gcm_ghash_clmul:
.type gcm_init_avx,@function .type gcm_init_avx,@function
.align 32 .align 32
gcm_init_avx: gcm_init_avx:
jmp .L_init_clmul vzeroupper
vmovdqu (%rsi),%xmm2
vpshufd $78,%xmm2,%xmm2
vpshufd $255,%xmm2,%xmm4
vpsrlq $63,%xmm2,%xmm3
vpsllq $1,%xmm2,%xmm2
vpxor %xmm5,%xmm5,%xmm5
vpcmpgtd %xmm4,%xmm5,%xmm5
vpslldq $8,%xmm3,%xmm3
vpor %xmm3,%xmm2,%xmm2
vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm2,%xmm2,%xmm6
vmovdqa %xmm2,%xmm0
vpxor %xmm2,%xmm6,%xmm6
movq $4,%r10
jmp .Linit_start_avx
.align 32
.Linit_loop_avx:
vpalignr $8,%xmm3,%xmm4,%xmm5
vmovdqu %xmm5,-16(%rdi)
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0
.Linit_start_avx:
vmovdqa %xmm0,%xmm5
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0
vpshufd $78,%xmm5,%xmm3
vpshufd $78,%xmm0,%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqu %xmm5,0(%rdi)
vpxor %xmm0,%xmm4,%xmm4
vmovdqu %xmm0,16(%rdi)
leaq 48(%rdi),%rdi
subq $1,%r10
jnz .Linit_loop_avx
vpalignr $8,%xmm4,%xmm3,%xmm5
vmovdqu %xmm5,-16(%rdi)
vzeroupper
.byte 0xf3,0xc3
.size gcm_init_avx,.-gcm_init_avx .size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx .globl gcm_gmult_avx
.type gcm_gmult_avx,@function .type gcm_gmult_avx,@function
@ -1262,7 +1363,377 @@ gcm_gmult_avx:
.type gcm_ghash_avx,@function .type gcm_ghash_avx,@function
.align 32 .align 32
gcm_ghash_avx: gcm_ghash_avx:
jmp .L_ghash_clmul vzeroupper
vmovdqu (%rdi),%xmm10
leaq .L0x1c2_polynomial(%rip),%r10
leaq 64(%rsi),%rsi
vmovdqu .Lbswap_mask(%rip),%xmm13
vpshufb %xmm13,%xmm10,%xmm10
cmpq $0x80,%rcx
jb .Lshort_avx
subq $0x80,%rcx
vmovdqu 112(%rdx),%xmm14
vmovdqu 0-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vmovdqu 32-64(%rsi),%xmm7
vpunpckhqdq %xmm14,%xmm14,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm14,%xmm9,%xmm9
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 80(%rdx),%xmm14
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 48-64(%rsi),%xmm6
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 64(%rdx),%xmm15
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 48(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 32(%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 16(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu (%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
leaq 128(%rdx),%rdx
cmpq $0x80,%rcx
jb .Ltail_avx
vpxor %xmm10,%xmm15,%xmm15
subq $0x80,%rcx
jmp .Loop8x_avx
.align 32
.Loop8x_avx:
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 112(%rdx),%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpxor %xmm15,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
vmovdqu 0-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
vmovdqu 32-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm3,%xmm10,%xmm10
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vxorps %xmm4,%xmm11,%xmm11
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm5,%xmm12,%xmm12
vxorps %xmm15,%xmm8,%xmm8
vmovdqu 80(%rdx),%xmm14
vpxor %xmm10,%xmm12,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm11,%xmm12,%xmm12
vpslldq $8,%xmm12,%xmm9
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vpsrldq $8,%xmm12,%xmm12
vpxor %xmm9,%xmm10,%xmm10
vmovdqu 48-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vxorps %xmm12,%xmm11,%xmm11
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 64(%rdx),%xmm15
vpalignr $8,%xmm10,%xmm10,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vxorps %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vmovdqu 48(%rdx),%xmm14
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 32(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vxorps %xmm12,%xmm10,%xmm10
vmovdqu 16(%rdx),%xmm14
vpalignr $8,%xmm10,%xmm10,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
vxorps %xmm11,%xmm12,%xmm12
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu (%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm12,%xmm15,%xmm15
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
vpxor %xmm10,%xmm15,%xmm15
leaq 128(%rdx),%rdx
subq $0x80,%rcx
jnc .Loop8x_avx
addq $0x80,%rcx
jmp .Ltail_no_xor_avx
.align 32
.Lshort_avx:
vmovdqu -16(%rdx,%rcx,1),%xmm14
leaq (%rdx,%rcx,1),%rdx
vmovdqu 0-64(%rsi),%xmm6
vmovdqu 32-64(%rsi),%xmm7
vpshufb %xmm13,%xmm14,%xmm15
vmovdqa %xmm0,%xmm3
vmovdqa %xmm1,%xmm4
vmovdqa %xmm2,%xmm5
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -32(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 16-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -48(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 48-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 80-64(%rsi),%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -64(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -80(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 96-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 128-64(%rsi),%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -96(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz .Ltail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -112(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 144-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovq 184-64(%rsi),%xmm7
subq $0x10,%rcx
jmp .Ltail_avx
.align 32
.Ltail_avx:
vpxor %xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu (%r10),%xmm12
vpxor %xmm0,%xmm3,%xmm10
vpxor %xmm1,%xmm4,%xmm11
vpxor %xmm2,%xmm5,%xmm5
vpxor %xmm10,%xmm5,%xmm5
vpxor %xmm11,%xmm5,%xmm5
vpslldq $8,%xmm5,%xmm9
vpsrldq $8,%xmm5,%xmm5
vpxor %xmm9,%xmm10,%xmm10
vpxor %xmm5,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm11,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10
cmpq $0,%rcx
jne .Lshort_avx
vpshufb %xmm13,%xmm10,%xmm10
vmovdqu %xmm10,(%rdi)
vzeroupper
.byte 0xf3,0xc3
.size gcm_ghash_avx,.-gcm_ghash_avx .size gcm_ghash_avx,.-gcm_ghash_avx
.align 64 .align 64
.Lbswap_mask: .Lbswap_mask:

View File

@ -24,7 +24,7 @@ RC4: orq %rsi,%rsi
movb -4(%rdi),%cl movb -4(%rdi),%cl
cmpl $-1,256(%rdi) cmpl $-1,256(%rdi)
je .LRC4_CHAR je .LRC4_CHAR
movl OPENSSL_ia32cap_P@GOTPCREL(%rip),%r8d movl OPENSSL_ia32cap_P(%rip),%r8d
xorq %rbx,%rbx xorq %rbx,%rbx
incb %r10b incb %r10b
subq %r10,%rbx subq %r10,%rbx
@ -531,7 +531,7 @@ private_RC4_set_key:
xorq %r10,%r10 xorq %r10,%r10
xorq %r11,%r11 xorq %r11,%r11
movl OPENSSL_ia32cap_P@GOTPCREL(%rip),%r8d movl OPENSSL_ia32cap_P(%rip),%r8d
btl $20,%r8d btl $20,%r8d
jc .Lc1stloop jc .Lc1stloop
jmp .Lw1stloop jmp .Lw1stloop
@ -595,7 +595,7 @@ private_RC4_set_key:
.align 16 .align 16
RC4_options: RC4_options:
leaq .Lopts(%rip),%rax leaq .Lopts(%rip),%rax
movl OPENSSL_ia32cap_P@GOTPCREL(%rip),%edx movl OPENSSL_ia32cap_P(%rip),%edx
btl $20,%edx btl $20,%edx
jc .L8xchar jc .L8xchar
btl $30,%edx btl $30,%edx

File diff suppressed because it is too large Load Diff

View File

@ -20,6 +20,10 @@ rsaz_512_sqr:
movq (%rsi),%rdx movq (%rsi),%rdx
movq 8(%rsi),%rax movq 8(%rsi),%rax
movq %rcx,128(%rsp) movq %rcx,128(%rsp)
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Loop_sqrx
jmp .Loop_sqr jmp .Loop_sqr
.align 32 .align 32
@ -383,6 +387,276 @@ rsaz_512_sqr:
decl %r8d decl %r8d
jnz .Loop_sqr jnz .Loop_sqr
jmp .Lsqr_tail
.align 32
.Loop_sqrx:
movl %r8d,128+8(%rsp)
.byte 102,72,15,110,199
.byte 102,72,15,110,205
mulxq %rax,%r8,%r9
mulxq 16(%rsi),%rcx,%r10
xorq %rbp,%rbp
mulxq 24(%rsi),%rax,%r11
adcxq %rcx,%r9
mulxq 32(%rsi),%rcx,%r12
adcxq %rax,%r10
mulxq 40(%rsi),%rax,%r13
adcxq %rcx,%r11
.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00
adcxq %rax,%r12
adcxq %rcx,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
adcxq %rax,%r14
adcxq %rbp,%r15
movq %r9,%rcx
shldq $1,%r8,%r9
shlq $1,%r8
xorl %ebp,%ebp
mulxq %rdx,%rax,%rdx
adcxq %rdx,%r8
movq 8(%rsi),%rdx
adcxq %rbp,%r9
movq %rax,(%rsp)
movq %r8,8(%rsp)
mulxq 16(%rsi),%rax,%rbx
adoxq %rax,%r10
adcxq %rbx,%r11
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00
adoxq %rdi,%r11
adcxq %r8,%r12
mulxq 32(%rsi),%rax,%rbx
adoxq %rax,%r12
adcxq %rbx,%r13
mulxq 40(%rsi),%rdi,%r8
adoxq %rdi,%r13
adcxq %r8,%r14
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
adoxq %rax,%r14
adcxq %rbx,%r15
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
adoxq %rdi,%r15
adcxq %rbp,%r8
adoxq %rbp,%r8
movq %r11,%rbx
shldq $1,%r10,%r11
shldq $1,%rcx,%r10
xorl %ebp,%ebp
mulxq %rdx,%rax,%rcx
movq 16(%rsi),%rdx
adcxq %rax,%r9
adcxq %rcx,%r10
adcxq %rbp,%r11
movq %r9,16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00
adoxq %rdi,%r12
adcxq %r9,%r13
mulxq 32(%rsi),%rax,%rcx
adoxq %rax,%r13
adcxq %rcx,%r14
mulxq 40(%rsi),%rdi,%r9
adoxq %rdi,%r14
adcxq %r9,%r15
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
adoxq %rax,%r15
adcxq %rcx,%r8
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00
adoxq %rdi,%r8
adcxq %rbp,%r9
adoxq %rbp,%r9
movq %r13,%rcx
shldq $1,%r12,%r13
shldq $1,%rbx,%r12
xorl %ebp,%ebp
mulxq %rdx,%rax,%rdx
adcxq %rax,%r11
adcxq %rdx,%r12
movq 24(%rsi),%rdx
adcxq %rbp,%r13
movq %r11,32(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00
adoxq %rax,%r14
adcxq %rbx,%r15
mulxq 40(%rsi),%rdi,%r10
adoxq %rdi,%r15
adcxq %r10,%r8
mulxq 48(%rsi),%rax,%rbx
adoxq %rax,%r8
adcxq %rbx,%r9
mulxq 56(%rsi),%rdi,%r10
adoxq %rdi,%r9
adcxq %rbp,%r10
adoxq %rbp,%r10
.byte 0x66
movq %r15,%rbx
shldq $1,%r14,%r15
shldq $1,%rcx,%r14
xorl %ebp,%ebp
mulxq %rdx,%rax,%rdx
adcxq %rax,%r13
adcxq %rdx,%r14
movq 32(%rsi),%rdx
adcxq %rbp,%r15
movq %r13,48(%rsp)
movq %r14,56(%rsp)
.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00
adoxq %rdi,%r8
adcxq %r11,%r9
mulxq 48(%rsi),%rax,%rcx
adoxq %rax,%r9
adcxq %rcx,%r10
mulxq 56(%rsi),%rdi,%r11
adoxq %rdi,%r10
adcxq %rbp,%r11
adoxq %rbp,%r11
movq %r9,%rcx
shldq $1,%r8,%r9
shldq $1,%rbx,%r8
xorl %ebp,%ebp
mulxq %rdx,%rax,%rdx
adcxq %rax,%r15
adcxq %rdx,%r8
movq 40(%rsi),%rdx
adcxq %rbp,%r9
movq %r15,64(%rsp)
movq %r8,72(%rsp)
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
adoxq %rax,%r10
adcxq %rbx,%r11
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
adoxq %rdi,%r11
adcxq %rbp,%r12
adoxq %rbp,%r12
movq %r11,%rbx
shldq $1,%r10,%r11
shldq $1,%rcx,%r10
xorl %ebp,%ebp
mulxq %rdx,%rax,%rdx
adcxq %rax,%r9
adcxq %rdx,%r10
movq 48(%rsi),%rdx
adcxq %rbp,%r11
movq %r9,80(%rsp)
movq %r10,88(%rsp)
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
adoxq %rax,%r12
adoxq %rbp,%r13
xorq %r14,%r14
shldq $1,%r13,%r14
shldq $1,%r12,%r13
shldq $1,%rbx,%r12
xorl %ebp,%ebp
mulxq %rdx,%rax,%rdx
adcxq %rax,%r11
adcxq %rdx,%r12
movq 56(%rsi),%rdx
adcxq %rbp,%r13
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
mulxq %rdx,%rax,%rdx
adoxq %rax,%r13
adoxq %rbp,%rdx
.byte 0x66
addq %rdx,%r14
movq %r13,112(%rsp)
movq %r14,120(%rsp)
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
adcq 88(%rsp),%r11
adcq 96(%rsp),%r12
adcq 104(%rsp),%r13
adcq 112(%rsp),%r14
adcq 120(%rsp),%r15
sbbq %rcx,%rcx
call __rsaz_512_subtract
movq %r8,%rdx
movq %r9,%rax
movl 128+8(%rsp),%r8d
movq %rdi,%rsi
decl %r8d
jnz .Loop_sqrx
.Lsqr_tail:
leaq 128+24+48(%rsp),%rax leaq 128+24+48(%rsp),%rax
movq -48(%rax),%r15 movq -48(%rax),%r15
@ -411,6 +685,10 @@ rsaz_512_mul:
.byte 102,72,15,110,199 .byte 102,72,15,110,199
.byte 102,72,15,110,201 .byte 102,72,15,110,201
movq %r8,128(%rsp) movq %r8,128(%rsp)
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx
movq (%rdx),%rbx movq (%rdx),%rbx
movq %rdx,%rbp movq %rdx,%rbp
call __rsaz_512_mul call __rsaz_512_mul
@ -428,6 +706,29 @@ rsaz_512_mul:
movq 56(%rsp),%r15 movq 56(%rsp),%r15
call __rsaz_512_reduce call __rsaz_512_reduce
jmp .Lmul_tail
.align 32
.Lmulx:
movq %rdx,%rbp
movq (%rdx),%rdx
call __rsaz_512_mulx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_tail:
addq 64(%rsp),%r8 addq 64(%rsp),%r8
adcq 72(%rsp),%r9 adcq 72(%rsp),%r9
adcq 80(%rsp),%r10 adcq 80(%rsp),%r10
@ -518,6 +819,10 @@ rsaz_512_mul_gather4:
por %xmm9,%xmm8 por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9 pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8 por %xmm9,%xmm8
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx_gather
.byte 102,76,15,126,195 .byte 102,76,15,126,195
movq %r8,128(%rsp) movq %r8,128(%rsp)
@ -698,6 +1003,142 @@ rsaz_512_mul_gather4:
movq 56(%rsp),%r15 movq 56(%rsp),%r15
call __rsaz_512_reduce call __rsaz_512_reduce
jmp .Lmul_gather_tail
.align 32
.Lmulx_gather:
.byte 102,76,15,126,194
movq %r8,128(%rsp)
movq %rdi,128+8(%rsp)
movq %rcx,128+16(%rsp)
mulxq (%rsi),%rbx,%r8
movq %rbx,(%rsp)
xorl %edi,%edi
mulxq 8(%rsi),%rax,%r9
mulxq 16(%rsi),%rbx,%r10
adcxq %rax,%r8
mulxq 24(%rsi),%rax,%r11
adcxq %rbx,%r9
mulxq 32(%rsi),%rbx,%r12
adcxq %rax,%r10
mulxq 40(%rsi),%rax,%r13
adcxq %rbx,%r11
mulxq 48(%rsi),%rbx,%r14
adcxq %rax,%r12
mulxq 56(%rsi),%rax,%r15
adcxq %rbx,%r13
adcxq %rax,%r14
.byte 0x67
movq %r8,%rbx
adcxq %rdi,%r15
movq $-7,%rcx
jmp .Loop_mulx_gather
.align 32
.Loop_mulx_gather:
movdqa 0(%rbp),%xmm8
movdqa 16(%rbp),%xmm9
movdqa 32(%rbp),%xmm10
movdqa 48(%rbp),%xmm11
pand %xmm0,%xmm8
movdqa 64(%rbp),%xmm12
pand %xmm1,%xmm9
movdqa 80(%rbp),%xmm13
pand %xmm2,%xmm10
movdqa 96(%rbp),%xmm14
pand %xmm3,%xmm11
movdqa 112(%rbp),%xmm15
leaq 128(%rbp),%rbp
pand %xmm4,%xmm12
pand %xmm5,%xmm13
pand %xmm6,%xmm14
pand %xmm7,%xmm15
por %xmm10,%xmm8
por %xmm11,%xmm9
por %xmm12,%xmm8
por %xmm13,%xmm9
por %xmm14,%xmm8
por %xmm15,%xmm9
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
.byte 102,76,15,126,194
.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rsi),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rsi),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 32(%rsi),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
adcxq %rax,%r13
.byte 0x67
adoxq %r15,%r14
mulxq 56(%rsi),%rax,%r15
movq %rbx,64(%rsp,%rcx,8)
adcxq %rax,%r14
adoxq %rdi,%r15
movq %r8,%rbx
adcxq %rdi,%r15
incq %rcx
jnz .Loop_mulx_gather
movq %r8,64(%rsp)
movq %r9,64+8(%rsp)
movq %r10,64+16(%rsp)
movq %r11,64+24(%rsp)
movq %r12,64+32(%rsp)
movq %r13,64+40(%rsp)
movq %r14,64+48(%rsp)
movq %r15,64+56(%rsp)
movq 128(%rsp),%rdx
movq 128+8(%rsp),%rdi
movq 128+16(%rsp),%rbp
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_gather_tail:
addq 64(%rsp),%r8 addq 64(%rsp),%r8
adcq 72(%rsp),%r9 adcq 72(%rsp),%r9
adcq 80(%rsp),%r10 adcq 80(%rsp),%r10
@ -742,6 +1183,10 @@ rsaz_512_mul_scatter4:
movq %rcx,128(%rsp) movq %rcx,128(%rsp)
movq %rdi,%rbp movq %rdi,%rbp
movl $0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl $0x80100,%r11d
je .Lmulx_scatter
movq (%rdi),%rbx movq (%rdi),%rbx
call __rsaz_512_mul call __rsaz_512_mul
@ -758,6 +1203,29 @@ rsaz_512_mul_scatter4:
movq 56(%rsp),%r15 movq 56(%rsp),%r15
call __rsaz_512_reduce call __rsaz_512_reduce
jmp .Lmul_scatter_tail
.align 32
.Lmulx_scatter:
movq (%rdi),%rdx
call __rsaz_512_mulx
.byte 102,72,15,126,199
.byte 102,72,15,126,205
movq 128(%rsp),%rdx
movq (%rsp),%r8
movq 8(%rsp),%r9
movq 16(%rsp),%r10
movq 24(%rsp),%r11
movq 32(%rsp),%r12
movq 40(%rsp),%r13
movq 48(%rsp),%r14
movq 56(%rsp),%r15
call __rsaz_512_reducex
.Lmul_scatter_tail:
addq 64(%rsp),%r8 addq 64(%rsp),%r8
adcq 72(%rsp),%r9 adcq 72(%rsp),%r9
adcq 80(%rsp),%r10 adcq 80(%rsp),%r10
@ -804,6 +1272,7 @@ rsaz_512_mul_by_one:
subq $128+24,%rsp subq $128+24,%rsp
.Lmul_by_one_body: .Lmul_by_one_body:
movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp movq %rdx,%rbp
movq %rcx,128(%rsp) movq %rcx,128(%rsp)
@ -824,7 +1293,16 @@ rsaz_512_mul_by_one:
movdqa %xmm0,64(%rsp) movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp) movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp) movdqa %xmm0,96(%rsp)
andl $0x80100,%eax
cmpl $0x80100,%eax
je .Lby_one_callx
call __rsaz_512_reduce call __rsaz_512_reduce
jmp .Lby_one_tail
.align 32
.Lby_one_callx:
movq 128(%rsp),%rdx
call __rsaz_512_reducex
.Lby_one_tail:
movq %r8,(%rdi) movq %r8,(%rdi)
movq %r9,8(%rdi) movq %r9,8(%rdi)
movq %r10,16(%rdi) movq %r10,16(%rdi)
@ -928,6 +1406,62 @@ __rsaz_512_reduce:
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.size __rsaz_512_reduce,.-__rsaz_512_reduce .size __rsaz_512_reduce,.-__rsaz_512_reduce
.type __rsaz_512_reducex,@function
.align 32
__rsaz_512_reducex:
imulq %r8,%rdx
xorq %rsi,%rsi
movl $8,%ecx
jmp .Lreduction_loopx
.align 32
.Lreduction_loopx:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rbx,%rax
adoxq %r9,%r8
mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rbx,%r10
adcxq %rbx,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rbx,%r11
adcxq %rbx,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
movq %rdx,%rax
movq %r8,%rdx
adcxq %rbx,%r11
adoxq %r13,%r12
mulxq 128+8(%rsp),%rbx,%rdx
movq %rax,%rdx
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rbp),%rax,%r15
movq %rbx,%rdx
adcxq %rax,%r14
adoxq %rsi,%r15
adcxq %rsi,%r15
decl %ecx
jne .Lreduction_loopx
.byte 0xf3,0xc3
.size __rsaz_512_reducex,.-__rsaz_512_reducex
.type __rsaz_512_subtract,@function .type __rsaz_512_subtract,@function
.align 32 .align 32
__rsaz_512_subtract: __rsaz_512_subtract:
@ -1127,6 +1661,126 @@ __rsaz_512_mul:
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.size __rsaz_512_mul,.-__rsaz_512_mul .size __rsaz_512_mul,.-__rsaz_512_mul
.type __rsaz_512_mulx,@function
.align 32
__rsaz_512_mulx:
mulxq (%rsi),%rbx,%r8
movq $-6,%rcx
mulxq 8(%rsi),%rax,%r9
movq %rbx,8(%rsp)
mulxq 16(%rsi),%rbx,%r10
adcq %rax,%r8
mulxq 24(%rsi),%rax,%r11
adcq %rbx,%r9
mulxq 32(%rsi),%rbx,%r12
adcq %rax,%r10
mulxq 40(%rsi),%rax,%r13
adcq %rbx,%r11
mulxq 48(%rsi),%rbx,%r14
adcq %rax,%r12
mulxq 56(%rsi),%rax,%r15
movq 8(%rbp),%rdx
adcq %rbx,%r13
adcq %rax,%r14
adcq $0,%r15
xorq %rdi,%rdi
jmp .Loop_mulx
.align 32
.Loop_mulx:
movq %r8,%rbx
mulxq (%rsi),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rsi),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rsi),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rsi),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rsi),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rsi),%rax,%r15
movq 64(%rbp,%rcx,8),%rdx
movq %rbx,8+64-8(%rsp,%rcx,8)
adcxq %rax,%r14
adoxq %rdi,%r15
adcxq %rdi,%r15
incq %rcx
jnz .Loop_mulx
movq %r8,%rbx
mulxq (%rsi),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
adcxq %rax,%r8
adoxq %r10,%r9
.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rsi),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 32(%rsi),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rsi),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
adcxq %rax,%r13
adoxq %r15,%r14
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
adcxq %rax,%r14
adoxq %rdi,%r15
adcxq %rdi,%r15
movq %rbx,8+64-8(%rsp)
movq %r8,8+64(%rsp)
movq %r9,8+64+8(%rsp)
movq %r10,8+64+16(%rsp)
movq %r11,8+64+24(%rsp)
movq %r12,8+64+32(%rsp)
movq %r13,8+64+40(%rsp)
movq %r14,8+64+48(%rsp)
movq %r15,8+64+56(%rsp)
.byte 0xf3,0xc3
.size __rsaz_512_mulx,.-__rsaz_512_mulx
.globl rsaz_512_scatter4 .globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function .type rsaz_512_scatter4,@function
.align 16 .align 16

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -10,14 +10,14 @@ _lazy1:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $1,%rdx movq $1,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *8(%rax) jmp *8(%rax)
_lazy1_end: _lazy1_end:
@ -30,14 +30,14 @@ _lazy2:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $2,%rdx movq $2,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *16(%rax) jmp *16(%rax)
_lazy2_end: _lazy2_end:
@ -50,14 +50,14 @@ _lazy3:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $3,%rdx movq $3,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *24(%rax) jmp *24(%rax)
_lazy3_end: _lazy3_end:
@ -70,14 +70,14 @@ _lazy4:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $4,%rdx movq $4,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *32(%rax) jmp *32(%rax)
_lazy4_end: _lazy4_end:
@ -90,14 +90,14 @@ _lazy5:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $5,%rdx movq $5,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *40(%rax) jmp *40(%rax)
_lazy5_end: _lazy5_end:
@ -110,14 +110,14 @@ _lazy6:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $6,%rdx movq $6,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *48(%rax) jmp *48(%rax)
_lazy6_end: _lazy6_end:
@ -130,14 +130,14 @@ _lazy7:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $7,%rdx movq $7,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *56(%rax) jmp *56(%rax)
_lazy7_end: _lazy7_end:
@ -150,14 +150,14 @@ _lazy8:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $8,%rdx movq $8,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *64(%rax) jmp *64(%rax)
_lazy8_end: _lazy8_end:
@ -170,14 +170,14 @@ _lazy9:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $9,%rdx movq $9,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *72(%rax) jmp *72(%rax)
_lazy9_end: _lazy9_end:
@ -190,14 +190,14 @@ _lazy10:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $10,%rdx movq $10,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *80(%rax) jmp *80(%rax)
_lazy10_end: _lazy10_end:
@ -210,14 +210,14 @@ _lazy11:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $11,%rdx movq $11,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *88(%rax) jmp *88(%rax)
_lazy11_end: _lazy11_end:
@ -230,14 +230,14 @@ _lazy12:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $12,%rdx movq $12,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *96(%rax) jmp *96(%rax)
_lazy12_end: _lazy12_end:
@ -250,14 +250,14 @@ _lazy13:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $13,%rdx movq $13,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *104(%rax) jmp *104(%rax)
_lazy13_end: _lazy13_end:
@ -270,14 +270,14 @@ _lazy14:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $14,%rdx movq $14,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *112(%rax) jmp *112(%rax)
_lazy14_end: _lazy14_end:
@ -290,14 +290,14 @@ _lazy15:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $15,%rdx movq $15,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *120(%rax) jmp *120(%rax)
_lazy15_end: _lazy15_end:
@ -310,14 +310,14 @@ _lazy16:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $16,%rdx movq $16,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *128(%rax) jmp *128(%rax)
_lazy16_end: _lazy16_end:
@ -330,14 +330,14 @@ _lazy17:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $17,%rdx movq $17,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *136(%rax) jmp *136(%rax)
_lazy17_end: _lazy17_end:
@ -350,14 +350,14 @@ _lazy18:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $18,%rdx movq $18,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *144(%rax) jmp *144(%rax)
_lazy18_end: _lazy18_end:
@ -370,14 +370,14 @@ _lazy19:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $19,%rdx movq $19,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *152(%rax) jmp *152(%rax)
_lazy19_end: _lazy19_end:
@ -390,14 +390,14 @@ _lazy20:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $20,%rdx movq $20,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *160(%rax) jmp *160(%rax)
_lazy20_end: _lazy20_end:
@ -410,14 +410,14 @@ _lazy21:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $21,%rdx movq $21,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *168(%rax) jmp *168(%rax)
_lazy21_end: _lazy21_end:
@ -430,14 +430,14 @@ _lazy22:
movq %rdx,56(%rsp) movq %rdx,56(%rsp)
movq %r8,64(%rsp) movq %r8,64(%rsp)
movq %r9,72(%rsp) movq %r9,72(%rsp)
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rcx leaq OPENSSL_UplinkTable(%rip),%rcx
movq $22,%rdx movq $22,%rdx
call OPENSSL_Uplink call OPENSSL_Uplink
movq 48(%rsp),%rcx movq 48(%rsp),%rcx
movq 56(%rsp),%rdx movq 56(%rsp),%rdx
movq 64(%rsp),%r8 movq 64(%rsp),%r8
movq 72(%rsp),%r9 movq 72(%rsp),%r9
leaq OPENSSL_UplinkTable@GOTPCREL(%rip),%rax leaq OPENSSL_UplinkTable(%rip),%rax
addq $40,%rsp addq $40,%rsp
jmp *176(%rax) jmp *176(%rax)
_lazy22_end: _lazy22_end:

View File

@ -202,8 +202,9 @@ _mul_1x1:
.type bn_GF2m_mul_2x2,@function .type bn_GF2m_mul_2x2,@function
.align 16 .align 16
bn_GF2m_mul_2x2: bn_GF2m_mul_2x2:
movq OPENSSL_ia32cap_P@GOTPCREL(%rip),%rax movq OPENSSL_ia32cap_P+4(%rip),%rax
btq $33,%rax btq $1,%rax
jnc .Lvanilla_mul_2x2 jnc .Lvanilla_mul_2x2
.byte 102,72,15,110,198 .byte 102,72,15,110,198

View File

@ -13,6 +13,7 @@ bn_mul_mont:
jnz .Lmul_enter jnz .Lmul_enter
cmpl $8,%r9d cmpl $8,%r9d
jb .Lmul_enter jb .Lmul_enter
movl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpq %rsi,%rdx cmpq %rsi,%rdx
jne .Lmul4x_enter jne .Lmul4x_enter
testl $7,%r9d testl $7,%r9d
@ -241,6 +242,9 @@ bn_mul4x_mont:
movl %r9d,%r9d movl %r9d,%r9d
movq %rsp,%rax movq %rsp,%rax
.Lmul4x_enter: .Lmul4x_enter:
andl $0x80100,%r11d
cmpl $0x80100,%r11d
je .Lmulx4x_enter
pushq %rbx pushq %rbx
pushq %rbp pushq %rbp
pushq %r12 pushq %r12
@ -651,6 +655,7 @@ bn_mul4x_mont:
.size bn_mul4x_mont,.-bn_mul4x_mont .size bn_mul4x_mont,.-bn_mul4x_mont
.type bn_sqr8x_mont,@function .type bn_sqr8x_mont,@function
.align 32 .align 32
bn_sqr8x_mont: bn_sqr8x_mont:
@ -723,6 +728,25 @@ bn_sqr8x_mont:
pxor %xmm0,%xmm0 pxor %xmm0,%xmm0
.byte 102,72,15,110,207 .byte 102,72,15,110,207
.byte 102,73,15,110,218 .byte 102,73,15,110,218
movl OPENSSL_ia32cap_P+8(%rip),%eax
andl $0x80100,%eax
cmpl $0x80100,%eax
jne .Lsqr8x_nox
call bn_sqrx8x_internal
leaq (%r8,%rcx,1),%rbx
movq %rcx,%r9
movq %rcx,%rdx
.byte 102,72,15,126,207
sarq $3+2,%rcx
jmp .Lsqr8x_sub
.align 32
.Lsqr8x_nox:
call bn_sqr8x_internal call bn_sqr8x_internal
@ -801,5 +825,344 @@ bn_sqr8x_mont:
.Lsqr8x_epilogue: .Lsqr8x_epilogue:
.byte 0xf3,0xc3 .byte 0xf3,0xc3
.size bn_sqr8x_mont,.-bn_sqr8x_mont .size bn_sqr8x_mont,.-bn_sqr8x_mont
.type bn_mulx4x_mont,@function
.align 32
bn_mulx4x_mont:
movq %rsp,%rax
.Lmulx4x_enter:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
.Lmulx4x_prologue:
shll $3,%r9d
xorq %r10,%r10
subq %r9,%r10
movq (%r8),%r8
leaq -72(%rsp,%r10,1),%rbp
andq $-128,%rbp
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lmulx4x_page_walk
jmp .Lmulx4x_page_walk_done
.align 16
.Lmulx4x_page_walk:
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:
leaq (%rdx,%r9,1),%r10
movq %r9,0(%rsp)
shrq $5,%r9
movq %r10,16(%rsp)
subq $1,%r9
movq %r8,24(%rsp)
movq %rdi,32(%rsp)
movq %rax,40(%rsp)
movq %r9,48(%rsp)
jmp .Lmulx4x_body
.align 32
.Lmulx4x_body:
leaq 8(%rdx),%rdi
movq (%rdx),%rdx
leaq 64+32(%rsp),%rbx
movq %rdx,%r9
mulxq 0(%rsi),%r8,%rax
mulxq 8(%rsi),%r11,%r14
addq %rax,%r11
movq %rdi,8(%rsp)
mulxq 16(%rsi),%r12,%r13
adcq %r14,%r12
adcq $0,%r13
movq %r8,%rdi
imulq 24(%rsp),%r8
xorq %rbp,%rbp
mulxq 24(%rsi),%rax,%r14
movq %r8,%rdx
leaq 32(%rsi),%rsi
adcxq %rax,%r13
adcxq %rbp,%r14
mulxq 0(%rcx),%rax,%r10
adcxq %rax,%rdi
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
movq 48(%rsp),%rdi
movq %r10,-32(%rbx)
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq %r11,-24(%rbx)
adcxq %rax,%r12
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r12,-16(%rbx)
jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
adcxq %rbp,%r15
mulxq 0(%rsi),%r10,%rax
adcxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
.byte 0x67,0x67
movq %r8,%rdx
adcxq %rax,%r13
adcxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx
adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
movq %r11,-32(%rbx)
adoxq %r15,%r13
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r13,-16(%rbx)
decq %rdi
jnz .Lmulx4x_1st
movq 0(%rsp),%rax
movq 8(%rsp),%rdi
adcq %rbp,%r15
addq %r15,%r14
sbbq %r15,%r15
movq %r14,-8(%rbx)
jmp .Lmulx4x_outer
.align 32
.Lmulx4x_outer:
movq (%rdi),%rdx
leaq 8(%rdi),%rdi
subq %rax,%rsi
movq %r15,(%rbx)
leaq 64+32(%rsp),%rbx
subq %rax,%rcx
mulxq 0(%rsi),%r8,%r11
xorl %ebp,%ebp
movq %rdx,%r9
mulxq 8(%rsi),%r14,%r12
adoxq -32(%rbx),%r8
adcxq %r14,%r11
mulxq 16(%rsi),%r15,%r13
adoxq -24(%rbx),%r11
adcxq %r15,%r12
adoxq %rbp,%r12
adcxq %rbp,%r13
movq %rdi,8(%rsp)
.byte 0x67
movq %r8,%r15
imulq 24(%rsp),%r8
xorl %ebp,%ebp
mulxq 24(%rsi),%rax,%r14
movq %r8,%rdx
adoxq -16(%rbx),%r12
adcxq %rax,%r13
adoxq -8(%rbx),%r13
adcxq %rbp,%r14
leaq 32(%rsi),%rsi
adoxq %rbp,%r14
mulxq 0(%rcx),%rax,%r10
adcxq %rax,%r15
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 16(%rcx),%rax,%r12
movq %r10,-32(%rbx)
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq %r11,-24(%rbx)
leaq 32(%rcx),%rcx
adcxq %rax,%r12
adoxq %rbp,%r15
movq 48(%rsp),%rdi
movq %r12,-16(%rbx)
jmp .Lmulx4x_inner
.align 32
.Lmulx4x_inner:
mulxq 0(%rsi),%r10,%rax
adcxq %rbp,%r15
adoxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq 0(%rbx),%r10
adoxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq 8(%rbx),%r11
adoxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
movq %r8,%rdx
adcxq 16(%rbx),%r12
adoxq %rax,%r13
adcxq 24(%rbx),%r13
adoxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx
adcxq %rbp,%r14
adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
adoxq %r15,%r13
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq %r11,-32(%rbx)
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r13,-16(%rbx)
decq %rdi
jnz .Lmulx4x_inner
movq 0(%rsp),%rax
movq 8(%rsp),%rdi
adcq %rbp,%r15
subq 0(%rbx),%rbp
adcq %r15,%r14
sbbq %r15,%r15
movq %r14,-8(%rbx)
cmpq 16(%rsp),%rdi
jne .Lmulx4x_outer
leaq 64(%rsp),%rbx
subq %rax,%rcx
negq %r15
movq %rax,%rdx
shrq $3+2,%rax
movq 32(%rsp),%rdi
jmp .Lmulx4x_sub
.align 32
.Lmulx4x_sub:
movq 0(%rbx),%r11
movq 8(%rbx),%r12
movq 16(%rbx),%r13
movq 24(%rbx),%r14
leaq 32(%rbx),%rbx
sbbq 0(%rcx),%r11
sbbq 8(%rcx),%r12
sbbq 16(%rcx),%r13
sbbq 24(%rcx),%r14
leaq 32(%rcx),%rcx
movq %r11,0(%rdi)
movq %r12,8(%rdi)
movq %r13,16(%rdi)
movq %r14,24(%rdi)
leaq 32(%rdi),%rdi
decq %rax
jnz .Lmulx4x_sub
sbbq $0,%r15
leaq 64(%rsp),%rbx
subq %rdx,%rdi
.byte 102,73,15,110,207
pxor %xmm0,%xmm0
pshufd $0,%xmm1,%xmm1
movq 40(%rsp),%rsi
jmp .Lmulx4x_cond_copy
.align 32
.Lmulx4x_cond_copy:
movdqa 0(%rbx),%xmm2
movdqa 16(%rbx),%xmm3
leaq 32(%rbx),%rbx
movdqu 0(%rdi),%xmm4
movdqu 16(%rdi),%xmm5
leaq 32(%rdi),%rdi
movdqa %xmm0,-32(%rbx)
movdqa %xmm0,-16(%rbx)
pcmpeqd %xmm1,%xmm0
pand %xmm1,%xmm2
pand %xmm1,%xmm3
pand %xmm0,%xmm4
pand %xmm0,%xmm5
pxor %xmm0,%xmm0
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqu %xmm4,-32(%rdi)
movdqu %xmm5,-16(%rdi)
subq $32,%rdx
jnz .Lmulx4x_cond_copy
movq %rdx,(%rbx)
movq $1,%rax
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
movq -24(%rsi),%r12
movq -16(%rsi),%rbp
movq -8(%rsi),%rbx
leaq (%rsi),%rsp
.Lmulx4x_epilogue:
.byte 0xf3,0xc3
.size bn_mulx4x_mont,.-bn_mulx4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16 .align 16

File diff suppressed because it is too large Load Diff