Find CPU-specific variants of the long number routines. Regenerate.

joerg 2015-05-16 22:23:31 +00:00
parent df78cf4a7d
commit d555156c33
4 changed files with 2454 additions and 1 deletion

@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.7 2015/05/16 19:08:37 joerg Exp $
+# $NetBSD: Makefile,v 1.8 2015/05/16 22:23:31 joerg Exp $
 .include "bsd.own.mk"
@@ -11,6 +11,7 @@ CC+= -fno-integrated-as
 regen:
 	for i in $$(find ${OPENSSLSRC} -name \*${MACHINE_ARCH}.pl) \
+	    $$(find ${OPENSSLSRC}/crypto/bn/asm -name ${MACHINE_ARCH}-\*.pl) \
 	    ${OPENSSLSRC}/crypto/${MACHINE_ARCH}cpuid.pl ; do \
 		(echo "#include <machine/asm.h>"; CC=${CC:Q} perl $$i elf | sed \
 		    -e 's/\(OPENSSL[A-Za-z0-9_+]*\)(%rip)/\1@GOTPCREL(%rip)/' \

@ -0,0 +1,292 @@
#include <machine/asm.h>
.text
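/* _mul_1x1: carry-less (GF(2)[x]) 64x64 -> 128-bit multiply of %rax by %rbp,
   consumed in 4-bit nibbles against a 16-entry window table built on the
   stack frame (%r8 holds the nibble mask 15, preset by the caller); the
   product is returned high:low in %rdx:%rax. */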
.type _mul_1x1,@function
.align 16
_mul_1x1:
subq $128+8,%rsp
movq $-1,%r9
leaq (%rax,%rax,1),%rsi
shrq $3,%r9
leaq (,%rax,4),%rdi
andq %rax,%r9
leaq (,%rax,8),%r12
sarq $63,%rax
leaq (%r9,%r9,1),%r10
sarq $63,%rsi
leaq (,%r9,4),%r11
andq %rbp,%rax
sarq $63,%rdi
movq %rax,%rdx
shlq $63,%rax
andq %rbp,%rsi
shrq $1,%rdx
movq %rsi,%rcx
shlq $62,%rsi
andq %rbp,%rdi
shrq $2,%rcx
xorq %rsi,%rax
movq %rdi,%rbx
shlq $61,%rdi
xorq %rcx,%rdx
shrq $3,%rbx
xorq %rdi,%rax
xorq %rbx,%rdx
movq %r9,%r13
movq $0,0(%rsp)
xorq %r10,%r13
movq %r9,8(%rsp)
movq %r11,%r14
movq %r10,16(%rsp)
xorq %r12,%r14
movq %r13,24(%rsp)
xorq %r11,%r9
movq %r11,32(%rsp)
xorq %r11,%r10
movq %r9,40(%rsp)
xorq %r11,%r13
movq %r10,48(%rsp)
xorq %r14,%r9
movq %r13,56(%rsp)
xorq %r14,%r10
movq %r12,64(%rsp)
xorq %r14,%r13
movq %r9,72(%rsp)
xorq %r11,%r9
movq %r10,80(%rsp)
xorq %r11,%r10
movq %r13,88(%rsp)
xorq %r11,%r13
movq %r14,96(%rsp)
movq %r8,%rsi
movq %r9,104(%rsp)
andq %rbp,%rsi
movq %r10,112(%rsp)
shrq $4,%rbp
movq %r13,120(%rsp)
movq %r8,%rdi
andq %rbp,%rdi
shrq $4,%rbp
movq (%rsp,%rsi,8),%xmm0
movq %r8,%rsi
andq %rbp,%rsi
shrq $4,%rbp
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $4,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $60,%rbx
xorq %rcx,%rax
pslldq $1,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $12,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $52,%rbx
xorq %rcx,%rax
pslldq $2,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $20,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $44,%rbx
xorq %rcx,%rax
pslldq $3,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $28,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $36,%rbx
xorq %rcx,%rax
pslldq $4,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $36,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $28,%rbx
xorq %rcx,%rax
pslldq $5,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $44,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $20,%rbx
xorq %rcx,%rax
pslldq $6,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %r8,%rdi
movq %rcx,%rbx
shlq $52,%rcx
andq %rbp,%rdi
movq (%rsp,%rsi,8),%xmm1
shrq $12,%rbx
xorq %rcx,%rax
pslldq $7,%xmm1
movq %r8,%rsi
shrq $4,%rbp
xorq %rbx,%rdx
andq %rbp,%rsi
shrq $4,%rbp
pxor %xmm1,%xmm0
movq (%rsp,%rdi,8),%rcx
movq %rcx,%rbx
shlq $60,%rcx
.byte 102,72,15,126,198
shrq $4,%rbx
xorq %rcx,%rax
psrldq $8,%xmm0
xorq %rbx,%rdx
.byte 102,72,15,126,199
xorq %rsi,%rax
xorq %rdi,%rdx
addq $128+8,%rsp
.byte 0xf3,0xc3
.Lend_mul_1x1:
.size _mul_1x1,.-_mul_1x1
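/* bn_GF2m_mul_2x2(r, a1, a0, b1, b0): 128x128 -> 256-bit GF(2)[x] product.
   Uses PCLMULQDQ when OPENSSL_ia32cap_P bit 33 advertises it, otherwise
   falls back to three _mul_1x1 calls combined Karatsuba-style. */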
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,@function
.align 16
bn_GF2m_mul_2x2:
movq OPENSSL_ia32cap_P@GOTPCREL(%rip),%rax
btq $33,%rax
jnc .Lvanilla_mul_2x2
.byte 102,72,15,110,198
.byte 102,72,15,110,201
.byte 102,72,15,110,210
.byte 102,73,15,110,216
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
.byte 102,15,58,68,193,0
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
.byte 102,15,58,68,211,0
.byte 102,15,58,68,229,0
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4
movdqa %xmm4,%xmm5
pslldq $8,%xmm4
psrldq $8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0(%rdi)
movdqu %xmm0,16(%rdi)
.byte 0xf3,0xc3
.align 16
.Lvanilla_mul_2x2:
leaq -136(%rsp),%rsp
movq %r14,80(%rsp)
movq %r13,88(%rsp)
movq %r12,96(%rsp)
movq %rbp,104(%rsp)
movq %rbx,112(%rsp)
.Lbody_mul_2x2:
movq %rdi,32(%rsp)
movq %rsi,40(%rsp)
movq %rdx,48(%rsp)
movq %rcx,56(%rsp)
movq %r8,64(%rsp)
movq $15,%r8
movq %rsi,%rax
movq %rcx,%rbp
call _mul_1x1
movq %rax,16(%rsp)
movq %rdx,24(%rsp)
movq 48(%rsp),%rax
movq 64(%rsp),%rbp
call _mul_1x1
movq %rax,0(%rsp)
movq %rdx,8(%rsp)
movq 40(%rsp),%rax
movq 56(%rsp),%rbp
xorq 48(%rsp),%rax
xorq 64(%rsp),%rbp
call _mul_1x1
movq 0(%rsp),%rbx
movq 8(%rsp),%rcx
movq 16(%rsp),%rdi
movq 24(%rsp),%rsi
movq 32(%rsp),%rbp
xorq %rdx,%rax
xorq %rcx,%rdx
xorq %rbx,%rax
movq %rbx,0(%rbp)
xorq %rdi,%rdx
movq %rsi,24(%rbp)
xorq %rsi,%rax
xorq %rsi,%rdx
xorq %rdx,%rax
movq %rdx,16(%rbp)
movq %rax,8(%rbp)
movq 80(%rsp),%r14
movq 88(%rsp),%r13
movq 96(%rsp),%r12
movq 104(%rsp),%rbp
movq 112(%rsp),%rbx
leaq 136(%rsp),%rsp
.byte 0xf3,0xc3
.Lend_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.byte 71,70,40,50,94,109,41,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
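
The vanilla path above computes a carry-less product over GF(2)[x]: _mul_1x1 multiplies two 64-bit polynomials, and bn_GF2m_mul_2x2 combines three such products Karatsuba-style. A minimal C sketch of the same arithmetic, assuming the (r, a1, a0, b1, b0) argument order visible in the register usage; all names here are illustrative, not OpenSSL's:

#include <stdint.h>

/* 64x64 -> 128-bit carry-less multiply; low half to *lo, high half to *hi.
 * Plain shift-and-XOR in place of the 16-entry nibble table the assembly
 * builds on its stack frame. */
static void mul_1x1(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	uint64_t l = 0, h = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			h ^= i ? a >> (64 - i) : 0;
		}
	}
	*lo = l;
	*hi = h;
}

/* 128x128 -> 256-bit product over GF(2)[x], r[0] least significant, via
 * three 1x1 multiplies combined Karatsuba-style -- the same structure as
 * the .Lvanilla_mul_2x2 path above. */
void gf2m_mul_2x2(uint64_t r[4], uint64_t a1, uint64_t a0,
    uint64_t b1, uint64_t b0)
{
	uint64_t lo0, hi0, lo1, hi1, lom, him;

	mul_1x1(a0, b0, &lo0, &hi0);		/* low word product */
	mul_1x1(a1, b1, &lo1, &hi1);		/* high word product */
	mul_1x1(a0 ^ a1, b0 ^ b1, &lom, &him);	/* middle term */

	lom ^= lo0 ^ lo1;			/* XOR is add/sub in GF(2) */
	him ^= hi0 ^ hi1;

	r[0] = lo0;
	r[1] = hi0 ^ lom;
	r[2] = lo1 ^ him;
	r[3] = hi1;
}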

File diff suppressed because it is too large

@ -0,0 +1,785 @@
#include <machine/asm.h>
.text
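/* bn_mul_mont_gather5: Montgomery multiplication with the multiplier
   operand fetched from a scattered table through the same masked SSE2
   gather as bn_gather5 below, so the table lookup leaves no
   index-dependent cache trace. */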
.globl bn_mul_mont_gather5
.type bn_mul_mont_gather5,@function
.align 64
bn_mul_mont_gather5:
testl $3,%r9d
jnz .Lmul_enter
cmpl $8,%r9d
jb .Lmul_enter
jmp .Lmul4x_enter
.align 16
.Lmul_enter:
movl %r9d,%r9d
movl 8(%rsp),%r10d
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %rsp,%rax
leaq 2(%r9),%r11
negq %r11
leaq (%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul_body:
movq %rdx,%r12
movq %r10,%r11
shrq $3,%r10
andq $7,%r11
notq %r10
leaq .Lmagic_masks(%rip),%rax
andq $3,%r10
leaq 96(%r12,%r11,8),%r12
movq 0(%rax,%r10,8),%xmm4
movq 8(%rax,%r10,8),%xmm5
movq 16(%rax,%r10,8),%xmm6
movq 24(%rax,%r10,8),%xmm7
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
.byte 102,72,15,126,195
movq (%r8),%r8
movq (%rsi),%rax
xorq %r14,%r14
xorq %r15,%r15
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%r13
leaq 1(%r15),%r15
jmp .L1st_enter
.align 16
.L1st:
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%r13
movq %r10,%r11
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
.L1st_enter:
mulq %rbx
addq %rax,%r11
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
leaq 1(%r15),%r15
movq %rdx,%r10
mulq %rbp
cmpq %r9,%r15
jne .L1st
.byte 102,72,15,126,195
addq %rax,%r13
movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
movq %r10,%r11
xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)
leaq 1(%r14),%r14
jmp .Louter
.align 16
.Louter:
xorq %r15,%r15
movq %r8,%rbp
movq (%rsp),%r10
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq 8(%rsp),%r10
movq %rdx,%r13
leaq 1(%r15),%r15
jmp .Linner_enter
.align 16
.Linner:
addq %rax,%r13
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
.Linner_enter:
mulq %rbx
addq %rax,%r11
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
addq %r11,%r10
movq %rdx,%r11
adcq $0,%r11
leaq 1(%r15),%r15
mulq %rbp
cmpq %r9,%r15
jne .Linner
.byte 102,72,15,126,195
addq %rax,%r13
movq (%rsi),%rax
adcq $0,%rdx
addq %r10,%r13
movq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %r13,-16(%rsp,%r15,8)
movq %rdx,%r13
xorq %rdx,%rdx
addq %r11,%r13
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r9,8)
movq %rdx,(%rsp,%r9,8)
leaq 1(%r14),%r14
cmpq %r9,%r14
jl .Louter
xorq %r14,%r14
movq (%rsp),%rax
leaq (%rsp),%rsi
movq %r9,%r15
jmp .Lsub
.align 16
.Lsub: sbbq (%rcx,%r14,8),%rax
movq %rax,(%rdi,%r14,8)
movq 8(%rsi,%r14,8),%rax
leaq 1(%r14),%r14
decq %r15
jnz .Lsub
sbbq $0,%rax
xorq %r14,%r14
andq %rax,%rsi
notq %rax
movq %rdi,%rcx
andq %rax,%rcx
movq %r9,%r15
orq %rcx,%rsi
.align 16
.Lcopy:
movq (%rsi,%r14,8),%rax
movq %r14,(%rsp,%r14,8)
movq %rax,(%rdi,%r14,8)
leaq 1(%r14),%r14
subq $1,%r15
jnz .Lcopy
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
movq (%rsi),%r15
movq 8(%rsi),%r14
movq 16(%rsi),%r13
movq 24(%rsi),%r12
movq 32(%rsi),%rbp
movq 40(%rsi),%rbx
leaq 48(%rsi),%rsp
.Lmul_epilogue:
.byte 0xf3,0xc3
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
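/* bn_mul4x_mont_gather5: 4-way unrolled variant, entered from above when
   the modulus length is a multiple of 4 and at least 8 words. */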
.type bn_mul4x_mont_gather5,@function
.align 16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
movl %r9d,%r9d
movl 8(%rsp),%r10d
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %rsp,%rax
leaq 4(%r9),%r11
negq %r11
leaq (%rsp,%r11,8),%rsp
andq $-1024,%rsp
movq %rax,8(%rsp,%r9,8)
.Lmul4x_body:
movq %rdi,16(%rsp,%r9,8)
movq %rdx,%r12
movq %r10,%r11
shrq $3,%r10
andq $7,%r11
notq %r10
leaq .Lmagic_masks(%rip),%rax
andq $3,%r10
leaq 96(%r12,%r11,8),%r12
movq 0(%rax,%r10,8),%xmm4
movq 8(%rax,%r10,8),%xmm5
movq 16(%rax,%r10,8),%xmm6
movq 24(%rax,%r10,8),%xmm7
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
.byte 102,72,15,126,195
movq (%r8),%r8
movq (%rsi),%rax
xorq %r14,%r14
xorq %r15,%r15
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq %r8,%rbp
mulq %rbx
movq %rax,%r10
movq (%rcx),%rax
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 4(%r15),%r15
adcq $0,%rdx
movq %rdi,(%rsp)
movq %rdx,%r13
jmp .L1st4x
.align 16
.L1st4x:
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-16(%rsp,%r15,8)
movq %rdx,%r13
mulq %rbx
addq %rax,%r10
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq 8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-8(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx,%r15,8),%rax
adcq $0,%rdx
leaq 4(%r15),%r15
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq -16(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-32(%rsp,%r15,8)
movq %rdx,%r13
cmpq %r9,%r15
jl .L1st4x
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %rdi,-16(%rsp,%r15,8)
movq %rdx,%r13
.byte 102,72,15,126,195
xorq %rdi,%rdi
addq %r10,%r13
adcq $0,%rdi
movq %r13,-8(%rsp,%r15,8)
movq %rdi,(%rsp,%r15,8)
leaq 1(%r14),%r14
.align 4
.Louter4x:
xorq %r15,%r15
movq -96(%r12),%xmm0
movq -32(%r12),%xmm1
pand %xmm4,%xmm0
movq 32(%r12),%xmm2
pand %xmm5,%xmm1
movq (%rsp),%r10
movq %r8,%rbp
mulq %rbx
addq %rax,%r10
movq (%rcx),%rax
adcq $0,%rdx
movq 96(%r12),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
imulq %r10,%rbp
movq %rdx,%r11
por %xmm2,%xmm0
leaq 256(%r12),%r12
por %xmm3,%xmm0
mulq %rbp
addq %rax,%r10
movq 8(%rsi),%rax
adcq $0,%rdx
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx),%rax
adcq $0,%rdx
addq 8(%rsp),%r11
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq 16(%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
leaq 4(%r15),%r15
adcq $0,%rdx
movq %rdx,%r13
jmp .Linner4x
.align 16
.Linner4x:
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -16(%rsp,%r15,8),%r10
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-32(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -8(%rsp,%r15,8),%r11
adcq $0,%rdx
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%r13
mulq %rbx
addq %rax,%r10
movq (%rcx,%r15,8),%rax
adcq $0,%rdx
addq (%rsp,%r15,8),%r10
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq 8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-16(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq 8(%rcx,%r15,8),%rax
adcq $0,%rdx
addq 8(%rsp,%r15,8),%r11
adcq $0,%rdx
leaq 4(%r15),%r15
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq -16(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-40(%rsp,%r15,8)
movq %rdx,%r13
cmpq %r9,%r15
jl .Linner4x
mulq %rbx
addq %rax,%r10
movq -16(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -16(%rsp,%r15,8),%r10
adcq $0,%rdx
movq %rdx,%r11
mulq %rbp
addq %rax,%r13
movq -8(%rsi,%r15,8),%rax
adcq $0,%rdx
addq %r10,%r13
adcq $0,%rdx
movq %rdi,-32(%rsp,%r15,8)
movq %rdx,%rdi
mulq %rbx
addq %rax,%r11
movq -8(%rcx,%r15,8),%rax
adcq $0,%rdx
addq -8(%rsp,%r15,8),%r11
adcq $0,%rdx
leaq 1(%r14),%r14
movq %rdx,%r10
mulq %rbp
addq %rax,%rdi
movq (%rsi),%rax
adcq $0,%rdx
addq %r11,%rdi
adcq $0,%rdx
movq %r13,-24(%rsp,%r15,8)
movq %rdx,%r13
.byte 102,72,15,126,195
movq %rdi,-16(%rsp,%r15,8)
xorq %rdi,%rdi
addq %r10,%r13
adcq $0,%rdi
addq (%rsp,%r9,8),%r13
adcq $0,%rdi
movq %r13,-8(%rsp,%r15,8)
movq %rdi,(%rsp,%r15,8)
cmpq %r9,%r14
jl .Louter4x
movq 16(%rsp,%r9,8),%rdi
movq 0(%rsp),%rax
pxor %xmm0,%xmm0
movq 8(%rsp),%rdx
shrq $2,%r9
leaq (%rsp),%rsi
xorq %r14,%r14
subq 0(%rcx),%rax
movq 16(%rsi),%rbx
movq 24(%rsi),%rbp
sbbq 8(%rcx),%rdx
leaq -1(%r9),%r15
jmp .Lsub4x
.align 16
.Lsub4x:
movq %rax,0(%rdi,%r14,8)
movq %rdx,8(%rdi,%r14,8)
sbbq 16(%rcx,%r14,8),%rbx
movq 32(%rsi,%r14,8),%rax
movq 40(%rsi,%r14,8),%rdx
sbbq 24(%rcx,%r14,8),%rbp
movq %rbx,16(%rdi,%r14,8)
movq %rbp,24(%rdi,%r14,8)
sbbq 32(%rcx,%r14,8),%rax
movq 48(%rsi,%r14,8),%rbx
movq 56(%rsi,%r14,8),%rbp
sbbq 40(%rcx,%r14,8),%rdx
leaq 4(%r14),%r14
decq %r15
jnz .Lsub4x
movq %rax,0(%rdi,%r14,8)
movq 32(%rsi,%r14,8),%rax
sbbq 16(%rcx,%r14,8),%rbx
movq %rdx,8(%rdi,%r14,8)
sbbq 24(%rcx,%r14,8),%rbp
movq %rbx,16(%rdi,%r14,8)
sbbq $0,%rax
movq %rbp,24(%rdi,%r14,8)
xorq %r14,%r14
andq %rax,%rsi
notq %rax
movq %rdi,%rcx
andq %rax,%rcx
leaq -1(%r9),%r15
orq %rcx,%rsi
movdqu (%rsi),%xmm1
movdqa %xmm0,(%rsp)
movdqu %xmm1,(%rdi)
jmp .Lcopy4x
.align 16
.Lcopy4x:
movdqu 16(%rsi,%r14,1),%xmm2
movdqu 32(%rsi,%r14,1),%xmm1
movdqa %xmm0,16(%rsp,%r14,1)
movdqu %xmm2,16(%rdi,%r14,1)
movdqa %xmm0,32(%rsp,%r14,1)
movdqu %xmm1,32(%rdi,%r14,1)
leaq 32(%r14),%r14
decq %r15
jnz .Lcopy4x
shlq $2,%r9
movdqu 16(%rsi,%r14,1),%xmm2
movdqa %xmm0,16(%rsp,%r14,1)
movdqu %xmm2,16(%rdi,%r14,1)
movq 8(%rsp,%r9,8),%rsi
movq $1,%rax
movq (%rsi),%r15
movq 8(%rsi),%r14
movq 16(%rsi),%r13
movq 24(%rsi),%r12
movq 32(%rsi),%rbp
movq 40(%rsi),%rbx
leaq 48(%rsi),%rsp
.Lmul4x_epilogue:
.byte 0xf3,0xc3
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
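/* bn_scatter5(inp, num, table, power): store the num-word entry `power`
   into the table at a 256-byte stride (word j goes to table + power*8
   + j*256). */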
.globl bn_scatter5
.type bn_scatter5,@function
.align 16
bn_scatter5:
cmpq $0,%rsi
jz .Lscatter_epilogue
leaq (%rdx,%rcx,8),%rdx
.Lscatter:
movq (%rdi),%rax
leaq 8(%rdi),%rdi
movq %rax,(%rdx)
leaq 256(%rdx),%rdx
subq $1,%rsi
jnz .Lscatter
.Lscatter_epilogue:
.byte 0xf3,0xc3
.size bn_scatter5,.-bn_scatter5
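/* bn_gather5(out, num, table, power): constant-time inverse of the above;
   all four cache lines of each 256-byte row are read and the wanted words
   selected with pand/por masks, so the cache-line access pattern is
   independent of `power`. */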
.globl bn_gather5
.type bn_gather5,@function
.align 16
bn_gather5:
movq %rcx,%r11
shrq $3,%rcx
andq $7,%r11
notq %rcx
leaq .Lmagic_masks(%rip),%rax
andq $3,%rcx
leaq 96(%rdx,%r11,8),%rdx
movq 0(%rax,%rcx,8),%xmm4
movq 8(%rax,%rcx,8),%xmm5
movq 16(%rax,%rcx,8),%xmm6
movq 24(%rax,%rcx,8),%xmm7
jmp .Lgather
.align 16
.Lgather:
movq -96(%rdx),%xmm0
movq -32(%rdx),%xmm1
pand %xmm4,%xmm0
movq 32(%rdx),%xmm2
pand %xmm5,%xmm1
movq 96(%rdx),%xmm3
pand %xmm6,%xmm2
por %xmm1,%xmm0
pand %xmm7,%xmm3
por %xmm2,%xmm0
leaq 256(%rdx),%rdx
por %xmm3,%xmm0
movq %xmm0,(%rdi)
leaq 8(%rdi),%rdi
subq $1,%rsi
jnz .Lgather
.byte 0xf3,0xc3
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5
.align 64
.Lmagic_masks:
.long 0,0, 0,0, 0,0, -1,-1
.long 0,0, 0,0, 0,0, 0,0
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
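
bn_scatter5 and bn_gather5 above are the cache-timing countermeasure behind the _gather5 entry points: table entries are stored at a 256-byte stride, and the gather touches a fixed set of cache lines per row, selecting the wanted words with pand/por so the access pattern never reveals the secret window index. A minimal scalar C sketch of the same idea; the argument order follows my reading of the register usage, and the names are illustrative:

#include <stddef.h>
#include <stdint.h>

/* Word j of entry `power` lives at table[j*32 + power]: the 32 window
 * entries are interleaved so that one 256-byte stride steps from word j
 * to word j+1 of the same entry -- mirroring bn_scatter5 above. */
void scatter5(const uint64_t *inp, size_t num, uint64_t *table, size_t power)
{
	size_t j;

	for (j = 0; j < num; j++)
		table[j * 32 + power] = inp[j];
}

/* Constant-time gather: every candidate word is read and masked, so the
 * access pattern does not depend on the secret index.  The assembly gets
 * the same effect more cheaply, reading one candidate per cache line of
 * each row and selecting with pand/por. */
void gather5(uint64_t *out, size_t num, const uint64_t *table, size_t power)
{
	size_t i, j;

	for (j = 0; j < num; j++) {
		uint64_t w = 0;

		for (i = 0; i < 32; i++) {
			/* all-ones mask iff this is the wanted entry */
			uint64_t mask = (uint64_t)0 - (uint64_t)(i == power);

			w |= table[j * 32 + i] & mask;
		}
		out[j] = w;
	}
}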