Optimize i386 support in libcrypto:

- Enable optional SSE2 assembler versions. Regenerate.
- Hook up assembler version of GCM.
This commit is contained in:
joerg 2015-05-16 17:32:54 +00:00
parent 74ab1f3972
commit c70ac7a37b
7 changed files with 2850 additions and 612 deletions

View File

@ -1,4 +1,4 @@
# $NetBSD: Makefile,v 1.6 2012/07/30 10:25:24 christos Exp $
# $NetBSD: Makefile,v 1.7 2015/05/16 17:32:54 joerg Exp $
.include "bsd.own.mk"
@ -9,7 +9,7 @@ regen:
for i in $$(find ${OPENSSLSRC} -name \*86.pl) \
${OPENSSLSRC}/crypto/x86cpuid.pl; do \
perl -I${OPENSSLSRC}/crypto/perlasm \
-I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC \
-I${OPENSSLSRC}/crypto/bn/asm $$i elf -fPIC -DOPENSSL_IA32_SSE2 \
| sed -e 's,^\.file.*$$,#include <machine/asm.h>,' \
-e 's/ call OPENSSL_cpuid_setup/ PIC_PROLOGUE! call PIC_PLT(OPENSSL_cpuid_setup)! PIC_EPILOGUE/' | tr '!' '\n' \
> $$(basename $$i .pl).S; \

View File

@ -5,6 +5,103 @@
.align 16
/*
 * bn_mul_add_words -- entry point and SSE2 fast path.
 *
 * Reads four 32-bit stack arguments (cdecl layout):
 *    4(%esp) -> %eax  destination pointer, read and written one dword at a time
 *    8(%esp) -> %edx  source pointer, read one dword at a time
 *   12(%esp) -> %ecx  dword count
 *   16(%esp) -> %mm0  32-bit multiplier
 * (presumably rp, ap, num, w of OpenSSL's bn_mul_add_words -- TODO confirm
 * against the C prototype.)
 *
 * For each dword i the path computes dst[i] + src[i] * mm0 + carry, stores the
 * low 32 bits back to dst[i] and keeps the high bits in %mm1 as the next
 * carry.  The final carry is returned in %eax.
 *
 * If bit 26 of the first dword of OPENSSL_ia32cap_P is clear, control goes to
 * .L001maw_non_sse2 instead (the integer implementation).
 *
 * NOTE(review): this path has no explicit check for a zero count; with
 * %ecx == 0 the single-word loop stores once before the counter is tested and
 * the decrement wraps.  Confirm that callers never pass 0.
 */
bn_mul_add_words:
.L_bn_mul_add_words_begin:
/* PIC address lookup: call/pop yields the current EIP, which is rebased to the
   GOT so the address of OPENSSL_ia32cap_P can be loaded from its GOT slot. */
call .L000PIC_me_up
.L000PIC_me_up:
popl %eax
leal _GLOBAL_OFFSET_TABLE_+[.-.L000PIC_me_up](%eax),%eax
movl OPENSSL_ia32cap_P@GOT(%eax),%eax
/* CF = bit 26 of the capability word; take the integer path when it is clear. */
btl $26,(%eax)
jnc .L001maw_non_sse2
/* Load the arguments; %mm1 is the running carry and starts at zero. */
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
movd 16(%esp),%mm0
pxor %mm1,%mm1
jmp .L002maw_sse2_entry
.align 16
/* Unrolled body: handles eight dwords per iteration.  The products for the
   upcoming dwords are started early and interleaved with the carry chain of
   the dwords being stored; the statement order is generator output. */
.L003maw_sse2_unrolled:
/* dword 0: carry += dst[0]; the product src[0]*mm0 is added at the paddq below. */
movd (%eax),%mm3
paddq %mm3,%mm1
movd (%edx),%mm2
pmuludq %mm0,%mm2
movd 4(%edx),%mm4
pmuludq %mm0,%mm4
movd 8(%edx),%mm6
pmuludq %mm0,%mm6
movd 12(%edx),%mm7
pmuludq %mm0,%mm7
paddq %mm2,%mm1
/* Pre-add dst[1..3] into the pending products for dwords 1..3. */
movd 4(%eax),%mm3
paddq %mm4,%mm3
movd 8(%eax),%mm5
paddq %mm6,%mm5
movd 12(%eax),%mm4
paddq %mm4,%mm7
/* Store the low 32 bits of dword 0; the shift below keeps the high bits as carry. */
movd %mm1,(%eax)
movd 16(%edx),%mm2
pmuludq %mm0,%mm2
psrlq $32,%mm1
movd 20(%edx),%mm4
pmuludq %mm0,%mm4
paddq %mm3,%mm1
movd 24(%edx),%mm6
pmuludq %mm0,%mm6
movd %mm1,4(%eax)
psrlq $32,%mm1
movd 28(%edx),%mm3
/* Source pointer advances by eight dwords once the last source load is done. */
addl $32,%edx
pmuludq %mm0,%mm3
paddq %mm5,%mm1
movd 16(%eax),%mm5
paddq %mm5,%mm2
movd %mm1,8(%eax)
psrlq $32,%mm1
paddq %mm7,%mm1
movd 20(%eax),%mm5
paddq %mm5,%mm4
movd %mm1,12(%eax)
psrlq $32,%mm1
paddq %mm2,%mm1
movd 24(%eax),%mm5
paddq %mm5,%mm6
movd %mm1,16(%eax)
psrlq $32,%mm1
paddq %mm4,%mm1
movd 28(%eax),%mm5
paddq %mm5,%mm3
movd %mm1,20(%eax)
psrlq $32,%mm1
paddq %mm6,%mm1
movd %mm1,24(%eax)
psrlq $32,%mm1
paddq %mm3,%mm1
movd %mm1,28(%eax)
/* Destination pointer advances by eight dwords; %mm1 now holds only the carry. */
leal 32(%eax),%eax
psrlq $32,%mm1
/* Eight dwords consumed; finish when the count reaches exactly zero. */
subl $8,%ecx
jz .L004maw_sse2_exit
.L002maw_sse2_entry:
/* 4294967288 == 0xFFFFFFF8: non-zero when at least eight dwords remain. */
testl $4294967288,%ecx
jnz .L003maw_sse2_unrolled
.align 4
/* Tail loop: one dword per iteration for the remaining count. */
.L005maw_sse2_loop:
movd (%edx),%mm2
movd (%eax),%mm3
pmuludq %mm0,%mm2
leal 4(%edx),%edx
paddq %mm3,%mm1
paddq %mm2,%mm1
movd %mm1,(%eax)
/* The jnz below consumes the flags of this subl; the intervening psrlq and
   leal do not modify EFLAGS. */
subl $1,%ecx
psrlq $32,%mm1
leal 4(%eax),%eax
jnz .L005maw_sse2_loop
.L004maw_sse2_exit:
/* Return the final carry in %eax and leave MMX state before returning. */
movd %mm1,%eax
emms
ret
.align 16
.L001maw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@ -17,9 +114,9 @@ bn_mul_add_words:
andl $4294967288,%ecx
movl 32(%esp),%ebp
pushl %ecx
jz .L000maw_finish
jz .L006maw_finish
.align 16
.L001maw_loop:
.L007maw_loop:
movl (%ebx),%eax
mull %ebp
@ -96,13 +193,13 @@ bn_mul_add_words:
subl $8,%ecx
leal 32(%ebx),%ebx
leal 32(%edi),%edi
jnz .L001maw_loop
.L000maw_finish:
jnz .L007maw_loop
.L006maw_finish:
movl 32(%esp),%ecx
andl $7,%ecx
jnz .L002maw_finish2
jmp .L003maw_end
.L002maw_finish2:
jnz .L008maw_finish2
jmp .L009maw_end
.L008maw_finish2:
movl (%ebx),%eax
mull %ebp
@ -113,7 +210,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,(%edi)
movl %edx,%esi
jz .L003maw_end
jz .L009maw_end
movl 4(%ebx),%eax
mull %ebp
@ -124,7 +221,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,4(%edi)
movl %edx,%esi
jz .L003maw_end
jz .L009maw_end
movl 8(%ebx),%eax
mull %ebp
@ -135,7 +232,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,8(%edi)
movl %edx,%esi
jz .L003maw_end
jz .L009maw_end
movl 12(%ebx),%eax
mull %ebp
@ -146,7 +243,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,12(%edi)
movl %edx,%esi
jz .L003maw_end
jz .L009maw_end
movl 16(%ebx),%eax
mull %ebp
@ -157,7 +254,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,16(%edi)
movl %edx,%esi
jz .L003maw_end
jz .L009maw_end
movl 20(%ebx),%eax
mull %ebp
@ -168,7 +265,7 @@ bn_mul_add_words:
decl %ecx
movl %eax,20(%edi)
movl %edx,%esi
jz .L003maw_end
jz .L009maw_end
movl 24(%ebx),%eax
mull %ebp
@ -178,7 +275,7 @@ bn_mul_add_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
.L003maw_end:
.L009maw_end:
movl %esi,%eax
popl %ecx
popl %edi
@ -192,6 +289,34 @@ bn_mul_add_words:
.align 16
/*
 * bn_mul_words -- entry point and SSE2 fast path.
 *
 * Reads four 32-bit stack arguments (cdecl layout):
 *    4(%esp) -> %eax  destination pointer, written one dword at a time
 *    8(%esp) -> %edx  source pointer, read one dword at a time
 *   12(%esp) -> %ecx  dword count
 *   16(%esp) -> %mm0  32-bit multiplier
 * (presumably rp, ap, num, w of OpenSSL's bn_mul_words -- TODO confirm.)
 *
 * For each dword i the loop computes src[i] * mm0 + carry, stores the low
 * 32 bits to dst[i] and keeps the high bits in %mm1 as the next carry.  The
 * final carry is returned in %eax.
 *
 * If bit 26 of the first dword of OPENSSL_ia32cap_P is clear, control goes to
 * .L011mw_non_sse2 instead (the integer implementation).
 *
 * NOTE(review): the loop body runs before the counter is tested, so a count
 * of 0 is not handled on this path -- confirm that callers never pass 0.
 */
bn_mul_words:
.L_bn_mul_words_begin:
/* PIC address lookup: call/pop yields the current EIP, which is rebased to the
   GOT so the address of OPENSSL_ia32cap_P can be loaded from its GOT slot. */
call .L010PIC_me_up
.L010PIC_me_up:
popl %eax
leal _GLOBAL_OFFSET_TABLE_+[.-.L010PIC_me_up](%eax),%eax
movl OPENSSL_ia32cap_P@GOT(%eax),%eax
/* CF = bit 26 of the capability word; take the integer path when it is clear. */
btl $26,(%eax)
jnc .L011mw_non_sse2
/* Load the arguments; %mm1 is the running carry and starts at zero. */
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
movd 16(%esp),%mm0
pxor %mm1,%mm1
.align 16
.L012mw_sse2_loop:
/* carry += src[i] * multiplier (32x32 -> 64-bit product). */
movd (%edx),%mm2
pmuludq %mm0,%mm2
leal 4(%edx),%edx
paddq %mm2,%mm1
/* Store the low dword; the shift below leaves the high dword as the carry. */
movd %mm1,(%eax)
/* The jnz consumes the flags of this subl; psrlq and leal leave EFLAGS alone. */
subl $1,%ecx
psrlq $32,%mm1
leal 4(%eax),%eax
jnz .L012mw_sse2_loop
/* Return the final carry in %eax and leave MMX state before returning. */
movd %mm1,%eax
emms
ret
.align 16
.L011mw_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@ -203,8 +328,8 @@ bn_mul_words:
movl 28(%esp),%ebp
movl 32(%esp),%ecx
andl $4294967288,%ebp
jz .L004mw_finish
.L005mw_loop:
jz .L013mw_finish
.L014mw_loop:
movl (%ebx),%eax
mull %ecx
@ -265,14 +390,14 @@ bn_mul_words:
addl $32,%ebx
addl $32,%edi
subl $8,%ebp
jz .L004mw_finish
jmp .L005mw_loop
.L004mw_finish:
jz .L013mw_finish
jmp .L014mw_loop
.L013mw_finish:
movl 28(%esp),%ebp
andl $7,%ebp
jnz .L006mw_finish2
jmp .L007mw_end
.L006mw_finish2:
jnz .L015mw_finish2
jmp .L016mw_end
.L015mw_finish2:
movl (%ebx),%eax
mull %ecx
@ -281,7 +406,7 @@ bn_mul_words:
movl %eax,(%edi)
movl %edx,%esi
decl %ebp
jz .L007mw_end
jz .L016mw_end
movl 4(%ebx),%eax
mull %ecx
@ -290,7 +415,7 @@ bn_mul_words:
movl %eax,4(%edi)
movl %edx,%esi
decl %ebp
jz .L007mw_end
jz .L016mw_end
movl 8(%ebx),%eax
mull %ecx
@ -299,7 +424,7 @@ bn_mul_words:
movl %eax,8(%edi)
movl %edx,%esi
decl %ebp
jz .L007mw_end
jz .L016mw_end
movl 12(%ebx),%eax
mull %ecx
@ -308,7 +433,7 @@ bn_mul_words:
movl %eax,12(%edi)
movl %edx,%esi
decl %ebp
jz .L007mw_end
jz .L016mw_end
movl 16(%ebx),%eax
mull %ecx
@ -317,7 +442,7 @@ bn_mul_words:
movl %eax,16(%edi)
movl %edx,%esi
decl %ebp
jz .L007mw_end
jz .L016mw_end
movl 20(%ebx),%eax
mull %ecx
@ -326,7 +451,7 @@ bn_mul_words:
movl %eax,20(%edi)
movl %edx,%esi
decl %ebp
jz .L007mw_end
jz .L016mw_end
movl 24(%ebx),%eax
mull %ecx
@ -334,7 +459,7 @@ bn_mul_words:
adcl $0,%edx
movl %eax,24(%edi)
movl %edx,%esi
.L007mw_end:
.L016mw_end:
movl %esi,%eax
popl %edi
popl %esi
@ -347,6 +472,29 @@ bn_mul_words:
.align 16
/*
 * bn_sqr_words -- entry point and SSE2 fast path.
 *
 * Reads three 32-bit stack arguments (cdecl layout):
 *    4(%esp) -> %eax  destination pointer, written 8 bytes per element
 *    8(%esp) -> %edx  source pointer, read one dword per element
 *   12(%esp) -> %ecx  element count
 * (presumably r, a, n of OpenSSL's bn_sqr_words -- TODO confirm.)
 *
 * Each iteration squares one source dword and stores the full 64-bit result,
 * so the destination advances twice as fast as the source.  No carry is
 * propagated between elements and %eax is not set to a result value on this
 * path.
 *
 * If bit 26 of the first dword of OPENSSL_ia32cap_P is clear, control goes to
 * .L018sqr_non_sse2 instead (the integer implementation).
 *
 * NOTE(review): the loop body runs before the counter is tested, so a count
 * of 0 is not handled on this path -- confirm that callers never pass 0.
 */
bn_sqr_words:
.L_bn_sqr_words_begin:
/* PIC address lookup: call/pop yields the current EIP, which is rebased to the
   GOT so the address of OPENSSL_ia32cap_P can be loaded from its GOT slot. */
call .L017PIC_me_up
.L017PIC_me_up:
popl %eax
leal _GLOBAL_OFFSET_TABLE_+[.-.L017PIC_me_up](%eax),%eax
movl OPENSSL_ia32cap_P@GOT(%eax),%eax
/* CF = bit 26 of the capability word; take the integer path when it is clear. */
btl $26,(%eax)
jnc .L018sqr_non_sse2
movl 4(%esp),%eax
movl 8(%esp),%edx
movl 12(%esp),%ecx
.align 16
.L019sqr_sse2_loop:
/* %mm0 = src[i] * src[i] (32x32 -> 64-bit product). */
movd (%edx),%mm0
pmuludq %mm0,%mm0
leal 4(%edx),%edx
/* Store both halves of the square at once. */
movq %mm0,(%eax)
/* The jnz consumes the flags of this subl; leal leaves EFLAGS alone. */
subl $1,%ecx
leal 8(%eax),%eax
jnz .L019sqr_sse2_loop
/* Leave MMX state before returning. */
emms
ret
.align 16
.L018sqr_non_sse2:
pushl %ebp
pushl %ebx
pushl %esi
@ -356,8 +504,8 @@ bn_sqr_words:
movl 24(%esp),%edi
movl 28(%esp),%ebx
andl $4294967288,%ebx
jz .L008sw_finish
.L009sw_loop:
jz .L020sw_finish
.L021sw_loop:
movl (%edi),%eax
mull %eax
@ -402,59 +550,59 @@ bn_sqr_words:
addl $32,%edi
addl $64,%esi
subl $8,%ebx
jnz .L009sw_loop
.L008sw_finish:
jnz .L021sw_loop
.L020sw_finish:
movl 28(%esp),%ebx
andl $7,%ebx
jz .L010sw_end
jz .L022sw_end
movl (%edi),%eax
mull %eax
movl %eax,(%esi)
decl %ebx
movl %edx,4(%esi)
jz .L010sw_end
jz .L022sw_end
movl 4(%edi),%eax
mull %eax
movl %eax,8(%esi)
decl %ebx
movl %edx,12(%esi)
jz .L010sw_end
jz .L022sw_end
movl 8(%edi),%eax
mull %eax
movl %eax,16(%esi)
decl %ebx
movl %edx,20(%esi)
jz .L010sw_end
jz .L022sw_end
movl 12(%edi),%eax
mull %eax
movl %eax,24(%esi)
decl %ebx
movl %edx,28(%esi)
jz .L010sw_end
jz .L022sw_end
movl 16(%edi),%eax
mull %eax
movl %eax,32(%esi)
decl %ebx
movl %edx,36(%esi)
jz .L010sw_end
jz .L022sw_end
movl 20(%edi),%eax
mull %eax
movl %eax,40(%esi)
decl %ebx
movl %edx,44(%esi)
jz .L010sw_end
jz .L022sw_end
movl 24(%edi),%eax
mull %eax
movl %eax,48(%esi)
movl %edx,52(%esi)
.L010sw_end:
.L022sw_end:
popl %edi
popl %esi
popl %ebx
@ -488,8 +636,8 @@ bn_add_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
jz .L011aw_finish
.L012aw_loop:
jz .L023aw_finish
.L024aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@ -567,11 +715,11 @@ bn_add_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
jnz .L012aw_loop
.L011aw_finish:
jnz .L024aw_loop
.L023aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
jz .L013aw_end
jz .L025aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -582,7 +730,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
jz .L013aw_end
jz .L025aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@ -593,7 +741,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
jz .L013aw_end
jz .L025aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@ -604,7 +752,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
jz .L013aw_end
jz .L025aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@ -615,7 +763,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
jz .L013aw_end
jz .L025aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@ -626,7 +774,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
jz .L013aw_end
jz .L025aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@ -637,7 +785,7 @@ bn_add_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
jz .L013aw_end
jz .L025aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@ -647,7 +795,7 @@ bn_add_words:
addl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
.L013aw_end:
.L025aw_end:
popl %edi
popl %esi
popl %ebx
@ -670,8 +818,8 @@ bn_sub_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
jz .L014aw_finish
.L015aw_loop:
jz .L026aw_finish
.L027aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@ -749,11 +897,11 @@ bn_sub_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
jnz .L015aw_loop
.L014aw_finish:
jnz .L027aw_loop
.L026aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
jz .L016aw_end
jz .L028aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -764,7 +912,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
jz .L016aw_end
jz .L028aw_end
movl 4(%esi),%ecx
movl 4(%edi),%edx
@ -775,7 +923,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
jz .L016aw_end
jz .L028aw_end
movl 8(%esi),%ecx
movl 8(%edi),%edx
@ -786,7 +934,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
jz .L016aw_end
jz .L028aw_end
movl 12(%esi),%ecx
movl 12(%edi),%edx
@ -797,7 +945,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
jz .L016aw_end
jz .L028aw_end
movl 16(%esi),%ecx
movl 16(%edi),%edx
@ -808,7 +956,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
jz .L016aw_end
jz .L028aw_end
movl 20(%esi),%ecx
movl 20(%edi),%edx
@ -819,7 +967,7 @@ bn_sub_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
jz .L016aw_end
jz .L028aw_end
movl 24(%esi),%ecx
movl 24(%edi),%edx
@ -829,7 +977,7 @@ bn_sub_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
.L016aw_end:
.L028aw_end:
popl %edi
popl %esi
popl %ebx
@ -852,8 +1000,8 @@ bn_sub_part_words:
movl 32(%esp),%ebp
xorl %eax,%eax
andl $4294967288,%ebp
jz .L017aw_finish
.L018aw_loop:
jz .L029aw_finish
.L030aw_loop:
movl (%esi),%ecx
movl (%edi),%edx
@ -931,11 +1079,11 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
jnz .L018aw_loop
.L017aw_finish:
jnz .L030aw_loop
.L029aw_finish:
movl 32(%esp),%ebp
andl $7,%ebp
jz .L019aw_end
jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -949,7 +1097,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
jz .L019aw_end
jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -963,7 +1111,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
jz .L019aw_end
jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -977,7 +1125,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
jz .L019aw_end
jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -991,7 +1139,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
jz .L019aw_end
jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -1005,7 +1153,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
jz .L019aw_end
jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -1019,7 +1167,7 @@ bn_sub_part_words:
addl $4,%edi
addl $4,%ebx
decl %ebp
jz .L019aw_end
jz .L031aw_end
movl (%esi),%ecx
movl (%edi),%edx
@ -1032,20 +1180,20 @@ bn_sub_part_words:
addl $4,%esi
addl $4,%edi
addl $4,%ebx
.L019aw_end:
.L031aw_end:
cmpl $0,36(%esp)
je .L020pw_end
je .L032pw_end
movl 36(%esp),%ebp
cmpl $0,%ebp
je .L020pw_end
jge .L021pw_pos
je .L032pw_end
jge .L033pw_pos
movl $0,%edx
subl %ebp,%edx
movl %edx,%ebp
andl $4294967288,%ebp
jz .L022pw_neg_finish
.L023pw_neg_loop:
jz .L034pw_neg_finish
.L035pw_neg_loop:
movl $0,%ecx
movl (%edi),%edx
@ -1122,13 +1270,13 @@ bn_sub_part_words:
addl $32,%edi
addl $32,%ebx
subl $8,%ebp
jnz .L023pw_neg_loop
.L022pw_neg_finish:
jnz .L035pw_neg_loop
.L034pw_neg_finish:
movl 36(%esp),%edx
movl $0,%ebp
subl %edx,%ebp
andl $7,%ebp
jz .L020pw_end
jz .L032pw_end
movl $0,%ecx
movl (%edi),%edx
@ -1139,7 +1287,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,(%ebx)
jz .L020pw_end
jz .L032pw_end
movl $0,%ecx
movl 4(%edi),%edx
@ -1150,7 +1298,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,4(%ebx)
jz .L020pw_end
jz .L032pw_end
movl $0,%ecx
movl 8(%edi),%edx
@ -1161,7 +1309,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,8(%ebx)
jz .L020pw_end
jz .L032pw_end
movl $0,%ecx
movl 12(%edi),%edx
@ -1172,7 +1320,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,12(%ebx)
jz .L020pw_end
jz .L032pw_end
movl $0,%ecx
movl 16(%edi),%edx
@ -1183,7 +1331,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,16(%ebx)
jz .L020pw_end
jz .L032pw_end
movl $0,%ecx
movl 20(%edi),%edx
@ -1194,7 +1342,7 @@ bn_sub_part_words:
adcl $0,%eax
decl %ebp
movl %ecx,20(%ebx)
jz .L020pw_end
jz .L032pw_end
movl $0,%ecx
movl 24(%edi),%edx
@ -1204,181 +1352,182 @@ bn_sub_part_words:
subl %edx,%ecx
adcl $0,%eax
movl %ecx,24(%ebx)
jmp .L020pw_end
.L021pw_pos:
jmp .L032pw_end
.L033pw_pos:
andl $4294967288,%ebp
jz .L024pw_pos_finish
.L025pw_pos_loop:
jz .L036pw_pos_finish
.L037pw_pos_loop:
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
jnc .L026pw_nc0
jnc .L038pw_nc0
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
jnc .L027pw_nc1
jnc .L039pw_nc1
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
jnc .L028pw_nc2
jnc .L040pw_nc2
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
jnc .L029pw_nc3
jnc .L041pw_nc3
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
jnc .L030pw_nc4
jnc .L042pw_nc4
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
jnc .L031pw_nc5
jnc .L043pw_nc5
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
jnc .L032pw_nc6
jnc .L044pw_nc6
movl 28(%esi),%ecx
subl %eax,%ecx
movl %ecx,28(%ebx)
jnc .L033pw_nc7
jnc .L045pw_nc7
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
jnz .L025pw_pos_loop
.L024pw_pos_finish:
jnz .L037pw_pos_loop
.L036pw_pos_finish:
movl 36(%esp),%ebp
andl $7,%ebp
jz .L020pw_end
jz .L032pw_end
movl (%esi),%ecx
subl %eax,%ecx
movl %ecx,(%ebx)
jnc .L034pw_tail_nc0
jnc .L046pw_tail_nc0
decl %ebp
jz .L020pw_end
jz .L032pw_end
movl 4(%esi),%ecx
subl %eax,%ecx
movl %ecx,4(%ebx)
jnc .L035pw_tail_nc1
jnc .L047pw_tail_nc1
decl %ebp
jz .L020pw_end
jz .L032pw_end
movl 8(%esi),%ecx
subl %eax,%ecx
movl %ecx,8(%ebx)
jnc .L036pw_tail_nc2
jnc .L048pw_tail_nc2
decl %ebp
jz .L020pw_end
jz .L032pw_end
movl 12(%esi),%ecx
subl %eax,%ecx
movl %ecx,12(%ebx)
jnc .L037pw_tail_nc3
jnc .L049pw_tail_nc3
decl %ebp
jz .L020pw_end
jz .L032pw_end
movl 16(%esi),%ecx
subl %eax,%ecx
movl %ecx,16(%ebx)
jnc .L038pw_tail_nc4
jnc .L050pw_tail_nc4
decl %ebp
jz .L020pw_end
jz .L032pw_end
movl 20(%esi),%ecx
subl %eax,%ecx
movl %ecx,20(%ebx)
jnc .L039pw_tail_nc5
jnc .L051pw_tail_nc5
decl %ebp
jz .L020pw_end
jz .L032pw_end
movl 24(%esi),%ecx
subl %eax,%ecx
movl %ecx,24(%ebx)
jnc .L040pw_tail_nc6
jnc .L052pw_tail_nc6
movl $1,%eax
jmp .L020pw_end
.L041pw_nc_loop:
jmp .L032pw_end
.L053pw_nc_loop:
movl (%esi),%ecx
movl %ecx,(%ebx)
.L026pw_nc0:
.L038pw_nc0:
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
.L027pw_nc1:
.L039pw_nc1:
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
.L028pw_nc2:
.L040pw_nc2:
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
.L029pw_nc3:
.L041pw_nc3:
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
.L030pw_nc4:
.L042pw_nc4:
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
.L031pw_nc5:
.L043pw_nc5:
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
.L032pw_nc6:
.L044pw_nc6:
movl 28(%esi),%ecx
movl %ecx,28(%ebx)
.L033pw_nc7:
.L045pw_nc7:
addl $32,%esi
addl $32,%ebx
subl $8,%ebp
jnz .L041pw_nc_loop
jnz .L053pw_nc_loop
movl 36(%esp),%ebp
andl $7,%ebp
jz .L042pw_nc_end
jz .L054pw_nc_end
movl (%esi),%ecx
movl %ecx,(%ebx)
.L034pw_tail_nc0:
.L046pw_tail_nc0:
decl %ebp
jz .L042pw_nc_end
jz .L054pw_nc_end
movl 4(%esi),%ecx
movl %ecx,4(%ebx)
.L035pw_tail_nc1:
.L047pw_tail_nc1:
decl %ebp
jz .L042pw_nc_end
jz .L054pw_nc_end
movl 8(%esi),%ecx
movl %ecx,8(%ebx)
.L036pw_tail_nc2:
.L048pw_tail_nc2:
decl %ebp
jz .L042pw_nc_end
jz .L054pw_nc_end
movl 12(%esi),%ecx
movl %ecx,12(%ebx)
.L037pw_tail_nc3:
.L049pw_tail_nc3:
decl %ebp
jz .L042pw_nc_end
jz .L054pw_nc_end
movl 16(%esi),%ecx
movl %ecx,16(%ebx)
.L038pw_tail_nc4:
.L050pw_tail_nc4:
decl %ebp
jz .L042pw_nc_end
jz .L054pw_nc_end
movl 20(%esi),%ecx
movl %ecx,20(%ebx)
.L039pw_tail_nc5:
.L051pw_tail_nc5:
decl %ebp
jz .L042pw_nc_end
jz .L054pw_nc_end
movl 24(%esi),%ecx
movl %ecx,24(%ebx)
.L040pw_tail_nc6:
.L042pw_nc_end:
.L052pw_tail_nc6:
.L054pw_nc_end:
movl $0,%eax
.L020pw_end:
.L032pw_end:
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.size bn_sub_part_words,.-.L_bn_sub_part_words_begin
.comm OPENSSL_ia32cap_P,8,4

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
.PATH.S: ${.PARSEDIR}
MODES_SRCS += ghash-x86.o
MODESCPPFLAGS = -DGHASH_ASM
.include "../../modes.inc"

File diff suppressed because it is too large Load Diff

View File

@ -25,6 +25,278 @@ sha512_block_data_order:
movl %edi,4(%esp)
movl %eax,8(%esp)
movl %ebx,12(%esp)
leal _GLOBAL_OFFSET_TABLE_+[.-.L001K512](%ebp),%edx
movl OPENSSL_ia32cap_P@GOT(%edx),%edx
btl $26,(%edx)
jnc .L002loop_x86
movq (%esi),%mm0
movq 8(%esi),%mm1
movq 16(%esi),%mm2
movq 24(%esi),%mm3
movq 32(%esi),%mm4
movq 40(%esi),%mm5
movq 48(%esi),%mm6
movq 56(%esi),%mm7
subl $80,%esp
.align 16
.L003loop_sse2:
movq %mm1,8(%esp)
movq %mm2,16(%esp)
movq %mm3,24(%esp)
movq %mm5,40(%esp)
movq %mm6,48(%esp)
movq %mm7,56(%esp)
movl (%edi),%ecx
movl 4(%edi),%edx
addl $8,%edi
bswap %ecx
bswap %edx
movl %ecx,76(%esp)
movl %edx,72(%esp)
.align 16
.L00400_14_sse2:
movl (%edi),%eax
movl 4(%edi),%ebx
addl $8,%edi
bswap %eax
bswap %ebx
movl %eax,68(%esp)
movl %ebx,64(%esp)
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
movq %mm4,%mm1
movq %mm4,%mm2
psrlq $14,%mm1
movq %mm4,32(%esp)
psllq $23,%mm2
movq %mm1,%mm3
psrlq $4,%mm1
pxor %mm2,%mm3
psllq $23,%mm2
pxor %mm1,%mm3
psrlq $23,%mm1
pxor %mm2,%mm3
psllq $4,%mm2
pxor %mm1,%mm3
paddq (%ebp),%mm7
pxor %mm2,%mm3
pxor %mm6,%mm5
movq 8(%esp),%mm1
pand %mm4,%mm5
movq 16(%esp),%mm2
pxor %mm6,%mm5
movq 24(%esp),%mm4
paddq %mm5,%mm3
movq %mm0,(%esp)
paddq %mm7,%mm3
movq %mm0,%mm5
movq %mm0,%mm6
paddq 72(%esp),%mm3
psrlq $28,%mm5
paddq %mm3,%mm4
psllq $25,%mm6
movq %mm5,%mm7
psrlq $6,%mm5
pxor %mm6,%mm7
psllq $5,%mm6
pxor %mm5,%mm7
psrlq $5,%mm5
pxor %mm6,%mm7
psllq $6,%mm6
pxor %mm5,%mm7
subl $8,%esp
pxor %mm6,%mm7
movq %mm0,%mm5
por %mm2,%mm0
pand %mm2,%mm5
pand %mm1,%mm0
por %mm0,%mm5
paddq %mm5,%mm7
movq %mm3,%mm0
movb (%ebp),%dl
paddq %mm7,%mm0
addl $8,%ebp
cmpb $53,%dl
jne .L00400_14_sse2
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
movq %mm4,%mm1
movq %mm4,%mm2
psrlq $14,%mm1
movq %mm4,32(%esp)
psllq $23,%mm2
movq %mm1,%mm3
psrlq $4,%mm1
pxor %mm2,%mm3
psllq $23,%mm2
pxor %mm1,%mm3
psrlq $23,%mm1
pxor %mm2,%mm3
psllq $4,%mm2
pxor %mm1,%mm3
paddq (%ebp),%mm7
pxor %mm2,%mm3
pxor %mm6,%mm5
movq 8(%esp),%mm1
pand %mm4,%mm5
movq 16(%esp),%mm2
pxor %mm6,%mm5
movq 24(%esp),%mm4
paddq %mm5,%mm3
movq %mm0,(%esp)
paddq %mm7,%mm3
movq %mm0,%mm5
movq %mm0,%mm6
paddq 72(%esp),%mm3
psrlq $28,%mm5
paddq %mm3,%mm4
psllq $25,%mm6
movq %mm5,%mm7
psrlq $6,%mm5
pxor %mm6,%mm7
psllq $5,%mm6
pxor %mm5,%mm7
psrlq $5,%mm5
pxor %mm6,%mm7
psllq $6,%mm6
pxor %mm5,%mm7
subl $8,%esp
pxor %mm6,%mm7
movq %mm0,%mm5
por %mm2,%mm0
movq 88(%esp),%mm6
pand %mm2,%mm5
pand %mm1,%mm0
movq 192(%esp),%mm2
por %mm0,%mm5
paddq %mm5,%mm7
movq %mm3,%mm0
movb (%ebp),%dl
paddq %mm7,%mm0
addl $8,%ebp
.align 16
.L00516_79_sse2:
movq %mm2,%mm1
psrlq $1,%mm2
movq %mm6,%mm7
psrlq $6,%mm6
movq %mm2,%mm3
psrlq $6,%mm2
movq %mm6,%mm5
psrlq $13,%mm6
pxor %mm2,%mm3
psrlq $1,%mm2
pxor %mm6,%mm5
psrlq $42,%mm6
pxor %mm2,%mm3
movq 200(%esp),%mm2
psllq $56,%mm1
pxor %mm6,%mm5
psllq $3,%mm7
pxor %mm1,%mm3
paddq 128(%esp),%mm2
psllq $7,%mm1
pxor %mm7,%mm5
psllq $42,%mm7
pxor %mm1,%mm3
pxor %mm7,%mm5
paddq %mm5,%mm3
paddq %mm2,%mm3
movq %mm3,72(%esp)
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
movq %mm4,%mm1
movq %mm4,%mm2
psrlq $14,%mm1
movq %mm4,32(%esp)
psllq $23,%mm2
movq %mm1,%mm3
psrlq $4,%mm1
pxor %mm2,%mm3
psllq $23,%mm2
pxor %mm1,%mm3
psrlq $23,%mm1
pxor %mm2,%mm3
psllq $4,%mm2
pxor %mm1,%mm3
paddq (%ebp),%mm7
pxor %mm2,%mm3
pxor %mm6,%mm5
movq 8(%esp),%mm1
pand %mm4,%mm5
movq 16(%esp),%mm2
pxor %mm6,%mm5
movq 24(%esp),%mm4
paddq %mm5,%mm3
movq %mm0,(%esp)
paddq %mm7,%mm3
movq %mm0,%mm5
movq %mm0,%mm6
paddq 72(%esp),%mm3
psrlq $28,%mm5
paddq %mm3,%mm4
psllq $25,%mm6
movq %mm5,%mm7
psrlq $6,%mm5
pxor %mm6,%mm7
psllq $5,%mm6
pxor %mm5,%mm7
psrlq $5,%mm5
pxor %mm6,%mm7
psllq $6,%mm6
pxor %mm5,%mm7
subl $8,%esp
pxor %mm6,%mm7
movq %mm0,%mm5
por %mm2,%mm0
movq 88(%esp),%mm6
pand %mm2,%mm5
pand %mm1,%mm0
movq 192(%esp),%mm2
por %mm0,%mm5
paddq %mm5,%mm7
movq %mm3,%mm0
movb (%ebp),%dl
paddq %mm7,%mm0
addl $8,%ebp
cmpb $23,%dl
jne .L00516_79_sse2
movq 8(%esp),%mm1
movq 16(%esp),%mm2
movq 24(%esp),%mm3
movq 40(%esp),%mm5
movq 48(%esp),%mm6
movq 56(%esp),%mm7
paddq (%esi),%mm0
paddq 8(%esi),%mm1
paddq 16(%esi),%mm2
paddq 24(%esi),%mm3
paddq 32(%esi),%mm4
paddq 40(%esi),%mm5
paddq 48(%esi),%mm6
paddq 56(%esi),%mm7
movq %mm0,(%esi)
movq %mm1,8(%esi)
movq %mm2,16(%esi)
movq %mm3,24(%esi)
movq %mm4,32(%esi)
movq %mm5,40(%esi)
movq %mm6,48(%esi)
movq %mm7,56(%esi)
addl $640,%esp
subl $640,%ebp
cmpl 88(%esp),%edi
jb .L003loop_sse2
emms
movl 92(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.align 16
.L002loop_x86:
movl (%edi),%eax
@ -130,7 +402,7 @@ sha512_block_data_order:
movl $16,%ecx
.long 2784229001
.align 16
.L00300_15_x86:
.L00600_15_x86:
movl 40(%esp),%ecx
movl 44(%esp),%edx
movl %ecx,%esi
@ -237,9 +509,9 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $148,%dl
jne .L00300_15_x86
jne .L00600_15_x86
.align 16
.L00416_79_x86:
.L00716_79_x86:
movl 312(%esp),%ecx
movl 316(%esp),%edx
movl %ecx,%esi
@ -412,7 +684,7 @@ sha512_block_data_order:
subl $8,%esp
leal 8(%ebp),%ebp
cmpb $23,%dl
jne .L00416_79_x86
jne .L00716_79_x86
movl 840(%esp),%esi
movl 844(%esp),%edi
movl (%esi),%eax
@ -561,3 +833,4 @@ sha512_block_data_order:
.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.comm OPENSSL_ia32cap_P,8,4

View File

@ -226,6 +226,18 @@ OPENSSL_wipe_cpu:
movl (%ecx),%ecx
btl $1,(%ecx)
jnc .L015no_x87
/*
 * SSE2 register wipe (fragment; the start of the enclosing function is not
 * visible in this hunk).
 *
 * 83886080 == 0x05000000, i.e. bits 24 and 26.  %ecx is masked with that
 * value and the XMM wipe is skipped (jump to .L016no_sse2) unless both bits
 * are set; otherwise %xmm0-%xmm7 are zeroed.
 *
 * NOTE(review): %ecx is used as a capability *value* here, but the two
 * instructions immediately before this fragment reload it with
 * "movl (%ecx),%ecx" and then dereference it again in "btl $1,(%ecx)".
 * Treating the same register as both a pointer and a value looks
 * inconsistent -- confirm against the generator before relying on this path.
 */
andl $83886080,%ecx
cmpl $83886080,%ecx
jne .L016no_sse2
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
.L016no_sse2:
.long 4007259865,4007259865,4007259865,4007259865,2430851995
.L015no_x87:
leal 4(%esp),%eax
@ -241,11 +253,11 @@ OPENSSL_atomic_add:
pushl %ebx
nop
movl (%edx),%eax
.L016spin:
.L017spin:
leal (%eax,%ecx,1),%ebx
nop
.long 447811568
jne .L016spin
jne .L017spin
movl %ebx,%eax
popl %ebx
ret
@ -286,32 +298,32 @@ OPENSSL_cleanse:
movl 8(%esp),%ecx
xorl %eax,%eax
cmpl $7,%ecx
jae .L017lot
jae .L018lot
cmpl $0,%ecx
je .L018ret
.L019little:
je .L019ret
.L020little:
movb %al,(%edx)
subl $1,%ecx
leal 1(%edx),%edx
jnz .L019little
.L018ret:
jnz .L020little
.L019ret:
ret
.align 16
.L017lot:
.L018lot:
testl $3,%edx
jz .L020aligned
jz .L021aligned
movb %al,(%edx)
leal -1(%ecx),%ecx
leal 1(%edx),%edx
jmp .L017lot
.L020aligned:
jmp .L018lot
.L021aligned:
movl %eax,(%edx)
leal -4(%ecx),%ecx
testl $-4,%ecx
leal 4(%edx),%edx
jnz .L020aligned
jnz .L021aligned
cmpl $0,%ecx
jne .L019little
jne .L020little
ret
.size OPENSSL_cleanse,.-.L_OPENSSL_cleanse_begin
.globl OPENSSL_ia32_rdrand
@ -320,11 +332,11 @@ OPENSSL_cleanse:
OPENSSL_ia32_rdrand:
.L_OPENSSL_ia32_rdrand_begin:
movl $8,%ecx
.L021loop:
.L022loop:
.byte 15,199,240
jc .L022break
loop .L021loop
.L022break:
jc .L023break
loop .L022loop
.L023break:
cmpl $0,%eax
cmovel %ecx,%eax
ret