Backout my over-eager backout - we need this for sparc64 building netbsd32
compat libs, but still lots of #ifdef's missing to make this ready for
usage by sparc.
martin 2018-02-20 13:14:02 +00:00
parent 32bba72871
commit 031637411f
16 changed files with 8295 additions and 637 deletions
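
The substance of the diff below is mechanical: hard-coded 32-bit stack constants (the -112 frame size, 0 stack bias, and %icc condition codes) are replaced by the STACK_FRAME, STACK_BIAS and SIZE_T_CC macros, so the same generated assembly can serve both sparc and sparc64. For orientation, a sketch of what OpenSSL's sparc_arch.h defines these to (values quoted from upstream as recalled, so treat as illustrative rather than authoritative):

/* sketch of the sparc_arch.h macros used throughout the diff */
#if defined(__arch64__)
# define STACK_FRAME 192   /* minimal SPARCv9 register-save frame */
# define STACK_BIAS  2047  /* the v9 ABI biases %sp and %fp by 2047 */
# define SIZE_T_CC   %xcc  /* size_t compares use the 64-bit cc */
#else
# define STACK_FRAME 112   /* minimal v8/v8+ frame, matching the old -112 */
# define STACK_BIAS  0     /* no stack bias on 32-bit sparc */
# define SIZE_T_CC   %icc  /* size_t compares use the 32-bit cc */
#endif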

View File

@ -1,3 +1,9 @@
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 256
@ -296,8 +302,8 @@ AES_Te:
.align 64
.skip 16
_sparcv9_AES_encrypt:
save %sp,-112-16,%sp
stx %i7,[%sp+0+112+0] ! off-load return address
save %sp,-STACK_FRAME-16,%sp
stx %i7,[%sp+STACK_BIAS+STACK_FRAME+0] ! off-load return address
ld [%i5+240],%i7
ld [%i5+0],%l4
ld [%i5+4],%l5 !
@ -585,7 +591,7 @@ _sparcv9_AES_encrypt:
ldub [%i7+%g5],%g5
sll %o0,16,%o0
xor %l0,%i0,%i0
ldx [%sp+0+112+0],%i7 ! restore return address
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%i7 ! restore return address
sll %o1,8,%o1 !
xor %o0,%i0,%i0
@ -623,7 +629,7 @@ AES_encrypt:
or %o0,%o1,%g1
andcc %g1,3,%g0
bnz,pn %xcc,.Lunaligned_enc
save %sp,-112,%sp
save %sp,-STACK_FRAME,%sp
ld [%i0+0],%o0
ld [%i0+4],%o1
@ -1028,8 +1034,8 @@ AES_Td:
.align 64
.skip 16
_sparcv9_AES_decrypt:
save %sp,-112-16,%sp
stx %i7,[%sp+0+112+0] ! off-load return address
save %sp,-STACK_FRAME-16,%sp
stx %i7,[%sp+STACK_BIAS+STACK_FRAME+0] ! off-load return address
ld [%i5+240],%i7
ld [%i5+0],%l4
ld [%i5+4],%l5 !
@ -1317,7 +1323,7 @@ _sparcv9_AES_decrypt:
ldub [%i7+%g5],%g5
sll %o0,16,%o0
xor %l0,%i0,%i0
ldx [%sp+0+112+0],%i7 ! restore return address
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%i7 ! restore return address
sll %o1,8,%o1 !
xor %o0,%i0,%i0
@ -1355,7 +1361,7 @@ AES_decrypt:
or %o0,%o1,%g1
andcc %g1,3,%g0
bnz,pn %xcc,.Lunaligned_dec
save %sp,-112,%sp
save %sp,-STACK_FRAME,%sp
ld [%i0+0],%o0
ld [%i0+4],%o1

File diff suppressed because it is too large

View File

@ -1,3 +1,10 @@
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.text
.globl aes_t4_encrypt
@ -508,9 +515,9 @@ _aes128_load_deckey=_aes128_loadkey
.globl aes128_t4_cbc_encrypt
.align 32
aes128_t4_cbc_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L128_cbc_enc_abort
be,pn SIZE_T_CC, .L128_cbc_enc_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f0
@ -529,7 +536,7 @@ aes128_t4_cbc_encrypt:
and %i1, 7, %l2
cmp %i2, 127
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<128 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<128 ||
brnz,pn %l5, .L128cbc_enc_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -655,7 +662,7 @@ aes128_t4_cbc_encrypt:
.globl aes128_t4_ctr32_encrypt
.align 32
aes128_t4_ctr32_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
prefetch [%i0], 20
@ -685,7 +692,7 @@ aes128_t4_ctr32_encrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L128_ctr32_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -904,7 +911,7 @@ aes128_t4_ctr32_encrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L128_ctr32_blk_loop2x
bgu,pt SIZE_T_CC, .L128_ctr32_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -922,17 +929,17 @@ aes128_t4_ctr32_encrypt:
.globl aes128_t4_xts_encrypt
.align 32
aes128_t4_xts_encrypt:
save %sp, -112-16, %sp
save %sp, -STACK_FRAME-16, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
mov %i5, %o0
add %fp, 0-16, %o1
add %fp, STACK_BIAS-16, %o1
call aes_t4_encrypt
mov %i4, %o2
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
ldxa [%l7]0x88, %g2
add %fp, 0-8, %l7
add %fp, STACK_BIAS-8, %l7
ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
sethi %hi(0x76543210), %l7
@ -955,7 +962,7 @@ aes128_t4_xts_encrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L128_xts_enblk ! %i0==%i1)
srl %l3, %l2, %l3
@ -1253,7 +1260,7 @@ aes128_t4_xts_encrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L128_xts_enblk2x
bgu,pt SIZE_T_CC, .L128_xts_enblk2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -1274,11 +1281,11 @@ aes128_t4_xts_encrypt:
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L128_xts_ensteal:
std %f0, [%fp + 0-16] ! copy of output
std %f2, [%fp + 0-8]
std %f0, [%fp + STACK_BIAS-16] ! copy of output
std %f2, [%fp + STACK_BIAS-8]
srl %l0, 3, %l0
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
add %i0, %l0, %i0 ! original %i0+%i2&-15
add %i1, %l2, %i1 ! original %i1+%i2&-15
mov 0, %l0
@ -1306,17 +1313,17 @@ aes128_t4_xts_encrypt:
.globl aes128_t4_xts_decrypt
.align 32
aes128_t4_xts_decrypt:
save %sp, -112-16, %sp
save %sp, -STACK_FRAME-16, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
mov %i5, %o0
add %fp, 0-16, %o1
add %fp, STACK_BIAS-16, %o1
call aes_t4_encrypt
mov %i4, %o2
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
ldxa [%l7]0x88, %g2
add %fp, 0-8, %l7
add %fp, STACK_BIAS-8, %l7
ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
sethi %hi(0x76543210), %l7
@ -1342,7 +1349,7 @@ aes128_t4_xts_decrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L128_xts_deblk ! %i0==%i1)
srl %l3, %l2, %l3
@ -1641,7 +1648,7 @@ aes128_t4_xts_decrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L128_xts_deblk2x
bgu,pt SIZE_T_CC, .L128_xts_deblk2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -1699,11 +1706,11 @@ aes128_t4_xts_decrypt:
.word 0x81b30d80 !fxor %f12,%f0,%f0 ! ^= tweak[0]
.word 0x85b38d82 !fxor %f14,%f2,%f2
std %f0, [%fp + 0-16]
std %f2, [%fp + 0-8]
std %f0, [%fp + STACK_BIAS-16]
std %f2, [%fp + STACK_BIAS-8]
srl %l0, 3, %l0
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
add %i0, %l0, %i0 ! original %i0+%i2&-15
add %i1, %l2, %i1 ! original %i1+%i2&-15
mov 0, %l0
@ -1732,9 +1739,9 @@ aes128_t4_xts_decrypt:
.globl aes128_t4_cbc_decrypt
.align 32
aes128_t4_cbc_decrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L128_cbc_dec_abort
be,pn SIZE_T_CC, .L128_cbc_dec_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f12 ! load ivec
@ -1753,7 +1760,7 @@ aes128_t4_cbc_decrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L128cbc_dec_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -1979,7 +1986,7 @@ aes128_t4_cbc_decrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L128_cbc_dec_blk_loop2x
bgu,pt SIZE_T_CC, .L128_cbc_dec_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -2304,9 +2311,9 @@ _aes256_load_deckey=_aes192_loadkey
.globl aes256_t4_cbc_encrypt
.align 32
aes256_t4_cbc_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L256_cbc_enc_abort
be,pn SIZE_T_CC, .L256_cbc_enc_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f0
@ -2325,7 +2332,7 @@ aes256_t4_cbc_encrypt:
and %i1, 7, %l2
cmp %i2, 127
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<128 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<128 ||
brnz,pn %l5, .L256cbc_enc_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -2451,9 +2458,9 @@ aes256_t4_cbc_encrypt:
.globl aes192_t4_cbc_encrypt
.align 32
aes192_t4_cbc_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L192_cbc_enc_abort
be,pn SIZE_T_CC, .L192_cbc_enc_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f0
@ -2472,7 +2479,7 @@ aes192_t4_cbc_encrypt:
and %i1, 7, %l2
cmp %i2, 127
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<128 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<128 ||
brnz,pn %l5, .L192cbc_enc_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -2598,7 +2605,7 @@ aes192_t4_cbc_encrypt:
.globl aes256_t4_ctr32_encrypt
.align 32
aes256_t4_ctr32_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
prefetch [%i0], 20
@ -2628,7 +2635,7 @@ aes256_t4_ctr32_encrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L256_ctr32_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -2847,7 +2854,7 @@ aes256_t4_ctr32_encrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L256_ctr32_blk_loop2x
bgu,pt SIZE_T_CC, .L256_ctr32_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -2865,17 +2872,17 @@ aes256_t4_ctr32_encrypt:
.globl aes256_t4_xts_encrypt
.align 32
aes256_t4_xts_encrypt:
save %sp, -112-16, %sp
save %sp, -STACK_FRAME-16, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
mov %i5, %o0
add %fp, 0-16, %o1
add %fp, STACK_BIAS-16, %o1
call aes_t4_encrypt
mov %i4, %o2
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
ldxa [%l7]0x88, %g2
add %fp, 0-8, %l7
add %fp, STACK_BIAS-8, %l7
ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
sethi %hi(0x76543210), %l7
@ -2898,7 +2905,7 @@ aes256_t4_xts_encrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L256_xts_enblk ! %i0==%i1)
srl %l3, %l2, %l3
@ -3196,7 +3203,7 @@ aes256_t4_xts_encrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L256_xts_enblk2x
bgu,pt SIZE_T_CC, .L256_xts_enblk2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -3217,11 +3224,11 @@ aes256_t4_xts_encrypt:
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align 32
.L256_xts_ensteal:
std %f0, [%fp + 0-16] ! copy of output
std %f2, [%fp + 0-8]
std %f0, [%fp + STACK_BIAS-16] ! copy of output
std %f2, [%fp + STACK_BIAS-8]
srl %l0, 3, %l0
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
add %i0, %l0, %i0 ! original %i0+%i2&-15
add %i1, %l2, %i1 ! original %i1+%i2&-15
mov 0, %l0
@ -3249,17 +3256,17 @@ aes256_t4_xts_encrypt:
.globl aes256_t4_xts_decrypt
.align 32
aes256_t4_xts_decrypt:
save %sp, -112-16, %sp
save %sp, -STACK_FRAME-16, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
mov %i5, %o0
add %fp, 0-16, %o1
add %fp, STACK_BIAS-16, %o1
call aes_t4_encrypt
mov %i4, %o2
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
ldxa [%l7]0x88, %g2
add %fp, 0-8, %l7
add %fp, STACK_BIAS-8, %l7
ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
sethi %hi(0x76543210), %l7
@ -3285,7 +3292,7 @@ aes256_t4_xts_decrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L256_xts_deblk ! %i0==%i1)
srl %l3, %l2, %l3
@ -3584,7 +3591,7 @@ aes256_t4_xts_decrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L256_xts_deblk2x
bgu,pt SIZE_T_CC, .L256_xts_deblk2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -3642,11 +3649,11 @@ aes256_t4_xts_decrypt:
.word 0x81b30d80 !fxor %f12,%f0,%f0 ! ^= tweak[0]
.word 0x85b38d82 !fxor %f14,%f2,%f2
std %f0, [%fp + 0-16]
std %f2, [%fp + 0-8]
std %f0, [%fp + STACK_BIAS-16]
std %f2, [%fp + STACK_BIAS-8]
srl %l0, 3, %l0
add %fp, 0-16, %l7
add %fp, STACK_BIAS-16, %l7
add %i0, %l0, %i0 ! original %i0+%i2&-15
add %i1, %l2, %i1 ! original %i1+%i2&-15
mov 0, %l0
@ -3675,7 +3682,7 @@ aes256_t4_xts_decrypt:
.globl aes192_t4_ctr32_encrypt
.align 32
aes192_t4_ctr32_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
prefetch [%i0], 20
@ -3705,7 +3712,7 @@ aes192_t4_ctr32_encrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L192_ctr32_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -3924,7 +3931,7 @@ aes192_t4_ctr32_encrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L192_ctr32_blk_loop2x
bgu,pt SIZE_T_CC, .L192_ctr32_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -3942,9 +3949,9 @@ aes192_t4_ctr32_encrypt:
.globl aes192_t4_cbc_decrypt
.align 32
aes192_t4_cbc_decrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L192_cbc_dec_abort
be,pn SIZE_T_CC, .L192_cbc_dec_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f12 ! load ivec
@ -3963,7 +3970,7 @@ aes192_t4_cbc_decrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L192cbc_dec_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -4189,7 +4196,7 @@ aes192_t4_cbc_decrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L192_cbc_dec_blk_loop2x
bgu,pt SIZE_T_CC, .L192_cbc_dec_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -4210,9 +4217,9 @@ aes192_t4_cbc_decrypt:
.globl aes256_t4_cbc_decrypt
.align 32
aes256_t4_cbc_decrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L256_cbc_dec_abort
be,pn SIZE_T_CC, .L256_cbc_dec_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f12 ! load ivec
@ -4231,7 +4238,7 @@ aes256_t4_cbc_decrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L256cbc_dec_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -4457,7 +4464,7 @@ aes256_t4_cbc_decrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L256_cbc_dec_blk_loop2x
bgu,pt SIZE_T_CC, .L256_cbc_dec_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2

View File

@ -3,12 +3,12 @@
/*
* ====================================================================
* Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
* project.
* Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Rights for redistribution and usage in source and binary forms are
* granted according to the OpenSSL license. Warranty of any kind is
* disclaimed.
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
* ====================================================================
*/

View File

@ -1,3 +1,5 @@
#include "sparc_arch.h"
.text
.globl cmll_t4_encrypt
@ -974,9 +976,9 @@ _cmll256_decrypt_2x:
.globl cmll128_t4_cbc_encrypt
.align 32
cmll128_t4_cbc_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L128_cbc_enc_abort
be,pn SIZE_T_CC, .L128_cbc_enc_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f0
@ -995,7 +997,7 @@ cmll128_t4_cbc_encrypt:
and %i1, 7, %l2
cmp %i2, 127
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<128 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<128 ||
brnz,pn %l5, .L128cbc_enc_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -1121,9 +1123,9 @@ cmll128_t4_cbc_encrypt:
.globl cmll256_t4_cbc_encrypt
.align 32
cmll256_t4_cbc_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L256_cbc_enc_abort
be,pn SIZE_T_CC, .L256_cbc_enc_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f0
@ -1142,7 +1144,7 @@ cmll256_t4_cbc_encrypt:
and %i1, 7, %l2
cmp %i2, 127
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<128 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<128 ||
brnz,pn %l5, .L256cbc_enc_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -1268,9 +1270,9 @@ cmll256_t4_cbc_encrypt:
.globl cmll128_t4_cbc_decrypt
.align 32
cmll128_t4_cbc_decrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L128_cbc_dec_abort
be,pn SIZE_T_CC, .L128_cbc_dec_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f12 ! load ivec
@ -1289,7 +1291,7 @@ cmll128_t4_cbc_decrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L128cbc_dec_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -1515,7 +1517,7 @@ cmll128_t4_cbc_decrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L128_cbc_dec_blk_loop2x
bgu,pt SIZE_T_CC, .L128_cbc_dec_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -1536,9 +1538,9 @@ cmll128_t4_cbc_decrypt:
.globl cmll256_t4_cbc_decrypt
.align 32
cmll256_t4_cbc_decrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
cmp %i2, 0
be,pn %icc, .L256_cbc_dec_abort
be,pn SIZE_T_CC, .L256_cbc_dec_abort
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
sub %i0, %i1, %l5 ! %i0!=%i1
ld [%i4 + 0], %f12 ! load ivec
@ -1557,7 +1559,7 @@ cmll256_t4_cbc_decrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L256cbc_dec_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -1783,7 +1785,7 @@ cmll256_t4_cbc_decrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L256_cbc_dec_blk_loop2x
bgu,pt SIZE_T_CC, .L256_cbc_dec_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -1804,7 +1806,7 @@ cmll256_t4_cbc_decrypt:
.globl cmll128_t4_ctr32_encrypt
.align 32
cmll128_t4_ctr32_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
prefetch [%i0], 20
@ -1834,7 +1836,7 @@ cmll128_t4_ctr32_encrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L128_ctr32_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -2053,7 +2055,7 @@ cmll128_t4_ctr32_encrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L128_ctr32_blk_loop2x
bgu,pt SIZE_T_CC, .L128_ctr32_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2
@ -2071,7 +2073,7 @@ cmll128_t4_ctr32_encrypt:
.globl cmll256_t4_ctr32_encrypt
.align 32
cmll256_t4_ctr32_encrypt:
save %sp, -112, %sp
save %sp, -STACK_FRAME, %sp
srln %i2, 0, %i2 ! needed on v8+, "nop" on v9
prefetch [%i0], 20
@ -2101,7 +2103,7 @@ cmll256_t4_ctr32_encrypt:
and %i1, 7, %l2
cmp %i2, 255
movrnz %l2, 0, %l5 ! if ( %i1&7 ||
movleu %icc, 0, %l5 ! %i2<256 ||
movleu SIZE_T_CC, 0, %l5 ! %i2<256 ||
brnz,pn %l5, .L256_ctr32_blk ! %i0==%i1)
srl %l3, %l2, %l3
@ -2320,7 +2322,7 @@ cmll256_t4_ctr32_encrypt:
stda %f4, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
add %i1, 8, %i1
stda %f6, [%i1]0xe2 ! ASI_BLK_INIT, T4-specific
bgu,pt %icc, .L256_ctr32_blk_loop2x
bgu,pt SIZE_T_CC, .L256_ctr32_blk_loop2x
add %i1, 8, %i1
add %l5, %i2, %i2

View File

@ -1,26 +1,9 @@
! des_enc.m4
! des_enc.S (generated from des_enc.m4)
! Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
!
! UltraSPARC assembler version of the LibDES/SSLeay/OpenSSL des_enc.c file.
!
! Version 1.0. 32-bit version.
!
! June 8, 2000.
!
! Version 2.0. 32/64-bit, PIC-ification, blended CPU adaptation
! by Andy Polyakov.
!
! January 1, 2003.
!
! Assembler version: Copyright Svend Olaf Mikkelsen.
!
! Original C code: Copyright Eric A. Young.
!
! This code can be freely used by LibDES/SSLeay/OpenSSL users.
!
! The LibDES/SSLeay/OpenSSL copyright notices must be respected.
!
! This version can be redistributed.
! Licensed under the OpenSSL license (the "License"). You may not use
! this file except in compliance with the License. You can obtain a copy
! in the file LICENSE in the source distribution or at
! https://www.openssl.org/source/license.html
!
! To expand the m4 macros: m4 -B 8192 des_enc.m4 > des_enc.S
!
@ -48,6 +31,10 @@
#include <openssl/opensslconf.h>
#ifdef OPENSSL_FIPSCANISTER
#include <openssl/fipssyms.h>
#endif
#if defined(__SUNPRO_C) && defined(__sparcv9)
# define ABI64 /* They've said -xarch=v9 at command line */
#elif defined(__GNUC__) && defined(__arch64__)
@ -63,9 +50,6 @@
# define STPTR stx
# define ARG0 128
# define ARGSZ 8
# ifndef __sparc_v9__
# define __sparc_v9__
# endif
#else
# define FRAME -96
# define BIAS 0
@ -159,7 +143,7 @@
! other half (use).
!
! In this version we do two rounds in a loop repeated 7 times
! and two rounds seperately.
! and two rounds separately.
!
! One half has the bits for the sboxes in the following positions:
!
@ -430,11 +414,7 @@
xor out5, local1, out5 ! 1 finished
xor out5, local2, out5 ! 3 finished
#ifdef __sparc_v9__
bne,pt %icc, .des_enc.1
#else
bne .des_enc.1
#endif
and local4, 252, local1 ! sbox 1 next round
! two rounds more:
@ -688,11 +668,7 @@
xor in5, local1, in5 ! 1 finished
xor in5, local2, in5 ! 3 finished
#ifdef __sparc_v9__
bne,pt %icc, .des_dec.1
#else
bne .des_dec.1
#endif
and local4, 252, local1 ! sbox 1 next round
! two rounds more:
@ -829,11 +805,7 @@ DES_encrypt1:
ld [in0], in5 ! left
cmp in2, 0 ! enc
#ifdef __sparc_v9__
be,pn %icc, .encrypt.dec ! enc/dec
#else
be .encrypt.dec
#endif
ld [in0+4], out5 ! right
! parameter 6 1/2 for include encryption/decryption
@ -1061,11 +1033,7 @@ DES_encrypt1:
xor out5, local1, out5 ! 1 finished
xor out5, local2, out5 ! 3 finished
#ifdef __sparc_v9__
bne,pt %icc, .des_encrypt1.1
#else
bne .des_encrypt1.1
#endif
and local4, 252, local1 ! sbox 1 next round
! two rounds more:
@ -1505,11 +1473,7 @@ DES_encrypt2:
! we use our own stackframe
#ifdef __sparc_v9__
be,pn %icc, .encrypt2.dec ! decryption
#else
be .encrypt2.dec
#endif
STPTR in0, [%sp+BIAS+ARG0+0*ARGSZ]
ld [in3], out0 ! key 7531 first round
@ -2033,11 +1997,7 @@ DES_ncbc_encrypt:
cmp in5, 0 ! enc
#ifdef __sparc_v9__
be,pn %icc, .ncbc.dec
#else
be .ncbc.dec
#endif
STPTR in4, [%sp+BIAS+ARG0+4*ARGSZ]
! addr left right temp label
@ -2048,18 +2008,6 @@ DES_ncbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc in4, 3, global0
bne,pn %icc, .LLE1
nop
lda [in4] 0x88, in5
add in4, 4, local3
ba,pt %icc, .LLE1a
lda [local3] 0x88, out5
#endif
.LLE1:
ldub [in4+3], in5
@ -2095,11 +2043,7 @@ DES_ncbc_encrypt:
addcc in2, -8, in2 ! bytes missing when first block done
#ifdef __sparc_v9__
bl,pn %icc, .ncbc.enc.seven.or.less
#else
bl .ncbc.enc.seven.or.less
#endif
mov in3, in4 ! schedule
.ncbc.enc.next.block:
@ -2111,18 +2055,6 @@ DES_ncbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc in0, 3, global0
bne,pn %icc, .LLE2
nop
lda [in0] 0x88, out4
add in0, 4, local3
ba,pt %icc, .LLE2a
lda [local3] 0x88, global4
#endif
.LLE2:
ldub [in0+3], out4
@ -2390,11 +2322,7 @@ DES_ncbc_encrypt:
xor out5, local1, out5 ! 1 finished
xor out5, local2, out5 ! 3 finished
#ifdef __sparc_v9__
bne,pt %icc, .ncbc.enc.1
#else
bne .ncbc.enc.1
#endif
and local4, 252, local1 ! sbox 1 next round
! two rounds more:
@ -2509,11 +2437,7 @@ DES_ncbc_encrypt:
xor out5, local2, out5
! include encryption ks in3
#ifdef __sparc_v9__
bl,pn %icc, .ncbc.enc.next.block_fp
#else
bl .ncbc.enc.next.block_fp
#endif
add in0, 8, in0 ! input address
! If 8 or more bytes are to be encrypted after this block,
@ -2527,18 +2451,6 @@ DES_ncbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc in0, 3, global0
bne,pn %icc, .LLE12
nop
lda [in0] 0x88, global3
add in0, 4, local5
ba,pt %icc, .LLE12a
lda [local5] 0x88, global4
#endif
.LLE12:
ldub [in0+3], global3
@ -2712,18 +2624,6 @@ DES_ncbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc in1, 3, global0
bne,pn %icc, .SLE10
nop
sta out0, [in1] 0x88
add in1, 4, local3
ba,pt %icc, .SLE10a
sta out1, [local3] 0x88
#endif
.SLE10:
and out0, 255, local3
stub local3, [in1+0]
@ -2767,7 +2667,7 @@ DES_ncbc_encrypt:
xor global4, local1, out5 ! iv xor next block
ba .ncbc.enc.next.block_2
add in1, 8, in1 ! output adress
add in1, 8, in1 ! output address
.ncbc.enc.next.block_fp:
@ -2854,18 +2754,6 @@ DES_ncbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc in1, 3, global0
bne,pn %icc, .SLE1
nop
sta in5, [in1] 0x88
add in1, 4, local3
ba,pt %icc, .SLE1a
sta out5, [local3] 0x88
#endif
.SLE1:
and in5, 255, local3
stub local3, [in1+0]
@ -2902,22 +2790,14 @@ DES_ncbc_encrypt:
addcc in2, -8, in2 ! bytes missing when next block done
#ifdef __sparc_v9__
bpos,pt %icc, .ncbc.enc.next.block ! also jumps if 0
#else
bpos .ncbc.enc.next.block
#endif
add in1, 8, in1
.ncbc.enc.seven.or.less:
cmp in2, -8
#ifdef __sparc_v9__
ble,pt %icc, .ncbc.enc.finish
#else
ble .ncbc.enc.finish
#endif
nop
add in2, 8, local1 ! bytes to load
@ -2995,18 +2875,6 @@ DES_ncbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc local4, 3, global0
bne,pn %icc, .SLE2
nop
sta in5, [local4] 0x88
add local4, 4, local5
ba,pt %icc, .SLE2a
sta out5, [local5] 0x88
#endif
.SLE2:
and in5, 255, local5
stub local5, [local4+0]
@ -3052,11 +2920,7 @@ DES_ncbc_encrypt:
add in3, 120, in3
LDPTR [%sp+BIAS+ARG0+4*ARGSZ] , local7 ! ivec
#ifdef __sparc_v9__
ble,pn %icc, .ncbc.dec.finish
#else
ble .ncbc.dec.finish
#endif
mov in3, in4 ! schedule
STPTR in1, [%sp+BIAS+ARG0+1*ARGSZ]
@ -3069,18 +2933,6 @@ DES_ncbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local7, 3, global0
bne,pn %icc, .LLE3
nop
lda [local7] 0x88, in0
add local7, 4, local3
ba,pt %icc, .LLE3a
lda [local3] 0x88, in1
#endif
.LLE3:
ldub [local7+3], in0
@ -3123,18 +2975,6 @@ DES_ncbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local5, 3, global0
bne,pn %icc, .LLE4
nop
lda [local5] 0x88, in5
add local5, 4, local3
ba,pt %icc, .LLE4a
lda [local3] 0x88, out5
#endif
.LLE4:
ldub [local5+3], in5
@ -3354,11 +3194,7 @@ DES_ncbc_encrypt:
! in2 is compared to 8 in the rounds
xor out5, in0, out4 ! iv xor
#ifdef __sparc_v9__
bl,pn %icc, .ncbc.dec.seven.or.less
#else
bl .ncbc.dec.seven.or.less
#endif
xor in5, in1, global4 ! iv xor
! Load ivec next block now, since input and output address might be the same.
@ -3370,19 +3206,6 @@ DES_ncbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local5, 3, global0
bne,pn %icc, .LLE5
nop
lda [local5] 0x88, in0
add local5, 4, local5
lda [local5] 0x88, in1
ba,pt %icc, .LLE5a
add local5, 4, local5
#endif
.LLE5:
ldub [local5+3], in0
@ -3423,18 +3246,6 @@ DES_ncbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc local7, 3, global0
bne,pn %icc, .SLE3
nop
sta out4, [local7] 0x88
add local7, 4, local3
ba,pt %icc, .SLE3a
sta global4, [local3] 0x88
#endif
.SLE3:
and out4, 255, local3
stub local3, [local7+0]
@ -3473,11 +3284,7 @@ DES_ncbc_encrypt:
add local7, 8, local7
addcc in2, -8, in2
#ifdef __sparc_v9__
bg,pt %icc, .ncbc.dec.next.block
#else
bg .ncbc.dec.next.block
#endif
STPTR local7, [%sp+BIAS+ARG0+1*ARGSZ]
@ -3491,18 +3298,6 @@ DES_ncbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc local4, 3, global0
bne,pn %icc, .SLE4
nop
sta in0, [local4] 0x88
add local4, 4, local5
ba,pt %icc, .SLE4a
sta in1, [local5] 0x88
#endif
.SLE4:
and in0, 255, local5
stub local5, [local4+0]
@ -3551,19 +3346,6 @@ DES_ncbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local5, 3, global0
bne,pn %icc, .LLE13
nop
lda [local5] 0x88, in0
add local5, 4, local5
lda [local5] 0x88, in1
ba,pt %icc, .LLE13a
add local5, 4, local5
#endif
.LLE13:
ldub [local5+3], in0
@ -3688,11 +3470,7 @@ DES_ede3_cbc_encrypt:
LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local4 ! ivec
cmp local3, 0 ! enc
#ifdef __sparc_v9__
be,pn %icc, .ede3.dec
#else
be .ede3.dec
#endif
STPTR in4, [%sp+BIAS+ARG0+4*ARGSZ]
STPTR in5, [%sp+BIAS+ARG0+5*ARGSZ]
@ -3704,18 +3482,6 @@ DES_ede3_cbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local4, 3, global0
bne,pn %icc, .LLE6
nop
lda [local4] 0x88, in5
add local4, 4, local3
ba,pt %icc, .LLE6a
lda [local3] 0x88, out5
#endif
.LLE6:
ldub [local4+3], in5
@ -3751,11 +3517,7 @@ DES_ede3_cbc_encrypt:
addcc in2, -8, in2 ! bytes missing after next block
#ifdef __sparc_v9__
bl,pn %icc, .ede3.enc.seven.or.less
#else
bl .ede3.enc.seven.or.less
#endif
STPTR in3, [%sp+BIAS+ARG0+3*ARGSZ]
.ede3.enc.next.block:
@ -3767,18 +3529,6 @@ DES_ede3_cbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc in0, 3, global0
bne,pn %icc, .LLE7
nop
lda [in0] 0x88, out4
add in0, 4, local3
ba,pt %icc, .LLE7a
lda [local3] 0x88, global4
#endif
.LLE7:
ldub [in0+3], out4
@ -3926,11 +3676,7 @@ DES_ede3_cbc_encrypt:
call .des_enc ! ks3 in3 compares in2 to 8
nop
#ifdef __sparc_v9__
bl,pn %icc, .ede3.enc.next.block_fp
#else
bl .ede3.enc.next.block_fp
#endif
add in0, 8, in0
! If 8 or more bytes are to be encrypted after this block,
@ -3944,18 +3690,6 @@ DES_ede3_cbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc in0, 3, global0
bne,pn %icc, .LLE11
nop
lda [in0] 0x88, global3
add in0, 4, local5
ba,pt %icc, .LLE11a
lda [local5] 0x88, global4
#endif
.LLE11:
ldub [in0+3], global3
@ -4129,18 +3863,6 @@ DES_ede3_cbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc in1, 3, global0
bne,pn %icc, .SLE9
nop
sta out0, [in1] 0x88
add in1, 4, local3
ba,pt %icc, .SLE9a
sta out1, [local3] 0x88
#endif
.SLE9:
and out0, 255, local3
stub local3, [in1+0]
@ -4272,18 +3994,6 @@ DES_ede3_cbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc in1, 3, global0
bne,pn %icc, .SLE5
nop
sta in5, [in1] 0x88
add in1, 4, local3
ba,pt %icc, .SLE5a
sta out5, [local3] 0x88
#endif
.SLE5:
and in5, 255, local3
stub local3, [in1+0]
@ -4320,22 +4030,14 @@ DES_ede3_cbc_encrypt:
addcc in2, -8, in2 ! bytes missing when next block done
#ifdef __sparc_v9__
bpos,pt %icc, .ede3.enc.next.block
#else
bpos .ede3.enc.next.block
#endif
add in1, 8, in1
.ede3.enc.seven.or.less:
cmp in2, -8
#ifdef __sparc_v9__
ble,pt %icc, .ede3.enc.finish
#else
ble .ede3.enc.finish
#endif
nop
add in2, 8, local1 ! bytes to load
@ -4410,18 +4112,6 @@ DES_ede3_cbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc local4, 3, global0
bne,pn %icc, .SLE6
nop
sta in5, [local4] 0x88
add local4, 4, local5
ba,pt %icc, .SLE6a
sta out5, [local5] 0x88
#endif
.SLE6:
and in5, 255, local5
stub local5, [local4+0]
@ -4471,11 +4161,7 @@ DES_ede3_cbc_encrypt:
STPTR in3, [%sp+BIAS+ARG0+3*ARGSZ]
cmp in2, 0
#ifdef __sparc_v9__
ble %icc, .ede3.dec.finish
#else
ble .ede3.dec.finish
#endif
STPTR in5, [%sp+BIAS+ARG0+5*ARGSZ]
LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local7 ! iv
@ -4486,18 +4172,6 @@ DES_ede3_cbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local7, 3, global0
bne,pn %icc, .LLE8
nop
lda [local7] 0x88, in0
add local7, 4, local3
ba,pt %icc, .LLE8a
lda [local3] 0x88, in1
#endif
.LLE8:
ldub [local7+3], in0
@ -4540,18 +4214,6 @@ DES_ede3_cbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local5, 3, global0
bne,pn %icc, .LLE9
nop
lda [local5] 0x88, in5
add local5, 4, local3
ba,pt %icc, .LLE9a
lda [local3] 0x88, out5
#endif
.LLE9:
ldub [local5+3], in5
@ -4778,11 +4440,7 @@ DES_ede3_cbc_encrypt:
! in2 is compared to 8 in the rounds
xor out5, in0, out4
#ifdef __sparc_v9__
bl,pn %icc, .ede3.dec.seven.or.less
#else
bl .ede3.dec.seven.or.less
#endif
xor in5, in1, global4
@ -4792,19 +4450,6 @@ DES_ede3_cbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local5, 3, global0
bne,pn %icc, .LLE10
nop
lda [local5] 0x88, in0
add local5, 4, local5
lda [local5] 0x88, in1
ba,pt %icc, .LLE10a
add local5, 4, local5
#endif
.LLE10:
ldub [local5+3], in0
@ -4845,18 +4490,6 @@ DES_ede3_cbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc local7, 3, global0
bne,pn %icc, .SLE7
nop
sta out4, [local7] 0x88
add local7, 4, local3
ba,pt %icc, .SLE7a
sta global4, [local3] 0x88
#endif
.SLE7:
and out4, 255, local3
stub local3, [local7+0]
@ -4895,11 +4528,7 @@ DES_ede3_cbc_encrypt:
addcc in2, -8, in2
add local7, 8, local7
#ifdef __sparc_v9__
bg,pt %icc, .ede3.dec.next.block
#else
bg .ede3.dec.next.block
#endif
STPTR local7, [%sp+BIAS+ARG0+1*ARGSZ]
.ede3.dec.store.iv:
@ -4912,18 +4541,6 @@ DES_ede3_cbc_encrypt:
! rightmost in register to first in memory
#ifdef __sparc_v9__
andcc local4, 3, global0
bne,pn %icc, .SLE8
nop
sta in0, [local4] 0x88
add local4, 4, local5
ba,pt %icc, .SLE8a
sta in1, [local5] 0x88
#endif
.SLE8:
and in0, 255, local5
stub local5, [local4+0]
@ -4972,19 +4589,6 @@ DES_ede3_cbc_encrypt:
! first in memory to rightmost in register
#ifdef __sparc_v9__
andcc local5, 3, global0
bne,pn %icc, .LLE14
nop
lda [local5] 0x88, in0
add local5, 4, local5
lda [local5] 0x88, in1
ba,pt %icc, .LLE14a
add local5, 4, local5
#endif
.LLE14:
ldub [local5+3], in0

View File

@ -1,3 +1,10 @@
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.text
.align 32
.globl des_t4_key_expand
@ -47,7 +54,7 @@ des_t4_key_expand:
.align 32
des_t4_cbc_encrypt:
cmp %o2, 0
be,pn %icc, .Lcbc_abort
be,pn SIZE_T_CC, .Lcbc_abort
srln %o2, 0, %o2 ! needed on v8+, "nop" on v9
ld [%o4 + 0], %f0 ! load ivec
ld [%o4 + 4], %f1
@ -148,7 +155,7 @@ des_t4_cbc_encrypt:
.align 32
des_t4_cbc_decrypt:
cmp %o2, 0
be,pn %icc, .Lcbc_abort
be,pn SIZE_T_CC, .Lcbc_abort
srln %o2, 0, %o2 ! needed on v8+, "nop" on v9
ld [%o4 + 0], %f2 ! load ivec
ld [%o4 + 4], %f3
@ -247,7 +254,7 @@ des_t4_cbc_decrypt:
.align 32
des_t4_ede3_cbc_encrypt:
cmp %o2, 0
be,pn %icc, .Lcbc_abort
be,pn SIZE_T_CC, .Lcbc_abort
srln %o2, 0, %o2 ! needed on v8+, "nop" on v9
ld [%o4 + 0], %f0 ! load ivec
ld [%o4 + 4], %f1
@ -399,7 +406,7 @@ des_t4_ede3_cbc_encrypt:
.align 32
des_t4_ede3_cbc_decrypt:
cmp %o2, 0
be,pn %icc, .Lcbc_abort
be,pn SIZE_T_CC, .Lcbc_abort
srln %o2, 0, %o2 ! needed on v8+, "nop" on v9
ld [%o4 + 0], %f2 ! load ivec
ld [%o4 + 4], %f3

View File

@ -0,0 +1,6 @@
.PATH.S: ${.PARSEDIR}
EC_SRCS += \
ecp_nistz256-sparcv9.S
ECNI = yes
.include "../../ec.inc"

File diff suppressed because it is too large

View File

@ -1,3 +1,10 @@
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 64
@ -12,7 +19,7 @@ rem_4bit:
.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
save %sp,-112,%sp
save %sp,-STACK_FRAME,%sp
ldub [%i2+15],%l1
ldub [%i0+15],%l2
ldub [%i0+14],%l3
@ -101,7 +108,7 @@ gcm_ghash_4bit:
add %i2,16,%i2
cmp %i2,%i3
be,pn %icc,.Ldone
be,pn SIZE_T_CC,.Ldone
and %o1,0xf,%l5
ldx [%l6+%l0],%o3
@ -147,7 +154,7 @@ gcm_ghash_4bit:
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
save %sp,-112,%sp
save %sp,-STACK_FRAME,%sp
ldub [%i0+15],%l1
add %i1,8,%l6
@ -249,7 +256,7 @@ gcm_gmult_4bit:
.globl gcm_init_vis3
.align 32
gcm_init_vis3:
save %sp,-112,%sp
save %sp,-STACK_FRAME,%sp
ldx [%i1+0],%o2
ldx [%i1+8],%o1
@ -282,7 +289,7 @@ gcm_init_vis3:
.globl gcm_gmult_vis3
.align 32
gcm_gmult_vis3:
save %sp,-112,%sp
save %sp,-STACK_FRAME,%sp
ldx [%i0+8],%o3 ! load Xi
ldx [%i0+0],%o4
@ -336,7 +343,7 @@ gcm_gmult_vis3:
.globl gcm_ghash_vis3
.align 32
gcm_ghash_vis3:
save %sp,-112,%sp
save %sp,-STACK_FRAME,%sp
nop
srln %i3,0,%i3 ! needed on v8+, "nop" on v9

View File

@ -0,0 +1,946 @@
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
# define STPTR stx
# define SIZE_T 8
#else
# define STPTR st
# define SIZE_T 4
#endif
#define LOCALS (STACK_BIAS+STACK_FRAME)
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl poly1305_init
.align 32
poly1305_init:
save %sp,-STACK_FRAME-16,%sp
nop
SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
ld [%g1],%g1
and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
cmp %g1,SPARCV9_FMADD
be .Lpoly1305_init_fma
nop
stx %g0,[%i0+0]
stx %g0,[%i0+8] ! zero hash value
brz,pn %i1,.Lno_key
stx %g0,[%i0+16]
and %i1,7,%i5 ! alignment factor
andn %i1,7,%i1
sll %i5,3,%i5 ! *8
neg %i5,%i4
sethi %hi(0x0ffffffc),%o4
set 8,%o1
or %o4,%lo(0x0ffffffc),%o4
set 16,%o2
sllx %o4,32,%o5
or %o4,%o5,%o5 ! 0x0ffffffc0ffffffc
or %o5,3,%o4 ! 0x0ffffffc0fffffff
ldxa [%i1+%g0]0x88,%o0 ! load little-endian key
brz,pt %i5,.Lkey_aligned
ldxa [%i1+%o1]0x88,%o1
ldxa [%i1+%o2]0x88,%o2
srlx %o0,%i5,%o0
sllx %o1,%i4,%o7
srlx %o1,%i5,%o1
or %o7,%o0,%o0
sllx %o2,%i4,%o2
or %o2,%o1,%o1
.Lkey_aligned:
and %o4,%o0,%o0
and %o5,%o1,%o1
stx %o0,[%i0+32+0] ! store key
stx %o1,[%i0+32+8]
andcc %g1,SPARCV9_VIS3,%g0
be .Lno_key
nop
1: call .+8
add %o7,poly1305_blocks_vis3-1b,%o7
add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
STPTR %o7,[%i2]
STPTR %o5,[%i2+SIZE_T]
ret
restore %g0,1,%o0 ! return 1
.Lno_key:
ret
restore %g0,%g0,%o0 ! return 0
.type poly1305_init,#function
.size poly1305_init,.-poly1305_init
.globl poly1305_blocks
.align 32
poly1305_blocks:
save %sp,-STACK_FRAME,%sp
srln %i2,4,%i2
brz,pn %i2,.Lno_data
nop
ld [%i0+32+0],%l1 ! load key
ld [%i0+32+4],%l0
ld [%i0+32+8],%l3
ld [%i0+32+12],%l2
ld [%i0+0],%o1 ! load hash value
ld [%i0+4],%o0
ld [%i0+8],%o3
ld [%i0+12],%o2
ld [%i0+16],%l7
and %i1,7,%i5 ! alignment factor
andn %i1,7,%i1
set 8,%g2
sll %i5,3,%i5 ! *8
set 16,%g3
neg %i5,%i4
srl %l1,2,%l4
srl %l2,2,%l5
add %l1,%l4,%l4
srl %l3,2,%l6
add %l2,%l5,%l5
add %l3,%l6,%l6
.Loop:
ldxa [%i1+%g0]0x88,%g1 ! load little-endian input
brz,pt %i5,.Linp_aligned
ldxa [%i1+%g2]0x88,%g2
ldxa [%i1+%g3]0x88,%g3
srlx %g1,%i5,%g1
sllx %g2,%i4,%o5
srlx %g2,%i5,%g2
or %o5,%g1,%g1
sllx %g3,%i4,%g3
or %g3,%g2,%g2
.Linp_aligned:
srlx %g1,32,%o4
addcc %g1,%o0,%o0 ! accumulate input
srlx %g2,32,%o5
addccc %o4,%o1,%o1
addccc %g2,%o2,%o2
addccc %o5,%o3,%o3
addc %i3,%l7,%l7
umul %l0,%o0,%g1
umul %l1,%o0,%g2
umul %l2,%o0,%g3
umul %l3,%o0,%g4
sub %i2,1,%i2
add %i1,16,%i1
umul %l6,%o1,%o4
umul %l0,%o1,%o5
umul %l1,%o1,%o7
add %o4,%g1,%g1
add %o5,%g2,%g2
umul %l2,%o1,%o4
add %o7,%g3,%g3
add %o4,%g4,%g4
umul %l5,%o2,%o5
umul %l6,%o2,%o7
umul %l0,%o2,%o4
add %o5,%g1,%g1
add %o7,%g2,%g2
umul %l1,%o2,%o5
add %o4,%g3,%g3
add %o5,%g4,%g4
umul %l4,%o3,%o7
umul %l5,%o3,%o4
umul %l6,%o3,%o5
add %o7,%g1,%g1
add %o4,%g2,%g2
umul %l0,%o3,%o7
add %o5,%g3,%g3
add %o7,%g4,%g4
umul %l4,%l7,%o4
umul %l5,%l7,%o5
umul %l6,%l7,%o7
umul %l0,%l7,%l7
add %o4,%g2,%g2
add %o5,%g3,%g3
srlx %g1,32,%o1
add %o7,%g4,%g4
srlx %g2,32,%o2
addcc %g2,%o1,%o1
srlx %g3,32,%o3
set 8,%g2
addccc %g3,%o2,%o2
srlx %g4,32,%o4
set 16,%g3
addccc %g4,%o3,%o3
addc %o4,%l7,%l7
srl %l7,2,%o4 ! final reduction step
andn %l7,3,%o5
and %l7,3,%l7
add %o5,%o4,%o4
addcc %o4,%g1,%o0
addccc %g0,%o1,%o1
addccc %g0,%o2,%o2
addccc %g0,%o3,%o3
brnz,pt %i2,.Loop
addc %g0,%l7,%l7
st %o1,[%i0+0] ! store hash value
st %o0,[%i0+4]
st %o3,[%i0+8]
st %o2,[%i0+12]
st %l7,[%i0+16]
.Lno_data:
ret
restore
.type poly1305_blocks,#function
.size poly1305_blocks,.-poly1305_blocks
.align 32
poly1305_blocks_vis3:
save %sp,-STACK_FRAME,%sp
srln %i2,4,%i2
brz,pn %i2,.Lno_data
nop
ldx [%i0+32+0],%o3 ! load key
ldx [%i0+32+8],%o4
ldx [%i0+0],%o0 ! load hash value
ldx [%i0+8],%o1
ld [%i0+16],%o2
and %i1,7,%i5 ! alignment factor
andn %i1,7,%i1
set 8,%l1
sll %i5,3,%i5 ! *8
set 16,%l2
neg %i5,%i4
srlx %o4,2,%o5
b .Loop_vis3
add %o4,%o5,%o5
.Loop_vis3:
ldxa [%i1+%g0]0x88,%g1 ! load little-endian input
brz,pt %i5,.Linp_aligned_vis3
ldxa [%i1+%l1]0x88,%g2
ldxa [%i1+%l2]0x88,%g3
srlx %g1,%i5,%g1
sllx %g2,%i4,%o7
srlx %g2,%i5,%g2
or %o7,%g1,%g1
sllx %g3,%i4,%g3
or %g3,%g2,%g2
.Linp_aligned_vis3:
addcc %g1,%o0,%o0 ! accumulate input
sub %i2,1,%i2
.word 0x93b08269 !addxccc %g2,%o1,%o1
add %i1,16,%i1
mulx %o3,%o0,%g1 ! r0*h0
.word 0x95b6c22a !addxc %i3,%o2,%o2
.word 0x85b2c2c8 !umulxhi %o3,%o0,%g2
mulx %o5,%o1,%g4 ! s1*h1
.word 0x9fb342c9 !umulxhi %o5,%o1,%o7
addcc %g4,%g1,%g1
mulx %o4,%o0,%g4 ! r1*h0
.word 0x85b3c222 !addxc %o7,%g2,%g2
.word 0x87b302c8 !umulxhi %o4,%o0,%g3
addcc %g4,%g2,%g2
mulx %o3,%o1,%g4 ! r0*h1
.word 0x87b00223 !addxc %g0,%g3,%g3
.word 0x9fb2c2c9 !umulxhi %o3,%o1,%o7
addcc %g4,%g2,%g2
mulx %o5,%o2,%g4 ! s1*h2
.word 0x87b3c223 !addxc %o7,%g3,%g3
mulx %o3,%o2,%o7 ! r0*h2
addcc %g4,%g2,%g2
.word 0x87b3c223 !addxc %o7,%g3,%g3
srlx %g3,2,%g4 ! final reduction step
andn %g3,3,%o7
and %g3,3,%o2
add %o7,%g4,%g4
addcc %g4,%g1,%o0
.word 0x93b00262 !addxccc %g0,%g2,%o1
brnz,pt %i2,.Loop_vis3
.word 0x95b0022a !addxc %g0,%o2,%o2
stx %o0,[%i0+0] ! store hash value
stx %o1,[%i0+8]
st %o2,[%i0+16]
ret
restore
.type poly1305_blocks_vis3,#function
.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
.globl poly1305_emit
.align 32
poly1305_emit:
save %sp,-STACK_FRAME,%sp
ld [%i0+0],%o1 ! load hash value
ld [%i0+4],%o0
ld [%i0+8],%o3
ld [%i0+12],%o2
ld [%i0+16],%l7
addcc %o0,5,%l0 ! compare to modulus
addccc %o1,0,%l1
addccc %o2,0,%l2
addccc %o3,0,%l3
addc %l7,0,%l7
andcc %l7,4,%g0 ! did it carry/borrow?
movnz %icc,%l0,%o0
ld [%i2+0],%l0 ! load nonce
movnz %icc,%l1,%o1
ld [%i2+4],%l1
movnz %icc,%l2,%o2
ld [%i2+8],%l2
movnz %icc,%l3,%o3
ld [%i2+12],%l3
addcc %l0,%o0,%o0 ! accumulate nonce
addccc %l1,%o1,%o1
addccc %l2,%o2,%o2
addc %l3,%o3,%o3
srl %o0,8,%l0
stb %o0,[%i1+0] ! store little-endian result
srl %o0,16,%l1
stb %l0,[%i1+1]
srl %o0,24,%l2
stb %l1,[%i1+2]
stb %l2,[%i1+3]
srl %o1,8,%l0
stb %o1,[%i1+4]
srl %o1,16,%l1
stb %l0,[%i1+5]
srl %o1,24,%l2
stb %l1,[%i1+6]
stb %l2,[%i1+7]
srl %o2,8,%l0
stb %o2,[%i1+8]
srl %o2,16,%l1
stb %l0,[%i1+9]
srl %o2,24,%l2
stb %l1,[%i1+10]
stb %l2,[%i1+11]
srl %o3,8,%l0
stb %o3,[%i1+12]
srl %o3,16,%l1
stb %l0,[%i1+13]
srl %o3,24,%l2
stb %l1,[%i1+14]
stb %l2,[%i1+15]
ret
restore
.type poly1305_emit,#function
.size poly1305_emit,.-poly1305_emit
.align 32
poly1305_init_fma:
save %sp,-STACK_FRAME-16,%sp
nop
.Lpoly1305_init_fma:
1: call .+8
add %o7,.Lconsts_fma-1b,%o7
ldd [%o7+8*0],%f16 ! load constants
ldd [%o7+8*1],%f18
ldd [%o7+8*2],%f20
ldd [%o7+8*3],%f22
ldd [%o7+8*5],%f26
std %f16,[%i0+8*0] ! initial hash value, biased 0
std %f18,[%i0+8*1]
std %f20,[%i0+8*2]
std %f22,[%i0+8*3]
brz,pn %i1,.Lno_key_fma
nop
stx %fsr,[%sp+LOCALS] ! save original %fsr
ldx [%o7+8*6],%fsr ! load new %fsr
std %f16,[%i0+8*4] ! key "template"
std %f18,[%i0+8*5]
std %f20,[%i0+8*6]
std %f22,[%i0+8*7]
and %i1,7,%l2
andn %i1,7,%i1 ! align pointer
mov 8,%l0
sll %l2,3,%l2
mov 16,%l1
neg %l2,%l3
ldxa [%i1+%g0]0x88,%o0 ! load little-endian key
ldxa [%i1+%l0]0x88,%o2
brz %l2,.Lkey_aligned_fma
sethi %hi(0xf0000000),%l0 ! 0xf0000000
ldxa [%i1+%l1]0x88,%o4
srlx %o0,%l2,%o0 ! align data
sllx %o2,%l3,%o1
srlx %o2,%l2,%o2
or %o1,%o0,%o0
sllx %o4,%l3,%o3
or %o3,%o2,%o2
.Lkey_aligned_fma:
or %l0,3,%l1 ! 0xf0000003
srlx %o0,32,%o1
andn %o0,%l0,%o0 ! &=0x0fffffff
andn %o1,%l1,%o1 ! &=0x0ffffffc
srlx %o2,32,%o3
andn %o2,%l1,%o2
andn %o3,%l1,%o3
st %o0,[%i0+36] ! fill "template"
st %o1,[%i0+44]
st %o2,[%i0+52]
st %o3,[%i0+60]
ldd [%i0+8*4],%f0 ! load [biased] key
ldd [%i0+8*5],%f4
ldd [%i0+8*6],%f8
ldd [%i0+8*7],%f12
fsubd %f0,%f16, %f0 ! r0
ldd [%o7+8*7],%f16 ! more constants
fsubd %f4,%f18,%f4 ! r1
ldd [%o7+8*8],%f18
fsubd %f8,%f20,%f8 ! r2
ldd [%o7+8*9],%f20
fsubd %f12,%f22,%f12 ! r3
ldd [%o7+8*10],%f22
fmuld %f26,%f4,%f52 ! s1
fmuld %f26,%f8,%f40 ! s2
fmuld %f26,%f12,%f44 ! s3
faddd %f0,%f16, %f2
faddd %f4,%f18,%f6
faddd %f8,%f20,%f10
faddd %f12,%f22,%f14
fsubd %f2,%f16, %f2
ldd [%o7+8*11],%f16 ! more constants
fsubd %f6,%f18,%f6
ldd [%o7+8*12],%f18
fsubd %f10,%f20,%f10
ldd [%o7+8*13],%f20
fsubd %f14,%f22,%f14
fsubd %f0,%f2,%f0
std %f2,[%i0+8*5] ! r0hi
fsubd %f4,%f6,%f4
std %f6,[%i0+8*7] ! r1hi
fsubd %f8,%f10,%f8
std %f10,[%i0+8*9] ! r2hi
fsubd %f12,%f14,%f12
std %f14,[%i0+8*11] ! r3hi
faddd %f52,%f16, %f54
faddd %f40,%f18,%f42
faddd %f44,%f20,%f46
fsubd %f54,%f16, %f54
fsubd %f42,%f18,%f42
fsubd %f46,%f20,%f46
fsubd %f52,%f54,%f52
fsubd %f40,%f42,%f40
fsubd %f44,%f46,%f44
ldx [%sp+LOCALS],%fsr ! restore %fsr
std %f0,[%i0+8*4] ! r0lo
std %f4,[%i0+8*6] ! r1lo
std %f8,[%i0+8*8] ! r2lo
std %f12,[%i0+8*10] ! r3lo
std %f54,[%i0+8*13]
std %f42,[%i0+8*15]
std %f46,[%i0+8*17]
std %f52,[%i0+8*12]
std %f40,[%i0+8*14]
std %f44,[%i0+8*16]
add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
STPTR %o0,[%i2]
STPTR %o1,[%i2+SIZE_T]
ret
restore %g0,1,%o0 ! return 1
.Lno_key_fma:
ret
restore %g0,%g0,%o0 ! return 0
.type poly1305_init_fma,#function
.size poly1305_init_fma,.-poly1305_init_fma
.align 32
poly1305_blocks_fma:
save %sp,-STACK_FRAME-48,%sp
srln %i2,4,%i2
brz,pn %i2,.Labort
sub %i2,1,%i2
1: call .+8
add %o7,.Lconsts_fma-1b,%o7
ldd [%o7+8*0],%f16 ! load constants
ldd [%o7+8*1],%f18
ldd [%o7+8*2],%f20
ldd [%o7+8*3],%f22
ldd [%o7+8*4],%f24
ldd [%o7+8*5],%f26
ldd [%i0+8*0],%f0 ! load [biased] hash value
ldd [%i0+8*1],%f4
ldd [%i0+8*2],%f8
ldd [%i0+8*3],%f12
std %f16,[%sp+LOCALS+8*0] ! input "template"
sethi %hi((1023+52+96)<<20),%o3
std %f18,[%sp+LOCALS+8*1]
or %i3,%o3,%o3
std %f20,[%sp+LOCALS+8*2]
st %o3,[%sp+LOCALS+8*3]
and %i1,7,%l2
andn %i1,7,%i1 ! align pointer
mov 8,%l0
sll %l2,3,%l2
mov 16,%l1
neg %l2,%l3
ldxa [%i1+%g0]0x88,%o0 ! load little-endian input
brz %l2,.Linp_aligned_fma
ldxa [%i1+%l0]0x88,%o2
ldxa [%i1+%l1]0x88,%o4
add %i1,8,%i1
srlx %o0,%l2,%o0 ! align data
sllx %o2,%l3,%o1
srlx %o2,%l2,%o2
or %o1,%o0,%o0
sllx %o4,%l3,%o3
srlx %o4,%l2,%o4 ! pre-shift
or %o3,%o2,%o2
.Linp_aligned_fma:
srlx %o0,32,%o1
movrz %i2,0,%l1
srlx %o2,32,%o3
add %l1,%i1,%i1 ! conditional advance
st %o0,[%sp+LOCALS+8*0+4] ! fill "template"
st %o1,[%sp+LOCALS+8*1+4]
st %o2,[%sp+LOCALS+8*2+4]
st %o3,[%sp+LOCALS+8*3+4]
ldd [%i0+8*4],%f28 ! load key
ldd [%i0+8*5],%f30
ldd [%i0+8*6],%f32
ldd [%i0+8*7],%f34
ldd [%i0+8*8],%f36
ldd [%i0+8*9],%f38
ldd [%i0+8*10],%f48
ldd [%i0+8*11],%f50
ldd [%i0+8*12],%f52
ldd [%i0+8*13],%f54
ldd [%i0+8*14],%f40
ldd [%i0+8*15],%f42
ldd [%i0+8*16],%f44
ldd [%i0+8*17],%f46
stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
ldx [%o7+8*6],%fsr ! load new %fsr
subcc %i2,1,%i2
movrz %i2,0,%l1
ldd [%sp+LOCALS+8*0],%f56 ! load biased input
ldd [%sp+LOCALS+8*1],%f58
ldd [%sp+LOCALS+8*2],%f60
ldd [%sp+LOCALS+8*3],%f62
fsubd %f0,%f16, %f0 ! de-bias hash value
fsubd %f4,%f18,%f4
ldxa [%i1+%g0]0x88,%o0 ! modulo-scheduled input load
fsubd %f8,%f20,%f8
fsubd %f12,%f22,%f12
ldxa [%i1+%l0]0x88,%o2
fsubd %f56,%f16, %f56 ! de-bias input
fsubd %f58,%f18,%f58
fsubd %f60,%f20,%f60
fsubd %f62,%f22,%f62
brz %l2,.Linp_aligned_fma2
add %l1,%i1,%i1 ! conditional advance
sllx %o0,%l3,%o1 ! align data
srlx %o0,%l2,%o3
or %o1,%o4,%o0
sllx %o2,%l3,%o1
srlx %o2,%l2,%o4 ! pre-shift
or %o3,%o1,%o2
.Linp_aligned_fma2:
srlx %o0,32,%o1
srlx %o2,32,%o3
faddd %f0,%f56,%f56 ! accumulate input
stw %o0,[%sp+LOCALS+8*0+4]
faddd %f4,%f58,%f58
stw %o1,[%sp+LOCALS+8*1+4]
faddd %f8,%f60,%f60
stw %o2,[%sp+LOCALS+8*2+4]
faddd %f12,%f62,%f62
stw %o3,[%sp+LOCALS+8*3+4]
b .Lentry_fma
nop
.align 16
.Loop_fma:
ldxa [%i1+%g0]0x88,%o0 ! modulo-scheduled input load
ldxa [%i1+%l0]0x88,%o2
movrz %i2,0,%l1
faddd %f52,%f0,%f0 ! accumulate input
faddd %f54,%f2,%f2
faddd %f62,%f8,%f8
faddd %f60,%f10,%f10
brz,pn %l2,.Linp_aligned_fma3
add %l1,%i1,%i1 ! conditional advance
sllx %o0,%l3,%o1 ! align data
srlx %o0,%l2,%o3
or %o1,%o4,%o0
sllx %o2,%l3,%o1
srlx %o2,%l2,%o4 ! pre-shift
or %o3,%o1,%o2
.Linp_aligned_fma3:
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
faddd %f20,%f4,%f52
srlx %o0,32,%o1
faddd %f20,%f6,%f54
srlx %o2,32,%o3
faddd %f24,%f12,%f60
st %o0,[%sp+LOCALS+8*0+4] ! fill "template"
faddd %f24,%f14,%f62
st %o1,[%sp+LOCALS+8*1+4]
faddd %f18,%f0,%f48
st %o2,[%sp+LOCALS+8*2+4]
faddd %f18,%f2,%f50
st %o3,[%sp+LOCALS+8*3+4]
faddd %f22,%f8,%f56
faddd %f22,%f10,%f58
fsubd %f52,%f20,%f52
fsubd %f54,%f20,%f54
fsubd %f60,%f24,%f60
fsubd %f62,%f24,%f62
fsubd %f48,%f18,%f48
fsubd %f50,%f18,%f50
fsubd %f56,%f22,%f56
fsubd %f58,%f22,%f58
fsubd %f4,%f52,%f4
fsubd %f6,%f54,%f6
fsubd %f12,%f60,%f12
fsubd %f14,%f62,%f14
fsubd %f8,%f56,%f8
fsubd %f10,%f58,%f10
fsubd %f0,%f48,%f0
fsubd %f2,%f50,%f2
faddd %f4,%f48,%f4
faddd %f6,%f50,%f6
faddd %f12,%f56,%f12
faddd %f14,%f58,%f14
faddd %f8,%f52,%f8
faddd %f10,%f54,%f10
.word 0x81be805d !fmaddd %f26,%f60,%f0,%f0
.word 0x85be845f !fmaddd %f26,%f62,%f2,%f2
faddd %f4,%f6,%f58
ldd [%i0+8*12],%f52 ! reload constants
faddd %f12,%f14,%f62
ldd [%i0+8*13],%f54
faddd %f8,%f10,%f60
ldd [%i0+8*10],%f48
faddd %f0,%f2,%f56
ldd [%i0+8*11],%f50
.Lentry_fma:
fmuld %f58,%f44,%f0
fmuld %f58,%f46,%f2
fmuld %f58,%f32,%f8
fmuld %f58,%f34,%f10
fmuld %f58,%f28,%f4
fmuld %f58,%f30,%f6
fmuld %f58,%f36,%f12
fmuld %f58,%f38,%f14
.word 0x81bfc055 !fmaddd %f62,%f52,%f0,%f0
.word 0x85bfc457 !fmaddd %f62,%f54,%f2,%f2
.word 0x91bfd04d !fmaddd %f62,%f44,%f8,%f8
.word 0x95bfd44f !fmaddd %f62,%f46,%f10,%f10
.word 0x89bfc849 !fmaddd %f62,%f40,%f4,%f4
.word 0x8dbfcc4b !fmaddd %f62,%f42,%f6,%f6
.word 0x99bfd85c !fmaddd %f62,%f28,%f12,%f12
.word 0x9dbfdc5e !fmaddd %f62,%f30,%f14,%f14
.word 0x81bf4049 !fmaddd %f60,%f40,%f0,%f0
.word 0x85bf444b !fmaddd %f60,%f42,%f2,%f2
.word 0x91bf505c !fmaddd %f60,%f28,%f8,%f8
.word 0x95bf545e !fmaddd %f60,%f30,%f10,%f10
.word 0x89bf484d !fmaddd %f60,%f44,%f4,%f4
ldd [%sp+LOCALS+8*0],%f52 ! load [biased] input
.word 0x8dbf4c4f !fmaddd %f60,%f46,%f6,%f6
ldd [%sp+LOCALS+8*1],%f54
.word 0x99bf5841 !fmaddd %f60,%f32,%f12,%f12
ldd [%sp+LOCALS+8*2],%f62
.word 0x9dbf5c43 !fmaddd %f60,%f34,%f14,%f14
ldd [%sp+LOCALS+8*3],%f60
.word 0x81be405c !fmaddd %f56,%f28,%f0,%f0
fsubd %f52,%f16, %f52 ! de-bias input
.word 0x85be445e !fmaddd %f56,%f30,%f2,%f2
fsubd %f54,%f18,%f54
.word 0x91be5045 !fmaddd %f56,%f36,%f8,%f8
fsubd %f62,%f20,%f62
.word 0x95be5447 !fmaddd %f56,%f38,%f10,%f10
fsubd %f60,%f22,%f60
.word 0x89be4841 !fmaddd %f56,%f32,%f4,%f4
.word 0x8dbe4c43 !fmaddd %f56,%f34,%f6,%f6
.word 0x99be5851 !fmaddd %f56,%f48,%f12,%f12
.word 0x9dbe5c53 !fmaddd %f56,%f50,%f14,%f14
bcc SIZE_T_CC,.Loop_fma
subcc %i2,1,%i2
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
faddd %f0,%f18,%f48
faddd %f2,%f18,%f50
faddd %f8,%f22,%f56
faddd %f10,%f22,%f58
faddd %f4,%f20,%f52
faddd %f6,%f20,%f54
faddd %f12,%f24,%f60
faddd %f14,%f24,%f62
fsubd %f48,%f18,%f48
fsubd %f50,%f18,%f50
fsubd %f56,%f22,%f56
fsubd %f58,%f22,%f58
fsubd %f52,%f20,%f52
fsubd %f54,%f20,%f54
fsubd %f60,%f24,%f60
fsubd %f62,%f24,%f62
fsubd %f4,%f52,%f4
fsubd %f6,%f54,%f6
fsubd %f12,%f60,%f12
fsubd %f14,%f62,%f14
fsubd %f8,%f56,%f8
fsubd %f10,%f58,%f10
fsubd %f0,%f48,%f0
fsubd %f2,%f50,%f2
faddd %f4,%f48,%f4
faddd %f6,%f50,%f6
faddd %f12,%f56,%f12
faddd %f14,%f58,%f14
faddd %f8,%f52,%f8
faddd %f10,%f54,%f10
.word 0x81be805d !fmaddd %f26,%f60,%f0,%f0
.word 0x85be845f !fmaddd %f26,%f62,%f2,%f2
faddd %f4,%f6,%f58
faddd %f12,%f14,%f62
faddd %f8,%f10,%f60
faddd %f0,%f2,%f56
faddd %f58,%f18,%f58 ! bias
faddd %f62,%f22,%f62
faddd %f60,%f20,%f60
faddd %f56,%f16, %f56
ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
std %f58,[%i0+8*1] ! store [biased] hash value
std %f62,[%i0+8*3]
std %f60,[%i0+8*2]
std %f56,[%i0+8*0]
.Labort:
ret
restore
.type poly1305_blocks_fma,#function
.size poly1305_blocks_fma,.-poly1305_blocks_fma
.align 32
poly1305_emit_fma:
save %sp,-STACK_FRAME,%sp
ld [%i0+8*0+0],%l5 ! load hash
ld [%i0+8*0+4],%l0
ld [%i0+8*1+0],%o0
ld [%i0+8*1+4],%l1
ld [%i0+8*2+0],%o1
ld [%i0+8*2+4],%l2
ld [%i0+8*3+0],%o2
ld [%i0+8*3+4],%l3
sethi %hi(0xfff00000),%o3
andn %l5,%o3,%l5 ! mask exponent
andn %o0,%o3,%o0
andn %o1,%o3,%o1
andn %o2,%o3,%o2 ! can be partially reduced...
mov 3,%o3
srl %o2,2,%i3 ! ... so reduce
and %o2,%o3,%l4
andn %o2,%o3,%o2
add %i3,%o2,%o2
addcc %o2,%l0,%l0
addccc %l5,%l1,%l1
addccc %o0,%l2,%l2
addccc %o1,%l3,%l3
addc %g0,%l4,%l4
addcc %l0,5,%l5 ! compare to modulus
addccc %l1,0,%o0
addccc %l2,0,%o1
addccc %l3,0,%o2
addc %l4,0,%o3
srl %o3,2,%o3 ! did it carry/borrow?
neg %o3,%o3
sra %o3,31,%o3 ! mask
andn %l0,%o3,%l0
and %l5,%o3,%l5
andn %l1,%o3,%l1
and %o0,%o3,%o0
or %l5,%l0,%l0
ld [%i2+0],%l5 ! load nonce
andn %l2,%o3,%l2
and %o1,%o3,%o1
or %o0,%l1,%l1
ld [%i2+4],%o0
andn %l3,%o3,%l3
and %o2,%o3,%o2
or %o1,%l2,%l2
ld [%i2+8],%o1
or %o2,%l3,%l3
ld [%i2+12],%o2
addcc %l5,%l0,%l0 ! accumulate nonce
addccc %o0,%l1,%l1
addccc %o1,%l2,%l2
addc %o2,%l3,%l3
stb %l0,[%i1+0] ! write little-endian result
srl %l0,8,%l0
stb %l1,[%i1+4]
srl %l1,8,%l1
stb %l2,[%i1+8]
srl %l2,8,%l2
stb %l3,[%i1+12]
srl %l3,8,%l3
stb %l0,[%i1+1]
srl %l0,8,%l0
stb %l1,[%i1+5]
srl %l1,8,%l1
stb %l2,[%i1+9]
srl %l2,8,%l2
stb %l3,[%i1+13]
srl %l3,8,%l3
stb %l0,[%i1+2]
srl %l0,8,%l0
stb %l1,[%i1+6]
srl %l1,8,%l1
stb %l2,[%i1+10]
srl %l2,8,%l2
stb %l3,[%i1+14]
srl %l3,8,%l3
stb %l0,[%i1+3]
stb %l1,[%i1+7]
stb %l2,[%i1+11]
stb %l3,[%i1+15]
ret
restore
.type poly1305_emit_fma,#function
.size poly1305_emit_fma,.-poly1305_emit_fma
.align 64
.Lconsts_fma:
.word 0x43300000,0x00000000 ! 2^(52+0)
.word 0x45300000,0x00000000 ! 2^(52+32)
.word 0x47300000,0x00000000 ! 2^(52+64)
.word 0x49300000,0x00000000 ! 2^(52+96)
.word 0x4b500000,0x00000000 ! 2^(52+130)
.word 0x37f40000,0x00000000 ! 5/2^130
.word 0,1<<30 ! fsr: truncate, no exceptions
.word 0x44300000,0x00000000 ! 2^(52+16+0)
.word 0x46300000,0x00000000 ! 2^(52+16+32)
.word 0x48300000,0x00000000 ! 2^(52+16+64)
.word 0x4a300000,0x00000000 ! 2^(52+16+96)
.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro@openssl.org>"
.align 4
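
The .Lconsts_fma table above drives the floating-point code path: each 2^(52+k) entry is an IEEE-754 double whose exponent pins a 32-bit limb in the low mantissa bits, so limbs can be carried exactly through fmaddd and recovered by subtracting the bias again (the faddd/fsubd pairs in poly1305_blocks_fma). A minimal C illustration of that bias/de-bias step, not part of the commit:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    double bias = 0x1p52;         /* == the 0x43300000_00000000 entry */
    uint32_t limb = 0xdeadbeefU;  /* one 32-bit limb of the hash */
    double biased = (double)limb + bias;   /* exact, since limb < 2^52 */
    double debiased = biased - bias;       /* the de-bias fsubd step */
    printf("0x%x\n", (unsigned)debiased);  /* prints 0xdeadbeef */
    return 0;
}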

View File

@ -0,0 +1,6 @@
.PATH.S: ${.PARSEDIR}
POLY1305_SRCS = poly1305-sparcv9.S
POLY1305_CPPFLAGS+=-DPOLY1305_ASM
.include "../../poly1305.inc"

View File

@ -1,3 +1,5 @@
#include "sparc_arch.h"
.section ".text",#alloc,#execinstr
.global bn_mul_mont_int
@ -10,7 +12,7 @@ bn_mul_mont_int:
clr %o0
.align 32
.Lenter:
save %sp,-128,%sp
save %sp,-STACK_FRAME,%sp
sll %i5,2,%i5 ! num*=4
or %g1,%lo(0xffffffff),%g1
ld [%i4],%i4
@ -19,21 +21,21 @@ bn_mul_mont_int:
ld [%i2],%l2 ! bp[0]
nop
add %sp,0,%o7 ! real top of stack
add %sp,STACK_BIAS,%o7 ! real top of stack
ld [%i1],%o0 ! ap[0] ! redundant in squaring context
sub %o7,%i5,%o7
ld [%i1+4],%l5 ! ap[1]
and %o7,-1024,%o7
ld [%i3],%o1 ! np[0]
sub %o7,0,%sp ! alloca
sub %o7,STACK_BIAS,%sp ! alloca
ld [%i3+4],%l6 ! np[1]
be,pt %icc,.Lbn_sqr_mont
be,pt SIZE_T_CC,.Lbn_sqr_mont
mov 12,%l1
mulx %o0,%l2,%o0 ! ap[0]*bp[0]
mulx %l5,%l2,%g4 !prologue! ap[1]*bp[0]
and %o0,%g1,%o3
add %sp,0+128,%l4
add %sp,STACK_BIAS+STACK_FRAME,%l4
ld [%i1+8],%l5 !prologue!
mulx %i4,%o3,%l3 ! "t[0]"*n0
@ -92,7 +94,7 @@ bn_mul_mont_int:
mov 4,%l0 ! i++
ld [%i2+4],%l2 ! bp[1]
.Louter:
add %sp,0+128,%l4
add %sp,STACK_BIAS+STACK_FRAME,%l4
ld [%i1],%o0 ! ap[0]
ld [%i1+4],%l5 ! ap[1]
ld [%i3],%o1 ! np[0]
@ -211,7 +213,7 @@ bn_mul_mont_int:
mulx %l2,%l2,%o0 ! ap[0]*ap[0]
mulx %l5,%l2,%g4 !prologue!
and %o0,%g1,%o3
add %sp,0+128,%l4
add %sp,STACK_BIAS+STACK_FRAME,%l4
ld [%i1+8],%l5 !prologue!
mulx %i4,%o3,%l3 ! "t[0]"*n0
@ -220,7 +222,7 @@ bn_mul_mont_int:
mulx %o1,%l3,%o1 ! np[0]*"t[0]"*n0
mulx %l6,%l3,%o4 !prologue!
and %o0,1,%i2
and %o0,1,%o5
ld [%i3+8],%l6 !prologue!
srlx %o0,1,%o0
add %o3,%o1,%o1
@ -237,9 +239,9 @@ bn_mul_mont_int:
ld [%i3+%l1],%l6 ! np[j]
srlx %o0,32,%o0
add %o3,%o3,%o3
or %i2,%o3,%o3
or %o5,%o3,%o3
mov %g5,%o4
srlx %o3,32,%i2
srlx %o3,32,%o5
add %l1,4,%l1 ! j++
and %o3,%g1,%o3
cmp %l1,%i5
@ -258,8 +260,8 @@ bn_mul_mont_int:
and %o0,%g1,%o3
srlx %o0,32,%o0
add %o3,%o3,%o3
or %i2,%o3,%o3
srlx %o3,32,%i2
or %o5,%o3,%o3
srlx %o3,32,%o5
and %o3,%g1,%o3
add %o3,%o1,%o1
st %o1,[%l4]
@ -270,22 +272,22 @@ bn_mul_mont_int:
and %o0,%g1,%o3
srlx %o0,32,%o0
add %o3,%o3,%o3
or %i2,%o3,%o3
srlx %o3,32,%i2
or %o5,%o3,%o3
srlx %o3,32,%o5
and %o3,%g1,%o3
add %o3,%o1,%o1
st %o1,[%l4+4]
srlx %o1,32,%o1
add %o0,%o0,%o0
or %i2,%o0,%o0
or %o5,%o0,%o0
add %o0,%o1,%o1
st %o1,[%l4+8]
srlx %o1,32,%o2
ld [%sp+0+128],%g4 ! tp[0]
ld [%sp+0+128+4],%g5 ! tp[1]
ld [%sp+0+128+8],%l7 ! tp[2]
ld [%sp+STACK_BIAS+STACK_FRAME],%g4 ! tp[0]
ld [%sp+STACK_BIAS+STACK_FRAME+4],%g5 ! tp[1]
ld [%sp+STACK_BIAS+STACK_FRAME+8],%l7 ! tp[2]
ld [%i1+4],%l2 ! ap[1]
ld [%i1+8],%l5 ! ap[2]
ld [%i3],%o1 ! np[0]
@ -304,19 +306,19 @@ bn_mul_mont_int:
add %g5,%o1,%o1
srlx %o0,32,%o0
add %o3,%o1,%o1
and %o0,1,%i2
and %o0,1,%o5
add %o4,%o1,%o1
srlx %o0,1,%o0
mov 12,%l1
st %o1,[%sp+0+128] ! tp[0]=
st %o1,[%sp+STACK_BIAS+STACK_FRAME] ! tp[0]=
srlx %o1,32,%o1
add %sp,0+128+4,%l4
add %sp,STACK_BIAS+STACK_FRAME+4,%l4
.Lsqr_2nd:
mulx %l5,%l2,%o3
mulx %l6,%l3,%o4
add %o3,%o0,%o0
add %l7,%o1,%o1
add %l7,%o5,%o5
ld [%i1+%l1],%l5 ! ap[j]
and %o0,%g1,%o3
ld [%i3+%l1],%l6 ! np[j]
@ -325,8 +327,8 @@ bn_mul_mont_int:
ld [%l4+8],%l7 ! tp[j]
add %o3,%o3,%o3
add %l1,4,%l1 ! j++
or %i2,%o3,%o3
srlx %o3,32,%i2
add %o5,%o3,%o3
srlx %o3,32,%o5
and %o3,%g1,%o3
cmp %l1,%i5
add %o3,%o1,%o1
@ -339,27 +341,27 @@ bn_mul_mont_int:
mulx %l5,%l2,%o3
mulx %l6,%l3,%o4
add %o3,%o0,%o0
add %l7,%o1,%o1
add %l7,%o5,%o5
and %o0,%g1,%o3
srlx %o0,32,%o0
add %o4,%o1,%o1
add %o3,%o3,%o3
or %i2,%o3,%o3
srlx %o3,32,%i2
add %o5,%o3,%o3
srlx %o3,32,%o5
and %o3,%g1,%o3
add %o3,%o1,%o1
st %o1,[%l4] ! tp[j-1]
srlx %o1,32,%o1
add %o0,%o0,%o0
or %i2,%o0,%o0
add %o5,%o0,%o0
add %o0,%o1,%o1
add %o2,%o1,%o1
st %o1,[%l4+4]
srlx %o1,32,%o2
ld [%sp+0+128],%g5 ! tp[0]
ld [%sp+0+128+4],%l7 ! tp[1]
ld [%sp+STACK_BIAS+STACK_FRAME],%g5 ! tp[0]
ld [%sp+STACK_BIAS+STACK_FRAME+4],%l7 ! tp[1]
ld [%i1+8],%l2 ! ap[2]
ld [%i3],%o1 ! np[0]
ld [%i3+4],%l6 ! np[1]
@@ -372,9 +374,9 @@ bn_mul_mont_int:
and %o0,%g1,%o3
add %g5,%o1,%o1
srlx %o0,32,%o0
add %sp,0+128,%l4
add %sp,STACK_BIAS+STACK_FRAME,%l4
srlx %o1,32,%o1
and %o0,1,%i2
and %o0,1,%o5
srlx %o0,1,%o0
mov 4,%l1
@@ -412,7 +414,7 @@ bn_mul_mont_int:
.Lsqr_inner2:
mulx %l5,%l2,%o3
mulx %l6,%l3,%o4
add %l7,%o1,%o1
add %l7,%o5,%o5
add %o3,%o0,%o0
ld [%i1+%l1],%l5 ! ap[j]
and %o0,%g1,%o3
@@ -420,9 +422,9 @@ bn_mul_mont_int:
srlx %o0,32,%o0
add %o3,%o3,%o3
ld [%l4+8],%l7 ! tp[j]
or %i2,%o3,%o3
add %o5,%o3,%o3
add %l1,4,%l1 ! j++
srlx %o3,32,%i2
srlx %o3,32,%o5
and %o3,%g1,%o3
cmp %l1,%i5
add %o3,%o1,%o1
@@ -435,13 +437,13 @@ bn_mul_mont_int:
.Lsqr_no_inner2:
mulx %l5,%l2,%o3
mulx %l6,%l3,%o4
add %l7,%o1,%o1
add %l7,%o5,%o5
add %o3,%o0,%o0
and %o0,%g1,%o3
srlx %o0,32,%o0
add %o3,%o3,%o3
or %i2,%o3,%o3
srlx %o3,32,%i2
add %o5,%o3,%o3
srlx %o3,32,%o5
and %o3,%g1,%o3
add %o3,%o1,%o1
add %o4,%o1,%o1
@@ -449,15 +451,15 @@ bn_mul_mont_int:
srlx %o1,32,%o1
add %o0,%o0,%o0
or %i2,%o0,%o0
add %o5,%o0,%o0
add %o0,%o1,%o1
add %o2,%o1,%o1
st %o1,[%l4+4]
srlx %o1,32,%o2
add %l0,4,%l0 ! i++
ld [%sp+0+128],%g5 ! tp[0]
ld [%sp+0+128+4],%l7 ! tp[1]
ld [%sp+STACK_BIAS+STACK_FRAME],%g5 ! tp[0]
ld [%sp+STACK_BIAS+STACK_FRAME+4],%l7 ! tp[1]
ld [%i1+%l0],%l2 ! ap[j]
ld [%i3],%o1 ! np[0]
ld [%i3+4],%l6 ! np[1]
@@ -470,9 +472,9 @@ bn_mul_mont_int:
and %o0,%g1,%o3
add %g5,%o1,%o1
srlx %o0,32,%o0
add %sp,0+128,%l4
add %sp,STACK_BIAS+STACK_FRAME,%l4
srlx %o1,32,%o1
and %o0,1,%i2
and %o0,1,%o5
srlx %o0,1,%o0
cmp %g4,%i5 ! i<num-1
@@ -494,14 +496,17 @@ bn_mul_mont_int:
!.Lsqr_last
mulx %l6,%l3,%o4
add %l7,%o1,%o1
add %l7,%o3,%o3
srlx %o3,32,%g4
and %o3,%g1,%o3
add %g4,%o5,%o5
add %o3,%o1,%o1
add %o4,%o1,%o1
st %o1,[%l4]
srlx %o1,32,%o1
add %o0,%o0,%o0 ! recover %o0
or %i2,%o0,%o0
add %o5,%o0,%o0
add %o0,%o1,%o1
add %o2,%o1,%o1
st %o1,[%l4+4]
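
Every hunk in this file applies the same substitutions: the hard-coded v8 frame arithmetic (save %sp,-128,%sp and the 0+128 stack offsets) becomes STACK_FRAME/STACK_BIAS, size_t comparisons move from %icc to SIZE_T_CC, and the squaring path switches its carry word from %i2 to %o5. The result is one generated file that assembles for both the 32-bit and 64-bit ABIs. A minimal sketch of what sparc_arch.h supplies, assuming the stock upstream OpenSSL definitions (values quoted from memory; check the header actually imported here):

    /* sparc_arch.h, abridged: per-ABI stack and condition-code macros.
     * Illustrative only; confirm against the imported header. */
    #if defined(__arch64__)
    # define STACK_FRAME 192   /* minimum save area, 64-bit ABI */
    # define STACK_BIAS  2047  /* %sp/%fp bias required by the V9 ABI */
    # define SIZE_T_CC   %xcc  /* size_t compares use 64-bit codes */
    #else
    # define STACK_FRAME 112   /* minimum save area, 32-bit ABI */
    # define STACK_BIAS  0     /* no stack bias on 32-bit */
    # define SIZE_T_CC   %icc  /* size_t compares use 32-bit codes */
    #endif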

View File

@@ -1,9 +1,11 @@
#include "sparc_arch.h"
.section ".text",#alloc,#execinstr
.global bn_mul_mont_fpu
.align 32
bn_mul_mont_fpu:
save %sp,-128-64,%sp
save %sp,-STACK_FRAME-64,%sp
cmp %i5,4
bl,a,pn %icc,.Lret
@@ -22,15 +24,15 @@ bn_mul_mont_fpu:
sll %i5,3,%i5 ! num*=8
add %sp,0,%o0 ! real top of stack
add %sp,STACK_BIAS,%o0 ! real top of stack
sll %i5,2,%o1
add %o1,%i5,%o1 ! %o1=num*5
sub %o0,%o1,%o0
and %o0,-2048,%o0 ! optimize TLB utilization
sub %o0,0,%sp ! alloca(5*num*8)
sub %o0,STACK_BIAS,%sp ! alloca(5*num*8)
rd %asi,%o7 ! save %asi
add %sp,0+128+64,%l0
add %sp,STACK_BIAS+STACK_FRAME+64,%l0
add %l0,%i5,%l1
add %l1,%i5,%l1 ! [an]p_[lh] point at the vectors' ends !
add %l1,%i5,%l2
@@ -44,7 +46,7 @@ bn_mul_mont_fpu:
add %i2,%i5,%i2
add %i3,%i5,%i3
stx %o7,[%sp+0+128+48] ! save %asi
stx %o7,[%sp+STACK_BIAS+STACK_FRAME+48] ! save %asi
sub %g0,%i5,%l5 ! i=-num
sub %g0,%i5,%l6 ! j=-num
@@ -65,7 +67,7 @@ bn_mul_mont_fpu:
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
mulx %g4,%o0,%o0 ! ap[0]*bp[0]*n0
stx %o0,[%sp+0+128+0]
stx %o0,[%sp+STACK_BIAS+STACK_FRAME+0]
ld [%o3+0],%f17 ! load a[j] as pair of 32-bit words
.word 0xa1b00c20 ! fzeros %f16
@@ -87,13 +89,13 @@ bn_mul_mont_fpu:
fxtod %f22,%f22
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
ldda [%sp+0+128+6]%asi,%f8
ldda [%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
fxtod %f0,%f0
ldda [%sp+0+128+4]%asi,%f10
ldda [%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
fxtod %f2,%f2
ldda [%sp+0+128+2]%asi,%f12
ldda [%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
fxtod %f4,%f4
ldda [%sp+0+128+0]%asi,%f14
ldda [%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
fxtod %f6,%f6
std %f16,[%l1+%l6] ! save smashed ap[j] in double format
@@ -139,13 +141,13 @@ bn_mul_mont_fpu:
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+0+128+0]
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
add %l6,8,%l6
std %f50,[%sp+0+128+8]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
add %i1,%l6,%o4
std %f52,[%sp+0+128+16]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
add %i3,%l6,%o5
std %f54,[%sp+0+128+24]
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
ld [%o4+0],%f17 ! load a[j] as pair of 32-bit words
.word 0xa1b00c20 ! fzeros %f16
@@ -161,13 +163,13 @@ bn_mul_mont_fpu:
fxtod %f20,%f20
fxtod %f22,%f22
ldx [%sp+0+128+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
fmuld %f16,%f0,%f32
ldx [%sp+0+128+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f20,%f8,%f48
ldx [%sp+0+128+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
fmuld %f16,%f2,%f34
ldx [%sp+0+128+24],%o3
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f20,%f10,%f50
srlx %o0,16,%o7
@@ -223,12 +225,12 @@ bn_mul_mont_fpu:
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+0+128+0]
std %f50,[%sp+0+128+8]
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
addcc %l6,8,%l6
std %f52,[%sp+0+128+16]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
bz,pn %icc,.L1stskip
std %f54,[%sp+0+128+24]
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
.align 32 ! incidentally already aligned !
.L1st:
@@ -248,13 +250,13 @@ bn_mul_mont_fpu:
fxtod %f20,%f20
fxtod %f22,%f22
ldx [%sp+0+128+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
fmuld %f16,%f0,%f32
ldx [%sp+0+128+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f20,%f8,%f48
ldx [%sp+0+128+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
fmuld %f16,%f2,%f34
ldx [%sp+0+128+24],%o3
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f20,%f10,%f50
srlx %o0,16,%o7
@@ -314,10 +316,10 @@ bn_mul_mont_fpu:
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+0+128+0]
std %f50,[%sp+0+128+8]
std %f52,[%sp+0+128+16]
std %f54,[%sp+0+128+24]
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
addcc %l6,8,%l6
bnz,pt %icc,.L1st
@@ -327,15 +329,15 @@ bn_mul_mont_fpu:
fdtox %f24,%f24
fdtox %f26,%f26
ldx [%sp+0+128+0],%o0
ldx [%sp+0+128+8],%o1
ldx [%sp+0+128+16],%o2
ldx [%sp+0+128+24],%o3
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
srlx %o0,16,%o7
std %f24,[%sp+0+128+32]
std %f24,[%sp+STACK_BIAS+STACK_FRAME+32]
add %o7,%o1,%o1
std %f26,[%sp+0+128+40]
std %f26,[%sp+STACK_BIAS+STACK_FRAME+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
@@ -349,9 +351,9 @@ bn_mul_mont_fpu:
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+0+128+32],%o4
ldx [%sp+STACK_BIAS+STACK_FRAME+32],%o4
addcc %g1,%o0,%o0
ldx [%sp+0+128+40],%o5
ldx [%sp+STACK_BIAS+STACK_FRAME+40],%o5
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
@@ -377,7 +379,7 @@ bn_mul_mont_fpu:
.align 32
.Louter:
sub %g0,%i5,%l6 ! j=-num
add %sp,0+128+64,%l0
add %sp,STACK_BIAS+STACK_FRAME+64,%l0
add %i1,%l6,%o3
add %i2,%l5,%o4
@@ -395,7 +397,7 @@ bn_mul_mont_fpu:
mulx %o1,%o0,%o0
addcc %o2,%o0,%o0
mulx %g4,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
stx %o0,[%sp+0+128+0]
stx %o0,[%sp+STACK_BIAS+STACK_FRAME+0]
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,%f0
@@ -404,13 +406,13 @@ bn_mul_mont_fpu:
ldda [%o4+4]%asi,%f6
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
ldda [%sp+0+128+6]%asi,%f8
ldda [%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
fxtod %f0,%f0
ldda [%sp+0+128+4]%asi,%f10
ldda [%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
fxtod %f2,%f2
ldda [%sp+0+128+2]%asi,%f12
ldda [%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
fxtod %f4,%f4
ldda [%sp+0+128+0]%asi,%f14
ldda [%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
fxtod %f6,%f6
ldd [%l1+%l6],%f16 ! load a[j] in double format
fxtod %f8,%f8
@@ -455,11 +457,11 @@ bn_mul_mont_fpu:
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+0+128+0]
std %f50,[%sp+0+128+8]
std %f52,[%sp+0+128+16]
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
add %l6,8,%l6
std %f54,[%sp+0+128+24]
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
ldd [%l1+%l6],%f16 ! load a[j] in double format
ldd [%l2+%l6],%f18
@@ -471,15 +473,15 @@ bn_mul_mont_fpu:
fmuld %f16,%f2,%f34
fmuld %f20,%f10,%f50
fmuld %f16,%f4,%f36
ldx [%sp+0+128+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
ldx [%sp+0+128+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f16,%f6,%f38
ldx [%sp+0+128+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
ldx [%sp+0+128+24],%o3
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f18,%f0,%f40
srlx %o0,16,%o7
@@ -527,12 +529,12 @@ bn_mul_mont_fpu:
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+0+128+0]
std %f50,[%sp+0+128+8]
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
addcc %l6,8,%l6
std %f52,[%sp+0+128+16]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
bz,pn %icc,.Linnerskip
std %f54,[%sp+0+128+24]
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
ba .Linner
nop
@@ -548,15 +550,15 @@ bn_mul_mont_fpu:
fmuld %f16,%f2,%f34
fmuld %f20,%f10,%f50
fmuld %f16,%f4,%f36
ldx [%sp+0+128+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
ldx [%sp+0+128+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f16,%f6,%f38
ldx [%sp+0+128+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
ldx [%sp+0+128+24],%o3
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f18,%f0,%f40
srlx %o0,16,%o7
@@ -606,11 +608,11 @@ bn_mul_mont_fpu:
stx %o0,[%l0] ! tp[j-1]
fdtox %f54,%f54
std %f48,[%sp+0+128+0]
std %f50,[%sp+0+128+8]
std %f52,[%sp+0+128+16]
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
addcc %l6,8,%l6
std %f54,[%sp+0+128+24]
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
bnz,pt %icc,.Linner
add %l0,8,%l0
@@ -618,15 +620,15 @@ bn_mul_mont_fpu:
fdtox %f24,%f24
fdtox %f26,%f26
ldx [%sp+0+128+0],%o0
ldx [%sp+0+128+8],%o1
ldx [%sp+0+128+16],%o2
ldx [%sp+0+128+24],%o3
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
srlx %o0,16,%o7
std %f24,[%sp+0+128+32]
std %f24,[%sp+STACK_BIAS+STACK_FRAME+32]
add %o7,%o1,%o1
std %f26,[%sp+0+128+40]
std %f26,[%sp+STACK_BIAS+STACK_FRAME+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
@@ -639,9 +641,9 @@ bn_mul_mont_fpu:
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
ldx [%sp+0+128+32],%o4
ldx [%sp+STACK_BIAS+STACK_FRAME+32],%o4
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+0+128+40],%o5
ldx [%sp+STACK_BIAS+STACK_FRAME+40],%o5
addcc %g1,%o0,%o0
ldx [%l0+8],%o7 ! tp[j]
srlx %o3,16,%g1 ! 34-bit carry
@@ -728,7 +730,7 @@ bn_mul_mont_fpu:
brnz,pt %o7,.Lzap
nop
ldx [%sp+0+128+48],%o7
ldx [%sp+STACK_BIAS+STACK_FRAME+48],%o7
wr %g0,%o7,%asi ! restore %asi
mov 1,%i0
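
bn_mul_mont_fpu above smashes each 32-bit limb into 16-bit digits (the ldda ...%asi loads plus fxtod conversions, and the "save smashed ap[j] in double format" stores), so that every partial product fits exactly in a double's 53-bit mantissa; the integer code then reassembles the result through the srlx/sllx carry chains. A standalone illustration of why 16-bit digits are safe (hypothetical demo, not code from this tree):

    #include <stdio.h>

    /* Worst case: accumulating many 16x16-bit products stays far below
     * 2^53, so double-precision arithmetic on the digits is exact. */
    int main(void)
    {
        double acc = 0.0;
        int i;
        for (i = 0; i < 1024; i++)
            acc += 65535.0 * 65535.0;   /* each term < 2^32 */
        printf("%.0f\n", acc);          /* exactly 1024 * 4294836225 */
        return 0;
    }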

View File

@@ -1,8 +1,15 @@
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.globl bn_mul_mont_vis3
.align 32
bn_mul_mont_vis3:
add %sp, 0, %g4 ! real top of stack
add %sp, STACK_BIAS, %g4 ! real top of stack
sll %o5, 2, %o5 ! size in bytes
add %o5, 63, %g5
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
@@ -10,12 +17,12 @@ bn_mul_mont_vis3:
add %g5, %g1, %g1 ! 3*buffer size
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, 112, %g1 ! new top of stack
sub %g1, STACK_FRAME, %g1 ! new top of stack
sub %g1, %g4, %g1
save %sp, %g1, %sp
ld [%i4+0], %l0 ! pull n0[0..1] value
add %sp, 0+112, %l5
add %sp, STACK_BIAS+STACK_FRAME, %l5
ld [%i4+4], %l1
add %l5, %g5, %l7
ld [%i2+0], %l2 ! m0=bp[0]
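
The vis3 prologue just above performs the same alloca dance as the other variants, carving out three 64-byte-aligned scratch vectors below the register window before the save. Mirroring the pointer arithmetic in C may make it easier to audit (hypothetical helper, not part of the source):

    #include <stdint.h>

    /* Offset handed to 'save %sp, off, %sp' by the bn_mul_mont_vis3
     * prologue: room for three num-word buffers, rounded and aligned
     * to 64 bytes, plus the minimum STACK_FRAME. */
    static intptr_t vis3_save_offset(uintptr_t sp, unsigned num,
                                     unsigned stack_bias, unsigned stack_frame)
    {
        uintptr_t top = sp + stack_bias;                  /* real top of stack */
        uintptr_t buf = ((uintptr_t)num * 4 + 63) & ~(uintptr_t)63;
        uintptr_t g1  = (top - 3 * buf) & ~(uintptr_t)63; /* align at 64 bytes */
        g1 -= stack_frame;                                /* new top of stack */
        return (intptr_t)(g1 - top);                      /* negative offset */
    }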