mirror of
https://git.musl-libc.org/git/musl
synced 2025-02-23 13:44:11 +03:00
optimize scalbn family
the fscale instruction is slow everywhere, probably because it involves a costly and unnecessary integer truncation operation that ends up being a no-op in common usages. instead, construct a floating point scale value with integer arithmetic and simply multiply by it, when possible. for float and double, this is always possible by going to the next-larger type. we use some cheap but effective saturating arithmetic tricks to make sure even very large-magnitude exponents fit. for long double, if the scaling exponent is too large to fit in the exponent of a long double value, we simply fall back to the expensive fscale method. on an atom cpu, these changes speed up scalbn by over 30%. (min rdtsc timing dropped from 110 cycles to 70 cycles.)
This commit is contained in:
parent
7513d3ecab
commit
baa43bca0a
@ -11,10 +11,23 @@ scalbln:
|
||||
# double scalbn(double x, int n) -- computes x * 2^n without fscale.
#
# ABI:   i386 cdecl, GNU as (AT&T) syntax
# In:    4(%esp) = x (8 bytes), 12(%esp) = n
# Out:   %st(0) = result, rounded to double
# Clobb: %eax, flags, and the argument slots 4..15(%esp), which are
#        reused as scratch space to build the scale factor
#
# The scale 2^n is assembled directly as an 80-bit extended value
# (zero low mantissa dword, 0x80000000 high mantissa dword, biased
# exponent word) and applied with a single multiply.
.global scalbn
.type scalbn,@function
scalbn:
	mov 12(%esp),%eax		# eax = n
	add $0x3ffe,%eax
	cmp $0x7ffd,%eax		# unsigned: taken iff -0x3ffe <= n <= 0x3ffe
	jb 1f
	# n is outside the extended exponent range: saturate it.
	sub $0x3ffe,%eax		# eax = n again
	sar $31,%eax			# 0 if n >= 0, -1 if n < 0
	xor $0xfff,%eax			# 0xfff or -0x1000
	add $0x3ffe,%eax		# after inc below: scale = 2^4095 or 2^-4096
1:	inc %eax			# eax = biased extended exponent (n + 0x3fff)
	fldl 4(%esp)			# load x before its stack slot is overwritten
	mov %eax,12(%esp)		# exponent word (fldt reads only the low 16 bits)
	mov $0x80000000,%eax
	mov %eax,8(%esp)		# high mantissa dword: explicit integer bit only
	xor %eax,%eax
	mov %eax,4(%esp)		# low mantissa dword = 0
	fldt 4(%esp)			# st(0) = 2^k, st(1) = x
	fmulp				# st(0) = x * 2^k
	fstpl 4(%esp)			# store and reload to round the result
	fldl 4(%esp)			# to double precision
	ret
|
||||
|
@ -11,10 +11,22 @@ scalblnf:
|
||||
# float scalbnf(float x, int n) -- computes x * 2^n without fscale.
#
# ABI:   i386 cdecl, GNU as (AT&T) syntax
# In:    4(%esp) = x (4 bytes), 8(%esp) = n
# Out:   %st(0) = result, rounded to float
# Clobb: %eax, flags, and the argument slots 4..11(%esp), which are
#        reused as scratch space to build the scale factor
#
# The scale 2^n is assembled directly as a double (zero mantissa,
# biased exponent shifted into bits 20..30 of the high dword) and
# applied with a single multiply.
.global scalbnf
.type scalbnf,@function
scalbnf:
	mov 8(%esp),%eax		# eax = n
	add $0x3fe,%eax
	cmp $0x7fd,%eax			# unsigned: taken iff -0x3fe <= n <= 0x3fe
	jb 1f
	# n is outside the double exponent range: saturate it.
	sub $0x3fe,%eax			# eax = n again
	sar $31,%eax			# 0 if n >= 0, -1 if n < 0
	xor $0x1ff,%eax			# 0x1ff or -0x200
	add $0x3fe,%eax			# after inc below: scale = 2^511 or 2^-512
1:	inc %eax			# eax = biased double exponent (n + 0x3ff)
	shl $20,%eax			# move exponent into place in the high dword
	flds 4(%esp)			# load x before its stack slot is overwritten
	mov %eax,8(%esp)		# high dword: sign 0, exponent, zero mantissa bits
	xor %eax,%eax
	mov %eax,4(%esp)		# low dword = 0
	fldl 4(%esp)			# st(0) = 2^k, st(1) = x
	fmulp				# st(0) = x * 2^k
	fstps 4(%esp)			# store and reload to round the result
	flds 4(%esp)			# to float precision
	ret
|
||||
|
@ -11,7 +11,21 @@ scalblnl:
|
||||
.global scalbnl
|
||||
.type scalbnl,@function
|
||||
scalbnl:
|
||||
fildl 16(%esp)
|
||||
mov 16(%esp),%eax
|
||||
add $0x3ffe,%eax
|
||||
cmp $0x7ffd,%eax
|
||||
jae 1f
|
||||
inc %eax
|
||||
fldt 4(%esp)
|
||||
mov %eax,12(%esp)
|
||||
mov $0x80000000,%eax
|
||||
mov %eax,8(%esp)
|
||||
xor %eax,%eax
|
||||
mov %eax,4(%esp)
|
||||
fldt 4(%esp)
|
||||
fmulp
|
||||
ret
|
||||
1: fildl 16(%esp)
|
||||
fldt 4(%esp)
|
||||
fscale
|
||||
fstp %st(1)
|
||||
|
Loading…
x
Reference in New Issue
Block a user