optimize scalbn family

the fscale instruction is slow everywhere, probably because it
involves a costly and unnecessary integer truncation operation that
ends up being a no-op in common usages. instead, construct a floating
point scale value with integer arithmetic and simply multiply by it,
when possible.

for float and double, this is always possible by going to the
next-larger type. we use some cheap but effective saturating
arithmetic tricks to make sure even very large-magnitude exponents
fit. for long double, if the scaling exponent is too large to fit in
the exponent of a long double value, we simply fall back to the
expensive fscale method.

on atom cpu, these changes speed up scalbn by over 30%. (min rdtsc
timing dropped from 110 cycles to 70 cycles.)
This commit is contained in:
Rich Felker 2012-03-20 00:51:32 -04:00
parent 7513d3ecab
commit baa43bca0a
3 changed files with 46 additions and 7 deletions

View File

@ -11,10 +11,23 @@ scalbln:
# double scalbn(double x, int n)
#
# ABI:    i386 cdecl, GAS AT&T syntax
# In:     x at 4(%esp) (8 bytes), n at 12(%esp)
# Out:    x * 2^n in %st(0), rounded to double precision
# Clobb:  %eax, flags, and the caller-pushed argument slots at 4..15(%esp)
#
# The scale factor 2^n is assembled in memory as an 80-bit extended value
# using integer arithmetic and applied with a single multiply; fscale is
# not used. Exponents outside the extended range are saturated so that the
# product still overflows or underflows once it is rounded to double.
.global scalbn
.type scalbn,@function
scalbn:
	mov 12(%esp),%eax
	add $0x3ffe,%eax	# eax = n + bias - 1
	cmp $0x7ffd,%eax	# unsigned test: true iff -0x3ffe <= n <= 0x3ffe
	jb 1f
	# n is out of range: replace it by +0xfff or -0x1000 according to sign
	sub $0x3ffe,%eax	# back to n (mod 2^32)
	sar $31,%eax		# eax = 0 if n >= 0, -1 if n < 0
	xor $0xfff,%eax		# eax = 0xfff or -0x1000
	add $0x3ffe,%eax
1:	inc %eax		# eax = biased exponent of the scale, sign bit clear
	fldl 4(%esp)		# load x before its slot is overwritten
	mov %eax,12(%esp)	# sign/exponent word of the extended value
	mov $0x80000000,%eax
	mov %eax,8(%esp)	# high significand dword: explicit integer bit only
	xor %eax,%eax
	mov %eax,4(%esp)	# low significand dword = 0, so significand is 1.0
	fldt 4(%esp)		# st0 = scale, st1 = x
	fmulp			# st0 = x * scale, one item left on the stack
	fstpl 4(%esp)		# store/reload to round the result to double
	fldl 4(%esp)
	ret

View File

@ -11,10 +11,22 @@ scalblnf:
# float scalbnf(float x, int n)
#
# ABI:    i386 cdecl, GAS AT&T syntax
# In:     x at 4(%esp) (4 bytes), n at 8(%esp)
# Out:    x * 2^n in %st(0), rounded to single precision
# Clobb:  %eax, flags, and the caller-pushed argument slots at 4..11(%esp)
#
# The scale factor 2^n is assembled in memory as an IEEE double using
# integer arithmetic and applied with a single multiply; fscale is not
# used. Exponents outside the double range are saturated so that the
# product still overflows or underflows once it is rounded to float.
.global scalbnf
.type scalbnf,@function
scalbnf:
	mov 8(%esp),%eax
	add $0x3fe,%eax		# eax = n + bias - 1
	cmp $0x7fd,%eax		# unsigned test: true iff -0x3fe <= n <= 0x3fe
	jb 1f
	# n is out of range: replace it by +0x1ff or -0x200 according to sign
	sub $0x3fe,%eax		# back to n (mod 2^32)
	sar $31,%eax		# eax = 0 if n >= 0, -1 if n < 0
	xor $0x1ff,%eax		# eax = 0x1ff or -0x200
	add $0x3fe,%eax
1:	inc %eax		# eax = biased exponent of the scale
	shl $20,%eax		# move it into the exponent field of the high dword
	flds 4(%esp)		# load x before its slot is overwritten
	mov %eax,8(%esp)	# high dword: sign 0, exponent, top mantissa bits 0
	xor %eax,%eax
	mov %eax,4(%esp)	# low dword = 0, so the mantissa is exactly 1.0
	fldl 4(%esp)		# st0 = scale, st1 = x
	fmulp			# st0 = x * scale, one item left on the stack
	fstps 4(%esp)		# store/reload to round the result to float
	flds 4(%esp)
	ret

View File

@ -11,7 +11,21 @@ scalblnl:
.global scalbnl
.type scalbnl,@function
scalbnl:
fildl 16(%esp)
mov 16(%esp),%eax
add $0x3ffe,%eax
cmp $0x7ffd,%eax
jae 1f
inc %eax
fldt 4(%esp)
mov %eax,12(%esp)
mov $0x80000000,%eax
mov %eax,8(%esp)
xor %eax,%eax
mov %eax,4(%esp)
fldt 4(%esp)
fmulp
ret
1: fildl 16(%esp)
fldt 4(%esp)
fscale
fstp %st(1)