From baa43bca0a051e8deb0d6a9a8882ceeea5c27249 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Tue, 20 Mar 2012 00:51:32 -0400 Subject: optimize scalbn family the fscale instruction is slow everywhere, probably because it involves a costly and unnecessary integer truncation operation that ends up being a no-op in common usages. instead, construct a floating point scale value with integer arithmetic and simply multiply by it, when possible. for float and double, this is always possible by going to the next-larger type. we use some cheap but effective saturating arithmetic tricks to make sure even very large-magnitude exponents fit. for long double, if the scaling exponent is too large to fit in the exponent of a long double value, we simply fall back to the expensive fscale method. on atom cpu, these changes speed up scalbn by over 30%. (min rdtsc timing dropped from 110 cycles to 70 cycles.) --- src/math/i386/scalbnl.s | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'src/math/i386/scalbnl.s') diff --git a/src/math/i386/scalbnl.s b/src/math/i386/scalbnl.s index 224b1bef..54414c2e 100644 --- a/src/math/i386/scalbnl.s +++ b/src/math/i386/scalbnl.s @@ -11,7 +11,21 @@ scalblnl: .global scalbnl .type scalbnl,@function scalbnl: - fildl 16(%esp) + mov 16(%esp),%eax + add $0x3ffe,%eax + cmp $0x7ffd,%eax + jae 1f + inc %eax + fldt 4(%esp) + mov %eax,12(%esp) + mov $0x80000000,%eax + mov %eax,8(%esp) + xor %eax,%eax + mov %eax,4(%esp) + fldt 4(%esp) + fmulp + ret +1: fildl 16(%esp) fldt 4(%esp) fscale fstp %st(1) -- cgit v1.2.1