optimize scalbn family

the fscale instruction is slow everywhere, probably because it involves a costly and unnecessary integer truncation operation that ends up being a no-op in common usages. instead, construct a floating point scale value with integer arithmetic and simply multiply by it, when possible. for float and double, this is always possible by going to the next-larger type. we use some cheap but effective saturating arithmetic tricks to make sure even very large-magnitude exponents fit. for long double, if the scaling exponent is too large to fit in the exponent of a long double value, we simply fall back to the expensive fscale method. on an atom cpu, these changes speed up scalbn by over 30%. (min rdtsc timing dropped from 110 cycles to 70 cycles.)
author: Rich Felker <dalias@aerifal.cx> 2012-03-20 00:51:32 -0400
committer: Rich Felker <dalias@aerifal.cx> 2012-03-20 00:51:32 -0400
commit: baa43bca0a051e8deb0d6a9a8882ceeea5c27249 (patch)
tree: f5fe7ae916d9039adfe82217716e2aafd08702fb /src/math/i386/scalbnf.s
parent: 7513d3ecabb998e2c8c4cb9ed5de48c4b64a166b (diff)
download: musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.tar.gz
1 file changed, 15 insertions, 3 deletions
diff --git a/src/math/i386/scalbnf.s b/src/math/i386/scalbnf.s
index 40232b6a..9cb9ef5f 100644
--- a/src/math/i386/scalbnf.s
+++ b/src/math/i386/scalbnf.s
@@ -11,10 +11,22 @@ scalblnf:
 .global scalbnf
 .type scalbnf,@function
 scalbnf:
-	fildl 8(%esp)
+	mov 8(%esp),%eax
+	add $0x3fe,%eax
+	cmp $0x7fd,%eax
+	jb 1f
+	sub $0x3fe,%eax
+	sar $31,%eax
+	xor $0x1ff,%eax
+	add $0x3fe,%eax
+1:	inc %eax
+	shl $20,%eax
 	flds 4(%esp)
-	fscale
-	fstp %st(1)
+	mov %eax,8(%esp)
+	xor %eax,%eax
+	mov %eax,4(%esp)
+	fldl 4(%esp)
+	fmulp
 	fstps 4(%esp)
 	flds 4(%esp)
 	ret
author	Rich Felker <dalias@aerifal.cx>	2012-03-20 00:51:32 -0400
committer	Rich Felker <dalias@aerifal.cx>	2012-03-20 00:51:32 -0400
commit	baa43bca0a051e8deb0d6a9a8882ceeea5c27249 (patch)
tree	f5fe7ae916d9039adfe82217716e2aafd08702fb /src/math/i386/scalbnf.s
parent	7513d3ecabb998e2c8c4cb9ed5de48c4b64a166b (diff)
download	musl-baa43bca0a051e8deb0d6a9a8882ceeea5c27249.tar.gz