From 7513d3ecabb998e2c8c4cb9ed5de48c4b64a166b Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@aerifal.cx>
Date: Mon, 19 Mar 2012 23:53:52 -0400
Subject: remquo asm: return quotient mod 8, as intended by the spec

this is a lot more efficient and also what is generally wanted.
perhaps the bit shuffling could be more efficient...
---
 src/math/i386/remquo.s | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

(limited to 'src')

diff --git a/src/math/i386/remquo.s b/src/math/i386/remquo.s
index 86ae2dc3..37a24450 100644
--- a/src/math/i386/remquo.s
+++ b/src/math/i386/remquo.s
@@ -2,40 +2,49 @@
 .type remquof,@function
 remquof:
 	mov 12(%esp),%ecx
-	fldl 4(%esp)
-	fldl 8(%esp)
+	flds 8(%esp)	# y is a float: 4-byte load (fldl read y together with the quo pointer)
+	flds 4(%esp)	# x is a float; leaves st(0)=x, st(1)=y for fprem1
+	mov 11(%esp),%dh	# top byte of y, which holds its sign bit
+	xor 7(%esp),%dh	# bit 7 of dh = sign(x) ^ sign(y)
 	jmp 1f
 
 .global remquol
 .type remquol,@function
 remquol:
 	mov 28(%esp),%ecx
-	fldl 4(%esp)
-	fldl 16(%esp)
+	fldt 16(%esp)	# y is a long double: needs the 80-bit load, not fldl
+	fldt 4(%esp)	# x is a long double; leaves st(0)=x, st(1)=y for fprem1
+	mov 25(%esp),%dh	# byte of y that holds its sign bit
+	xor 13(%esp),%dh	# bit 7 of dh = sign(x) ^ sign(y)
 	jmp 1f
 
 .global remquo
 .type remquo,@function
 remquo:
 	mov 20(%esp),%ecx
-	fldl 4(%esp)
 	fldl 12(%esp)
-1:	fld %st(1)
+	fldl 4(%esp)	# st(0)=x, st(1)=y
+	mov 19(%esp),%dh	# top byte of y, which holds its sign bit
+	xor 11(%esp),%dh	# bit 7 of dh = sign(x) ^ sign(y)
 1:	fprem1
 	fnstsw %ax
 	sahf
 	jp 1b
-	fsubr %st(0),%st(2)
-	fxch %st(2)
-	fdivp
-	mov $0x4f000000,%eax
-	mov %eax,4(%esp)
-	flds 4(%esp)
-	fxch %st(1)
-1:	fprem
-	fnstsw %ax
-	sahf
-	jp 1b
-	fistpl (%ecx)
-	fstp %st(0)
+	fstp %st(1)	# drop y; the remainder stays in st(0) as the return value
+	mov %ah,%dl	# ah = high byte of the FPU status word: C0=bit 0, C1=bit 1, C3=bit 6
+	shr %dl
+	and $1,%dl	# quotient bit 0 <- C1
+	mov %ah,%al
+	shr $5,%al
+	and $2,%al	# quotient bit 1 <- C3
+	or %al,%dl
+	mov %ah,%al
+	shl $2,%al
+	and $4,%al	# quotient bit 2 <- C0
+	or %al,%dl
+	test %dh,%dh
+	jns 1f	# operand signs equal: quotient bits stay non-negative
+	neg %dl
+1:	movsbl %dl,%edx	# sign-extend the signed 3-bit quotient to int
+	mov %edx,(%ecx)	# store through the quo pointer loaded at entry
 	ret
-- 
cgit v1.2.1


From baa43bca0a051e8deb0d6a9a8882ceeea5c27249 Mon Sep 17 00:00:00 2001
From: Rich Felker <dalias@aerifal.cx>
Date: Tue, 20 Mar 2012 00:51:32 -0400
Subject: optimize scalbn family

the fscale instruction is slow everywhere, probably because it
involves a costly and unnecessary integer truncation operation that
ends up being a no-op in common usages. instead, construct a floating
point scale value with integer arithmetic and simply multiply by it,
when possible.

for float and double, this is always possible by going to the
next-larger type. we use some cheap but effective saturating
arithmetic tricks to make sure even very large-magnitude exponents
fit. for long double, if the scaling exponent is too large to fit in
the exponent of a long double value, we simply fall back to the
expensive fscale method.

on atom cpu, these changes speed up scalbn by over 30%. (min rdtsc
timing dropped from 110 cycles to 70 cycles.)
---
 src/math/i386/scalbn.s  | 19 ++++++++++++++++---
 src/math/i386/scalbnf.s | 18 +++++++++++++++---
 src/math/i386/scalbnl.s | 16 +++++++++++++++-
 3 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'src')

diff --git a/src/math/i386/scalbn.s b/src/math/i386/scalbn.s
index e275d14f..8bf302f2 100644
--- a/src/math/i386/scalbn.s
+++ b/src/math/i386/scalbn.s
@@ -11,10 +11,23 @@ scalbln:
 .global scalbn
 .type scalbn,@function
 scalbn:
-	fildl 12(%esp)
+	mov 12(%esp),%eax	# eax = n
+	add $0x3ffe,%eax
+	cmp $0x7ffd,%eax
+	jb 1f	# unsigned compare: taken when -0x3ffe <= n <= 0x3ffe
+	sub $0x3ffe,%eax	# out of range: recover n
+	sar $31,%eax	# 0 if n >= 0, -1 if n < 0
+	xor $0xfff,%eax	# saturate n to +0xfff or -0x1000
+	add $0x3ffe,%eax
+1:	inc %eax	# eax = n + 0x3fff, used below as the exponent word of the scale value
 	fldl 4(%esp)
-	fscale
-	fstp %st(1)
+	mov %eax,12(%esp)	# low 16 bits land in bytes 8-9 of the 10-byte value read by fldt
+	mov $0x80000000,%eax
+	mov %eax,8(%esp)	# high mantissa dword: only the top bit set
+	xor %eax,%eax
+	mov %eax,4(%esp)	# low mantissa dword = 0 (x is already on the FPU stack)
+	fldt 4(%esp)	# load the scale value built over the argument slots
+	fmulp
 	fstpl 4(%esp)
 	fldl 4(%esp)
 	ret
diff --git a/src/math/i386/scalbnf.s b/src/math/i386/scalbnf.s
index 40232b6a..9cb9ef5f 100644
--- a/src/math/i386/scalbnf.s
+++ b/src/math/i386/scalbnf.s
@@ -11,10 +11,22 @@ scalblnf:
 .global scalbnf
 .type scalbnf,@function
 scalbnf:
-	fildl 8(%esp)
+	mov 8(%esp),%eax	# eax = n
+	add $0x3fe,%eax
+	cmp $0x7fd,%eax
+	jb 1f	# unsigned compare: taken when -0x3fe <= n <= 0x3fe
+	sub $0x3fe,%eax	# out of range: recover n
+	sar $31,%eax	# 0 if n >= 0, -1 if n < 0
+	xor $0x1ff,%eax	# saturate n to +0x1ff or -0x200
+	add $0x3fe,%eax
+1:	inc %eax	# eax = n + 0x3ff
+	shl $20,%eax	# shift it up to bit 20 of what becomes the high dword of the scale value
 	flds 4(%esp)
-	fscale
-	fstp %st(1)
+	mov %eax,8(%esp)	# high dword of the 8-byte scale value (overwrites n)
+	xor %eax,%eax
+	mov %eax,4(%esp)	# low dword = 0 (x is already on the FPU stack)
+	fldl 4(%esp)	# load the scale value as a double
+	fmulp
 	fstps 4(%esp)
 	flds 4(%esp)
 	ret
diff --git a/src/math/i386/scalbnl.s b/src/math/i386/scalbnl.s
index 224b1bef..54414c2e 100644
--- a/src/math/i386/scalbnl.s
+++ b/src/math/i386/scalbnl.s
@@ -11,7 +11,21 @@ scalblnl:
 .global scalbnl
 .type scalbnl,@function
 scalbnl:
-	fildl 16(%esp)
+	mov 16(%esp),%eax	# eax = n
+	add $0x3ffe,%eax
+	cmp $0x7ffd,%eax
+	jae 1f	# n outside [-0x3ffe, 0x3ffe]: take the fscale path
+	inc %eax	# eax = n + 0x3fff, used below as the exponent word of the scale value
+	fldt 4(%esp)	# load x before its stack slot is overwritten
+	mov %eax,12(%esp)	# low 16 bits land in bytes 8-9 of the 10-byte value read by fldt
+	mov $0x80000000,%eax
+	mov %eax,8(%esp)	# high mantissa dword: only the top bit set
+	xor %eax,%eax
+	mov %eax,4(%esp)	# low mantissa dword = 0
+	fldt 4(%esp)	# load the scale value built over x's slot
+	fmulp
+	ret
+1:	fildl 16(%esp)	# slow path: push n as the fscale exponent
 	fldt 4(%esp)
 	fscale
 	fstp %st(1)
-- 
cgit v1.2.1