From f697d66b81912af59128ac1b96bc0e2a4514b758 Mon Sep 17 00:00:00 2001
From: nsz <nsz@port70.net>
Date: Sat, 5 May 2012 01:11:56 +0200
Subject: math: change the formula used for acos.s

old: 2*atan2(sqrt(1-x),sqrt(1+x))
new: atan2(fabs(sqrt((1-x)*(1+x))),x)
improvements:
* all edge cases are fixed (sign of zero in downward rounding)
* a bit faster (here a single call is about 131ns vs 162ns)
* a bit more precise (at most 1ulp error on 1M uniform random
samples in [0,1); the old formula gave some 2ulp errors as well)
---
 src/math/i386/acos.s    | 18 ++++++++----------
 src/math/x86_64/acosl.s | 18 ++++++++----------
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/src/math/i386/acos.s b/src/math/i386/acos.s
index bfff0c5c..47f365ef 100644
--- a/src/math/i386/acos.s
+++ b/src/math/i386/acos.s
@@ -1,3 +1,5 @@
+# use acos(x) = atan2(fabs(sqrt((1-x)*(1+x))), x)
+
 .global acosf
 .type acosf,@function
 acosf:
@@ -14,17 +16,13 @@ acosl:
 .type acos,@function
 acos:
 	fldl 4(%esp)
-1:	fld1
-	fld %st(1)
+1:	fld %st(0)   # duplicate the argument: st0 = x, st1 = x
 	fld1
-	fsubp
-	fsqrt
-	fxch %st(2)
-	faddp
+	fsub %st(0),%st(1)   # st1 = 1-x, as the formula needs (relies on gas's AT&T fsub/fsubr reversal for a non-%st(0) destination)
+	fadd %st(2)  # st0 = 1+x  (st0 held 1, st2 still holds x)
+	fmulp        # multiply and pop: st0 = (1-x)*(1+x), st1 = x
 	fsqrt
+	fabs         # fix sign of zero (matters in downward rounding mode)
+	fxch %st(1)  # st0 = x, st1 = sqrt(...): fpatan computes atan2(st1, st0) and pops
 	fpatan
-	fld1
-	fld1
-	faddp
-	fmulp
 	ret
diff --git a/src/math/x86_64/acosl.s b/src/math/x86_64/acosl.s
index db68d2de..88e01b49 100644
--- a/src/math/x86_64/acosl.s
+++ b/src/math/x86_64/acosl.s
@@ -1,18 +1,16 @@
+# see ../i386/acos.s
+
 .global acosl
 .type acosl,@function
 acosl:
 	fldt 8(%rsp)
+1:	fld %st(0)   # duplicate x: st0 = x, st1 = x  (NOTE(review): no branch to label 1 is visible in this file)
 	fld1
-	fld %st(1)
-	fld1
-	fsubp
-	fsqrt
-	fxch %st(2)
-	faddp
+	fsub %st(0),%st(1)   # st1 = 1-x (relies on gas's AT&T fsub/fsubr reversal for a non-%st(0) destination)
+	fadd %st(2)  # st0 = 1+x  (st0 held 1, st2 still holds x)
+	fmulp        # multiply and pop: st0 = (1-x)*(1+x), st1 = x
 	fsqrt
+	fabs         # clear the sign so a -0 square root (possible in downward rounding mode) becomes +0
+	fxch %st(1)  # st0 = x, st1 = sqrt(...): fpatan computes atan2(st1, st0) and pops
 	fpatan
-	fld1
-	fld1
-	faddp
-	fmulp
 	ret
-- 
cgit v1.2.1