123 files changed, 1150 insertions, 650 deletions
diff --git a/src/math/__expo2.c b/src/math/__expo2.c
index 740ac680..248f052b 100644
--- a/src/math/__expo2.c
+++ b/src/math/__expo2.c
@@ -5,12 +5,13 @@ static const int k = 2043;
 static const double kln2 = 0x1.62066151add8bp+10;
 
 /* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */
-double __expo2(double x)
+double __expo2(double x, double sign)
 {
 	double scale;
 
 	/* note that k is odd and scale*scale overflows */
 	INSERT_WORDS(scale, (uint32_t)(0x3ff + k/2) << 20, 0);
 	/* exp(x - k ln2) * 2**(k-1) */
-	return exp(x - kln2) * scale * scale;
+	/* in directed rounding correct sign before rounding or overflow is important */
+	return exp(x - kln2) * (sign * scale) * scale;
 }
diff --git a/src/math/__expo2f.c b/src/math/__expo2f.c
index 5163e418..538eb09c 100644
--- a/src/math/__expo2f.c
+++ b/src/math/__expo2f.c
@@ -5,12 +5,13 @@ static const int k = 235;
 static const float kln2 = 0x1.45c778p+7f;
 
 /* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */
-float __expo2f(float x)
+float __expo2f(float x, float sign)
 {
 	float scale;
 
 	/* note that k is odd and scale*scale overflows */
 	SET_FLOAT_WORD(scale, (uint32_t)(0x7f + k/2) << 23);
 	/* exp(x - k ln2) * 2**(k-1) */
-	return expf(x - kln2) * scale * scale;
+	/* in directed rounding correct sign before rounding or overflow is important */
+	return expf(x - kln2) * (sign * scale) * scale;
 }
diff --git a/src/math/__math_invalidl.c b/src/math/__math_invalidl.c
new file mode 100644
index 00000000..1fca99de
--- /dev/null
+++ b/src/math/__math_invalidl.c
@@ -0,0 +1,9 @@
+#include <float.h>
+#include "libm.h"
+
+#if LDBL_MANT_DIG != DBL_MANT_DIG
+long double __math_invalidl(long double x)
+{
+	return (x - x) / (x - x);
+}
+#endif
diff --git a/src/math/__rem_pio2.c b/src/math/__rem_pio2.c
index d403f81c..dcf672fb 100644
--- a/src/math/__rem_pio2.c
+++ b/src/math/__rem_pio2.c
@@ -36,6 +36,7 @@
  */
 static const double
 toint   = 1.5/EPS,
+pio4    = 0x1.921fb54442d18p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
 pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
@@ -117,11 +118,23 @@ int __rem_pio2(double x, double *y)
 	}
 	if (ix < 0x413921fb) {  /* |x| ~< 2^20*(pi/2), medium size */
 medium:
-		/* rint(x/(pi/2)), Assume round-to-nearest. */
+		/* rint(x/(pi/2)) */
 		fn = (double_t)x*invpio2 + toint - toint;
 		n = (int32_t)fn;
 		r = x - fn*pio2_1;
 		w = fn*pio2_1t;  /* 1st round, good to 85 bits */
+		/* Matters with directed rounding. */
+		if (predict_false(r - w < -pio4)) {
+			n--;
+			fn--;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		} else if (predict_false(r - w > pio4)) {
+			n++;
+			fn++;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		}
 		y[0] = r - w;
 		u.f = y[0];
 		ey = u.i>>52 & 0x7ff;
diff --git a/src/math/__rem_pio2f.c b/src/math/__rem_pio2f.c
index 4473c1c4..e6765643 100644
--- a/src/math/__rem_pio2f.c
+++ b/src/math/__rem_pio2f.c
@@ -35,6 +35,7 @@
  */
 static const double
 toint   = 1.5/EPS,
+pio4    = 0x1.921fb6p-1,
 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
 pio2_1  = 1.57079631090164184570e+00, /* 0x3FF921FB, 0x50000000 */
 pio2_1t = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */
@@ -50,10 +51,20 @@ int __rem_pio2f(float x, double *y)
 	ix = u.i & 0x7fffffff;
 	/* 25+53 bit pi is good enough for medium size */
 	if (ix < 0x4dc90fdb) {  /* |x| ~< 2^28*(pi/2), medium size */
-		/* Use a specialized rint() to get fn.  Assume round-to-nearest. */
+		/* Use a specialized rint() to get fn. */
 		fn = (double_t)x*invpio2 + toint - toint;
 		n  = (int32_t)fn;
 		*y = x - fn*pio2_1 - fn*pio2_1t;
+		/* Matters with directed rounding. */
+		if (predict_false(*y < -pio4)) {
+			n--;
+			fn--;
+			*y = x - fn*pio2_1 - fn*pio2_1t;
+		} else if (predict_false(*y > pio4)) {
+			n++;
+			fn++;
+			*y = x - fn*pio2_1 - fn*pio2_1t;
+		}
 		return n;
 	}
 	if(ix>=0x7f800000) {  /* x is inf or NaN */
diff --git a/src/math/__rem_pio2l.c b/src/math/__rem_pio2l.c
index 77255bd8..236b2def 100644
--- a/src/math/__rem_pio2l.c
+++ b/src/math/__rem_pio2l.c
@@ -44,6 +44,7 @@ pio2_1 =  1.57079632679597125389e+00, /* 0x3FF921FB, 0x54444000 */
 pio2_2 = -1.07463465549783099519e-12, /* -0x12e7b967674000.0p-92 */
 pio2_3 =  6.36831716351370313614e-25; /*  0x18a2e037074000.0p-133 */
 static const long double
+pio4    =  0x1.921fb54442d1846ap-1L,
 invpio2 =  6.36619772367581343076e-01L, /*  0xa2f9836e4e44152a.0p-64 */
 pio2_1t = -1.07463465549719416346e-12L, /* -0x973dcb3b399d747f.0p-103 */
 pio2_2t =  6.36831716351095013979e-25L, /*  0xc51701b839a25205.0p-144 */
@@ -57,6 +58,7 @@ pio2_3t = -2.75299651904407171810e-37L; /* -0xbb5bf6c7ddd660ce.0p-185 */
 #define NX 5
 #define NY 3
 static const long double
+pio4    =  0x1.921fb54442d18469898cc51701b8p-1L,
 invpio2 =  6.3661977236758134307553505349005747e-01L,	/*  0x145f306dc9c882a53f84eafa3ea6a.0p-113 */
 pio2_1  =  1.5707963267948966192292994253909555e+00L,	/*  0x1921fb54442d18469800000000000.0p-112 */
 pio2_1t =  2.0222662487959507323996846200947577e-21L,	/*  0x13198a2e03707344a4093822299f3.0p-181 */
@@ -76,11 +78,23 @@ int __rem_pio2l(long double x, long double *y)
 	u.f = x;
 	ex = u.i.se & 0x7fff;
 	if (SMALL(u)) {
-		/* rint(x/(pi/2)), Assume round-to-nearest. */
+		/* rint(x/(pi/2)) */
 		fn = x*invpio2 + toint - toint;
 		n = QUOBITS(fn);
 		r = x-fn*pio2_1;
 		w = fn*pio2_1t;  /* 1st round good to 102/180 bits (ld80/ld128) */
+		/* Matters with directed rounding. */
+		if (predict_false(r - w < -pio4)) {
+			n--;
+			fn--;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		} else if (predict_false(r - w > pio4)) {
+			n++;
+			fn++;
+			r = x - fn*pio2_1;
+			w = fn*pio2_1t;
+		}
 		y[0] = r-w;
 		u.f = y[0];
 		ey = u.i.se & 0x7fff;
diff --git a/src/math/acoshf.c b/src/math/acoshf.c
index 8a4ec4d5..b773d48e 100644
--- a/src/math/acoshf.c
+++ b/src/math/acoshf.c
@@ -15,12 +15,12 @@ float acoshf(float x)
 	uint32_t a = u.i & 0x7fffffff;
 
 	if (a < 0x3f800000+(1<<23))
-		/* |x| < 2, invalid if x < 1 or nan */
+		/* |x| < 2, invalid if x < 1 */
 		/* up to 2ulp error in [1,1.125] */
 		return log1pf(x-1 + sqrtf((x-1)*(x-1)+2*(x-1)));
-	if (a < 0x3f800000+(12<<23))
-		/* |x| < 0x1p12 */
+	if (u.i < 0x3f800000+(12<<23))
+		/* 2 <= x < 0x1p12 */
 		return logf(2*x - 1/(x+sqrtf(x*x-1)));
-	/* x >= 0x1p12 */
+	/* x >= 0x1p12 or x <= -2 or nan */
 	return logf(x) + 0.693147180559945309417232121458176568f;
 }
diff --git a/src/math/acoshl.c b/src/math/acoshl.c
index 8d4b43f6..943cec17 100644
--- a/src/math/acoshl.c
+++ b/src/math/acoshl.c
@@ -10,14 +10,18 @@ long double acoshl(long double x)
 long double acoshl(long double x)
 {
 	union ldshape u = {x};
-	int e = u.i.se & 0x7fff;
+	int e = u.i.se;
 
 	if (e < 0x3fff + 1)
-		/* |x| < 2, invalid if x < 1 or nan */
+		/* 0 <= x < 2, invalid if x < 1 */
 		return log1pl(x-1 + sqrtl((x-1)*(x-1)+2*(x-1)));
 	if (e < 0x3fff + 32)
-		/* |x| < 0x1p32 */
+		/* 2 <= x < 0x1p32 */
 		return logl(2*x - 1/(x+sqrtl(x*x-1)));
+	if (e & 0x8000)
+		/* x < 0 or x = -0, invalid */
+		return (x - x) / (x - x);
+	/* 0x1p32 <= x or nan */
 	return logl(x) + 0.693147180559945309417232121458176568L;
 }
 #elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384
diff --git a/src/math/arm/fabs.c b/src/math/arm/fabs.c
index f890520a..6e1d367d 100644
--- a/src/math/arm/fabs.c
+++ b/src/math/arm/fabs.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if __ARM_PCS_VFP
+#if __ARM_PCS_VFP && __ARM_FP&8
 
 double fabs(double x)
 {
diff --git a/src/math/arm/sqrt.c b/src/math/arm/sqrt.c
index 874af960..567e2e91 100644
--- a/src/math/arm/sqrt.c
+++ b/src/math/arm/sqrt.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if __ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)
+#if (__ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)) && (__ARM_FP&8)
 
 double sqrt(double x)
 {
diff --git a/src/math/cosh.c b/src/math/cosh.c
index 100f8231..490c15fb 100644
--- a/src/math/cosh.c
+++ b/src/math/cosh.c
@@ -35,6 +35,6 @@ double cosh(double x)
 
 	/* |x| > log(DBL_MAX) or nan */
 	/* note: the result is stored to handle overflow */
-	t = __expo2(x);
+	t = __expo2(x, 1.0);
 	return t;
 }
diff --git a/src/math/coshf.c b/src/math/coshf.c
index b09f2ee5..e739cff9 100644
--- a/src/math/coshf.c
+++ b/src/math/coshf.c
@@ -28,6 +28,6 @@ float coshf(float x)
 	}
 
 	/* |x| > log(FLT_MAX) or nan */
-	t = __expo2f(x);
+	t = __expo2f(x, 1.0f);
 	return t;
 }
diff --git a/src/math/expm1f.c b/src/math/expm1f.c
index 297e0b44..09a41afe 100644
--- a/src/math/expm1f.c
+++ b/src/math/expm1f.c
@@ -16,7 +16,6 @@
 #include "libm.h"
 
 static const float
-o_threshold = 8.8721679688e+01, /* 0x42b17180 */
 ln2_hi      = 6.9313812256e-01, /* 0x3f317180 */
 ln2_lo      = 9.0580006145e-06, /* 0x3717f7d1 */
 invln2      = 1.4426950216e+00, /* 0x3fb8aa3b */
@@ -41,7 +40,7 @@ float expm1f(float x)
 			return x;
 		if (sign)
 			return -1;
-		if (x > o_threshold) {
+		if (hx > 0x42b17217) { /* x > log(FLT_MAX) */
 			x *= 0x1p127f;
 			return x;
 		}
diff --git a/src/math/fma.c b/src/math/fma.c
index 0c6f90c9..adfadca8 100644
--- a/src/math/fma.c
+++ b/src/math/fma.c
@@ -53,7 +53,7 @@ double fma(double x, double y, double z)
 		return x*y + z;
 	if (nz.e >= ZEROINFNAN) {
 		if (nz.e > ZEROINFNAN) /* z==0 */
-			return x*y + z;
+			return x*y;
 		return z;
 	}
 
diff --git a/src/math/fmaf.c b/src/math/fmaf.c
index 80f5cd8a..7c65acf1 100644
--- a/src/math/fmaf.c
+++ b/src/math/fmaf.c
@@ -77,17 +77,16 @@ float fmaf(float x, float y, float z)
 	 * If result is inexact, and exactly halfway between two float values,
 	 * we need to adjust the low-order bit in the direction of the error.
 	 */
-#ifdef FE_TOWARDZERO
-	fesetround(FE_TOWARDZERO);
-#endif
-	volatile double vxy = xy;  /* XXX work around gcc CSE bug */
-	double adjusted_result = vxy + z;
-	fesetround(FE_TONEAREST);
-	if (result == adjusted_result) {
-		u.f = adjusted_result;
+	double err;
+	int neg = u.i >> 63;
+	if (neg == (z > xy))
+		err = xy - result + z;
+	else
+		err = z - result + xy;
+	if (neg == (err < 0))
 		u.i++;
-		adjusted_result = u.f;
-	}
-	z = adjusted_result;
+	else
+		u.i--;
+	z = u.f;
 	return z;
 }
diff --git a/src/math/i386/exp.c b/src/math/i386/exp.c
deleted file mode 100644
index 11282284..00000000
--- a/src/math/i386/exp.c
+++ /dev/null
@@ -1 +0,0 @@
-// see exp_ld.s
diff --git a/src/math/i386/exp2.s b/src/math/i386/exp2.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/exp2.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/exp2f.s b/src/math/i386/exp2f.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/exp2f.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/exp2l.s b/src/math/i386/exp2l.s
index f335a3e5..8125761d 100644
--- a/src/math/i386/exp2l.s
+++ b/src/math/i386/exp2l.s
@@ -1 +1 @@
-# see exp.s
+# see exp_ld.s
diff --git a/src/math/i386/exp_ld.s b/src/math/i386/exp_ld.s
index df87c497..99cba01f 100644
--- a/src/math/i386/exp_ld.s
+++ b/src/math/i386/exp_ld.s
@@ -1,35 +1,8 @@
-.global expm1f
-.type expm1f,@function
-expm1f:
-	flds 4(%esp)
-	mov 4(%esp),%eax
-	add %eax,%eax
-	cmp $0x01000000,%eax
-	jae 1f
-		# subnormal x, return x with underflow
-	fld %st(0)
-	fmul %st(1)
-	fstps 4(%esp)
-	ret
-
 .global expm1l
 .type expm1l,@function
 expm1l:
 	fldt 4(%esp)
-	jmp 1f
-
-.global expm1
-.type expm1,@function
-expm1:
-	fldl 4(%esp)
-	mov 8(%esp),%eax
-	add %eax,%eax
-	cmp $0x00200000,%eax
-	jae 1f
-		# subnormal x, return x with underflow
-	fsts 4(%esp)
-	ret
-1:	fldl2e
+	fldl2e
 	fmulp
 	mov $0xc2820000,%eax
 	push %eax
@@ -59,12 +32,6 @@ expm1:
 	fsubrp
 	ret
 
-.global exp2f
-.type exp2f,@function
-exp2f:
-	flds 4(%esp)
-	jmp 1f
-
 .global exp2l
 .global __exp2l
 .hidden __exp2l
@@ -72,26 +39,6 @@ exp2f:
 exp2l:
 __exp2l:
 	fldt 4(%esp)
-	jmp 1f
-
-.global expf
-.type expf,@function
-expf:
-	flds 4(%esp)
-	jmp 2f
-
-.global exp
-.type exp,@function
-exp:
-	fldl 4(%esp)
-2:	fldl2e
-	fmulp
-	jmp 1f
-
-.global exp2
-.type exp2,@function
-exp2:
-	fldl 4(%esp)
 1:	sub $12,%esp
 	fld %st(0)
 	fstpt (%esp)
diff --git a/src/math/i386/expf.s b/src/math/i386/expf.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/expf.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/expm1.s b/src/math/i386/expm1.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/expm1.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/expm1f.s b/src/math/i386/expm1f.s
deleted file mode 100644
index f335a3e5..00000000
--- a/src/math/i386/expm1f.s
+++ /dev/null
@@ -1 +0,0 @@
-# see exp.s
diff --git a/src/math/i386/expm1l.s b/src/math/i386/expm1l.s
index f335a3e5..8125761d 100644
--- a/src/math/i386/expm1l.s
+++ b/src/math/i386/expm1l.s
@@ -1 +1 @@
-# see exp.s
+# see exp_ld.s
diff --git a/src/math/i386/fabs.c b/src/math/i386/fabs.c
new file mode 100644
index 00000000..39672786
--- /dev/null
+++ b/src/math/i386/fabs.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+double fabs(double x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/fabs.s b/src/math/i386/fabs.s
deleted file mode 100644
index d66ea9a1..00000000
--- a/src/math/i386/fabs.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabs
-.type fabs,@function
-fabs:
-	fldl 4(%esp)
-	fabs
-	ret
diff --git a/src/math/i386/fabsf.c b/src/math/i386/fabsf.c
new file mode 100644
index 00000000..d882eee3
--- /dev/null
+++ b/src/math/i386/fabsf.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+float fabsf(float x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/fabsf.s b/src/math/i386/fabsf.s
deleted file mode 100644
index a981c422..00000000
--- a/src/math/i386/fabsf.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabsf
-.type fabsf,@function
-fabsf:
-	flds 4(%esp)
-	fabs
-	ret
diff --git a/src/math/i386/fabsl.c b/src/math/i386/fabsl.c
new file mode 100644
index 00000000..cc1c9ed9
--- /dev/null
+++ b/src/math/i386/fabsl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double fabsl(long double x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/fabsl.s b/src/math/i386/fabsl.s
deleted file mode 100644
index ceef9e4c..00000000
--- a/src/math/i386/fabsl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabsl
-.type fabsl,@function
-fabsl:
-	fldt 4(%esp)
-	fabs
-	ret
diff --git a/src/math/i386/fmod.c b/src/math/i386/fmod.c
new file mode 100644
index 00000000..ea0c58d9
--- /dev/null
+++ b/src/math/i386/fmod.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+double fmod(double x, double y)
+{
+	unsigned short fpsr;
+	// fprem does not introduce excess precision into x
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/fmod.s b/src/math/i386/fmod.s
deleted file mode 100644
index 2113b3c5..00000000
--- a/src/math/i386/fmod.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmod
-.type fmod,@function
-fmod:
-	fldl 12(%esp)
-	fldl 4(%esp)
-1:	fprem
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/fmodf.c b/src/math/i386/fmodf.c
new file mode 100644
index 00000000..90b56ab0
--- /dev/null
+++ b/src/math/i386/fmodf.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+float fmodf(float x, float y)
+{
+	unsigned short fpsr;
+	// fprem does not introduce excess precision into x
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/fmodf.s b/src/math/i386/fmodf.s
deleted file mode 100644
index e04e2a56..00000000
--- a/src/math/i386/fmodf.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmodf
-.type fmodf,@function
-fmodf:
-	flds 8(%esp)
-	flds 4(%esp)
-1:	fprem
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/fmodl.c b/src/math/i386/fmodl.c
new file mode 100644
index 00000000..3daeab06
--- /dev/null
+++ b/src/math/i386/fmodl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double fmodl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/fmodl.s b/src/math/i386/fmodl.s
deleted file mode 100644
index 0cb3fe9b..00000000
--- a/src/math/i386/fmodl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmodl
-.type fmodl,@function
-fmodl:
-	fldt 16(%esp)
-	fldt 4(%esp)
-1:	fprem
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/llrint.c b/src/math/i386/llrint.c
new file mode 100644
index 00000000..aa400817
--- /dev/null
+++ b/src/math/i386/llrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrint(double x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/llrint.s b/src/math/i386/llrint.s
deleted file mode 100644
index 8e89cd91..00000000
--- a/src/math/i386/llrint.s
+++ /dev/null
@@ -1,8 +0,0 @@
-.global llrint
-.type llrint,@function
-llrint:
-	fldl 4(%esp)
-	fistpll 4(%esp)
-	mov 4(%esp),%eax
-	mov 8(%esp),%edx
-	ret
diff --git a/src/math/i386/llrintf.c b/src/math/i386/llrintf.c
new file mode 100644
index 00000000..c41a317b
--- /dev/null
+++ b/src/math/i386/llrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintf(float x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/llrintf.s b/src/math/i386/llrintf.s
deleted file mode 100644
index aa850c6c..00000000
--- a/src/math/i386/llrintf.s
+++ /dev/null
@@ -1,9 +0,0 @@
-.global llrintf
-.type llrintf,@function
-llrintf:
-	sub $8,%esp
-	flds 12(%esp)
-	fistpll (%esp)
-	pop %eax
-	pop %edx
-	ret
diff --git a/src/math/i386/llrintl.c b/src/math/i386/llrintl.c
new file mode 100644
index 00000000..c439ef28
--- /dev/null
+++ b/src/math/i386/llrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintl(long double x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/llrintl.s b/src/math/i386/llrintl.s
deleted file mode 100644
index 1cfb56f1..00000000
--- a/src/math/i386/llrintl.s
+++ /dev/null
@@ -1,8 +0,0 @@
-.global llrintl
-.type llrintl,@function
-llrintl:
-	fldt 4(%esp)
-	fistpll 4(%esp)
-	mov 4(%esp),%eax
-	mov 8(%esp),%edx
-	ret
diff --git a/src/math/i386/lrint.c b/src/math/i386/lrint.c
new file mode 100644
index 00000000..89563ab2
--- /dev/null
+++ b/src/math/i386/lrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrint(double x)
+{
+	long r;
+	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/lrint.s b/src/math/i386/lrint.s
deleted file mode 100644
index 02b83d9f..00000000
--- a/src/math/i386/lrint.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrint
-.type lrint,@function
-lrint:
-	fldl 4(%esp)
-	fistpl 4(%esp)
-	mov 4(%esp),%eax
-	ret
diff --git a/src/math/i386/lrintf.c b/src/math/i386/lrintf.c
new file mode 100644
index 00000000..0bbf29de
--- /dev/null
+++ b/src/math/i386/lrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintf(float x)
+{
+	long r;
+	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/lrintf.s b/src/math/i386/lrintf.s
deleted file mode 100644
index 907aac29..00000000
--- a/src/math/i386/lrintf.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrintf
-.type lrintf,@function
-lrintf:
-	flds 4(%esp)
-	fistpl 4(%esp)
-	mov 4(%esp),%eax
-	ret
diff --git a/src/math/i386/lrintl.c b/src/math/i386/lrintl.c
new file mode 100644
index 00000000..eb8c0902
--- /dev/null
+++ b/src/math/i386/lrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintl(long double x)
+{
+	long r;
+	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/i386/lrintl.s b/src/math/i386/lrintl.s
deleted file mode 100644
index 3ae05aac..00000000
--- a/src/math/i386/lrintl.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrintl
-.type lrintl,@function
-lrintl:
-	fldt 4(%esp)
-	fistpl 4(%esp)
-	mov 4(%esp),%eax
-	ret
diff --git a/src/math/i386/remainder.c b/src/math/i386/remainder.c
new file mode 100644
index 00000000..c083df90
--- /dev/null
+++ b/src/math/i386/remainder.c
@@ -0,0 +1,12 @@
+#include <math.h>
+
+double remainder(double x, double y)
+{
+	unsigned short fpsr;
+	// fprem1 does not introduce excess precision into x
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
+
+weak_alias(remainder, drem);
diff --git a/src/math/i386/remainder.s b/src/math/i386/remainder.s
deleted file mode 100644
index ab1da95d..00000000
--- a/src/math/i386/remainder.s
+++ /dev/null
@@ -1,14 +0,0 @@
-.global remainder
-.type remainder,@function
-remainder:
-.weak drem
-.type drem,@function
-drem:
-	fldl 12(%esp)
-	fldl 4(%esp)
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/remainderf.c b/src/math/i386/remainderf.c
new file mode 100644
index 00000000..280207d2
--- /dev/null
+++ b/src/math/i386/remainderf.c
@@ -0,0 +1,12 @@
+#include <math.h>
+
+float remainderf(float x, float y)
+{
+	unsigned short fpsr;
+	// fprem1 does not introduce excess precision into x
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
+
+weak_alias(remainderf, dremf);
diff --git a/src/math/i386/remainderf.s b/src/math/i386/remainderf.s
deleted file mode 100644
index 6a7378a3..00000000
--- a/src/math/i386/remainderf.s
+++ /dev/null
@@ -1,14 +0,0 @@
-.global remainderf
-.type remainderf,@function
-remainderf:
-.weak dremf
-.type dremf,@function
-dremf:
-	flds 8(%esp)
-	flds 4(%esp)
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/remainderl.c b/src/math/i386/remainderl.c
new file mode 100644
index 00000000..8cf75071
--- /dev/null
+++ b/src/math/i386/remainderl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double remainderl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/i386/remainderl.s b/src/math/i386/remainderl.s
deleted file mode 100644
index b41518ed..00000000
--- a/src/math/i386/remainderl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global remainderl
-.type remainderl,@function
-remainderl:
-	fldt 16(%esp)
-	fldt 4(%esp)
-1:	fprem1
-	fnstsw %ax
-	sahf
-	jp 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/i386/rint.c b/src/math/i386/rint.c
new file mode 100644
index 00000000..a5276a60
--- /dev/null
+++ b/src/math/i386/rint.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+double rint(double x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/rint.s b/src/math/i386/rint.s
deleted file mode 100644
index bb99a11c..00000000
--- a/src/math/i386/rint.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rint
-.type rint,@function
-rint:
-	fldl 4(%esp)
-	frndint
-	ret
diff --git a/src/math/i386/rintf.c b/src/math/i386/rintf.c
new file mode 100644
index 00000000..bb4121a4
--- /dev/null
+++ b/src/math/i386/rintf.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+float rintf(float x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/rintf.s b/src/math/i386/rintf.s
deleted file mode 100644
index bce4c5a6..00000000
--- a/src/math/i386/rintf.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rintf
-.type rintf,@function
-rintf:
-	flds 4(%esp)
-	frndint
-	ret
diff --git a/src/math/i386/rintl.c b/src/math/i386/rintl.c
new file mode 100644
index 00000000..e1a92077
--- /dev/null
+++ b/src/math/i386/rintl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double rintl(long double x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/rintl.s b/src/math/i386/rintl.s
deleted file mode 100644
index cd2bf9a9..00000000
--- a/src/math/i386/rintl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rintl
-.type rintl,@function
-rintl:
-	fldt 4(%esp)
-	frndint
-	ret
diff --git a/src/math/i386/sqrt.c b/src/math/i386/sqrt.c
new file mode 100644
index 00000000..934fbcca
--- /dev/null
+++ b/src/math/i386/sqrt.c
@@ -0,0 +1,15 @@
+#include "libm.h"
+
+double sqrt(double x)
+{
+	union ldshape ux;
+	unsigned fpsr;
+	__asm__ ("fsqrt; fnstsw %%ax": "=t"(ux.f), "=a"(fpsr) : "0"(x));
+	if ((ux.i.m & 0x7ff) != 0x400)
+		return (double)ux.f;
+	/* Rounding to double would have encountered an exact halfway case.
+	   Adjust mantissa downwards if fsqrt rounded up, else upwards.
+	   (result of fsqrt could not have been exact) */
+	ux.i.m ^= (fpsr & 0x200) + 0x300;
+	return (double)ux.f;
+}
diff --git a/src/math/i386/sqrt.s b/src/math/i386/sqrt.s
deleted file mode 100644
index 57837e25..00000000
--- a/src/math/i386/sqrt.s
+++ /dev/null
@@ -1,21 +0,0 @@
-.global sqrt
-.type sqrt,@function
-sqrt:	fldl 4(%esp)
-	fsqrt
-	fnstsw %ax
-	sub $12,%esp
-	fld %st(0)
-	fstpt (%esp)
-	mov (%esp),%ecx
-	and $0x7ff,%ecx
-	cmp $0x400,%ecx
-	jnz 1f
-	and $0x200,%eax
-	sub $0x100,%eax
-	sub %eax,(%esp)
-	fstp %st(0)
-	fldt (%esp)
-1:	add $12,%esp
-	fstpl 4(%esp)
-	fldl 4(%esp)
-	ret
diff --git a/src/math/i386/sqrtf.c b/src/math/i386/sqrtf.c
new file mode 100644
index 00000000..41c65c2b
--- /dev/null
+++ b/src/math/i386/sqrtf.c
@@ -0,0 +1,12 @@
+#include <math.h>
+
+float sqrtf(float x)
+{
+	long double t;
+	/* The long double result has sufficient precision so that
+	 * second rounding to float still keeps the returned value
+	 * correctly rounded, see Pierre Roux, "Innocuous Double
+	 * Rounding of Basic Arithmetic Operations". */
+	__asm__ ("fsqrt" : "=t"(t) : "0"(x));
+	return (float)t;
+}
diff --git a/src/math/i386/sqrtf.s b/src/math/i386/sqrtf.s
deleted file mode 100644
index 9e944f45..00000000
--- a/src/math/i386/sqrtf.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global sqrtf
-.type sqrtf,@function
-sqrtf:	flds 4(%esp)
-	fsqrt
-	fstps 4(%esp)
-	flds 4(%esp)
-	ret
diff --git a/src/math/i386/sqrtl.c b/src/math/i386/sqrtl.c
new file mode 100644
index 00000000..864cfcc4
--- /dev/null
+++ b/src/math/i386/sqrtl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double sqrtl(long double x)
+{
+	__asm__ ("fsqrt" : "+t"(x));
+	return x;
+}
diff --git a/src/math/i386/sqrtl.s b/src/math/i386/sqrtl.s
deleted file mode 100644
index e0d42616..00000000
--- a/src/math/i386/sqrtl.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global sqrtl
-.type sqrtl,@function
-sqrtl:	fldt 4(%esp)
-	fsqrt
-	ret
diff --git a/src/math/logf.c b/src/math/logf.c
index 7ee5d7fe..e4c2237c 100644
--- a/src/math/logf.c
+++ b/src/math/logf.c
@@ -53,7 +53,7 @@ float logf(float x)
 	tmp = ix - OFF;
 	i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
 	k = (int32_t)tmp >> 23; /* arithmetic shift */
-	iz = ix - (tmp & 0x1ff << 23);
+	iz = ix - (tmp & 0xff800000);
 	invc = T[i].invc;
 	logc = T[i].logc;
 	z = (double_t)asfloat(iz);
diff --git a/src/math/m68k/sqrtl.c b/src/math/m68k/sqrtl.c
new file mode 100644
index 00000000..b1c303c7
--- /dev/null
+++ b/src/math/m68k/sqrtl.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __HAVE_68881__
+
+long double sqrtl(long double x)
+{
+	__asm__ ("fsqrt.x %1,%0" : "=f"(x) : "fm"(x));
+	return x;
+}
+
+#else
+
+#include "../sqrtl.c"
+
+#endif
diff --git a/src/math/powerpc/fabs.c b/src/math/powerpc/fabs.c
index 0efc21ef..9453a3aa 100644
--- a/src/math/powerpc/fabs.c
+++ b/src/math/powerpc/fabs.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if defined(_SOFT_FLOAT) || defined(BROKEN_PPC_D_ASM)
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM)
 
 #include "../fabs.c"
 
diff --git a/src/math/powerpc/fabsf.c b/src/math/powerpc/fabsf.c
index d88b5911..2e9da588 100644
--- a/src/math/powerpc/fabsf.c
+++ b/src/math/powerpc/fabsf.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#ifdef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
 
 #include "../fabsf.c"
 
diff --git a/src/math/powerpc/fma.c b/src/math/powerpc/fma.c
index 135c9903..0eb2ba1e 100644
--- a/src/math/powerpc/fma.c
+++ b/src/math/powerpc/fma.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#if defined(_SOFT_FLOAT) || defined(BROKEN_PPC_D_ASM)
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM)
 
 #include "../fma.c"
 
diff --git a/src/math/powerpc/fmaf.c b/src/math/powerpc/fmaf.c
index a99a2a3b..dc1a749d 100644
--- a/src/math/powerpc/fmaf.c
+++ b/src/math/powerpc/fmaf.c
@@ -1,6 +1,6 @@
 #include <math.h>
 
-#ifdef _SOFT_FLOAT
+#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)
 
 #include "../fmaf.c"
 
diff --git a/src/math/powl.c b/src/math/powl.c
index 5b6da07b..9eb22162 100644
--- a/src/math/powl.c
+++ b/src/math/powl.c
@@ -57,14 +57,6 @@
  *    IEEE     0,8700       60000      6.5e-18      1.0e-18
  * 0.99 < x < 1.01, 0 < y < 8700, uniformly distributed.
  *
- *
- * ERROR MESSAGES:
- *
- *   message         condition      value returned
- * pow overflow     x**y > MAXNUM      INFINITY
- * pow underflow   x**y < 1/MAXNUM       0.0
- * pow domain      x<0 and y noninteger  0.0
- *
  */
 
 #include "libm.h"
@@ -212,25 +204,33 @@ long double powl(long double x, long double y)
 	}
 	if (x == 1.0)
 		return 1.0; /* 1**y = 1, even if y is nan */
-	if (x == -1.0 && !isfinite(y))
-		return 1.0; /* -1**inf = 1 */
 	if (y == 0.0)
 		return 1.0; /* x**0 = 1, even if x is nan */
 	if (y == 1.0)
 		return x;
-	if (y >= LDBL_MAX) {
-		if (x > 1.0 || x < -1.0)
-			return INFINITY;
-		if (x != 0.0)
-			return 0.0;
-	}
-	if (y <= -LDBL_MAX) {
-		if (x > 1.0 || x < -1.0)
+	/* if y*log2(x) < log2(LDBL_TRUE_MIN)-1 then x^y uflows to 0
+	   if y*log2(x) > -log2(LDBL_TRUE_MIN)+1 > LDBL_MAX_EXP then x^y oflows
+	   if |x|!=1 then |log2(x)| > |log(x)| > LDBL_EPSILON/2 so
+	   x^y oflows/uflows if |y|*LDBL_EPSILON/2 > -log2(LDBL_TRUE_MIN)+1 */
+	if (fabsl(y) > 2*(-LDBL_MIN_EXP+LDBL_MANT_DIG+1)/LDBL_EPSILON) {
+		/* y is not an odd int */
+		if (x == -1.0)
+			return 1.0;
+		if (y == INFINITY) {
+			if (x > 1.0 || x < -1.0)
+				return INFINITY;
 			return 0.0;
-		if (x != 0.0 || y == -INFINITY)
+		}
+		if (y == -INFINITY) {
+			if (x > 1.0 || x < -1.0)
+				return 0.0;
 			return INFINITY;
+		}
+		if ((x > 1.0 || x < -1.0) == (y > 0))
+			return huge * huge;
+		return twom10000 * twom10000;
 	}
-	if (x >= LDBL_MAX) {
+	if (x == INFINITY) {
 		if (y > 0.0)
 			return INFINITY;
 		return 0.0;
@@ -253,7 +253,7 @@ long double powl(long double x, long double y)
 			yoddint = 1;
 	}
 
-	if (x <= -LDBL_MAX) {
+	if (x == -INFINITY) {
 		if (y > 0.0) {
 			if (yoddint)
 				return -INFINITY;
diff --git a/src/math/riscv32/copysign.c b/src/math/riscv32/copysign.c
new file mode 100644
index 00000000..c7854178
--- /dev/null
+++ b/src/math/riscv32/copysign.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double copysign(double x, double y)
+{
+	__asm__ ("fsgnj.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../copysign.c"
+
+#endif
diff --git a/src/math/riscv32/copysignf.c b/src/math/riscv32/copysignf.c
new file mode 100644
index 00000000..a125611a
--- /dev/null
+++ b/src/math/riscv32/copysignf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float copysignf(float x, float y)
+{
+	__asm__ ("fsgnj.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../copysignf.c"
+
+#endif
diff --git a/src/math/riscv32/fabs.c b/src/math/riscv32/fabs.c
new file mode 100644
index 00000000..5290b6f0
--- /dev/null
+++ b/src/math/riscv32/fabs.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fabs(double x)
+{
+	__asm__ ("fabs.d %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../fabs.c"
+
+#endif
diff --git a/src/math/riscv32/fabsf.c b/src/math/riscv32/fabsf.c
new file mode 100644
index 00000000..f5032e35
--- /dev/null
+++ b/src/math/riscv32/fabsf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fabsf(float x)
+{
+	__asm__ ("fabs.s %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../fabsf.c"
+
+#endif
diff --git a/src/math/riscv32/fma.c b/src/math/riscv32/fma.c
new file mode 100644
index 00000000..99b05713
--- /dev/null
+++ b/src/math/riscv32/fma.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fma(double x, double y, double z)
+{
+	__asm__ ("fmadd.d %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z));
+	return x;
+}
+
+#else
+
+#include "../fma.c"
+
+#endif
diff --git a/src/math/riscv32/fmaf.c b/src/math/riscv32/fmaf.c
new file mode 100644
index 00000000..f9dc47ed
--- /dev/null
+++ b/src/math/riscv32/fmaf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fmaf(float x, float y, float z)
+{
+	__asm__ ("fmadd.s %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z));
+	return x;
+}
+
+#else
+
+#include "../fmaf.c"
+
+#endif
diff --git a/src/math/riscv32/fmax.c b/src/math/riscv32/fmax.c
new file mode 100644
index 00000000..023709cd
--- /dev/null
+++ b/src/math/riscv32/fmax.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fmax(double x, double y)
+{
+	__asm__ ("fmax.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fmax.c"
+
+#endif
diff --git a/src/math/riscv32/fmaxf.c b/src/math/riscv32/fmaxf.c
new file mode 100644
index 00000000..863d2bd1
--- /dev/null
+++ b/src/math/riscv32/fmaxf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fmaxf(float x, float y)
+{
+	__asm__ ("fmax.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fmaxf.c"
+
+#endif
diff --git a/src/math/riscv32/fmin.c b/src/math/riscv32/fmin.c
new file mode 100644
index 00000000..a4e3b067
--- /dev/null
+++ b/src/math/riscv32/fmin.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double fmin(double x, double y)
+{
+	__asm__ ("fmin.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fmin.c"
+
+#endif
diff --git a/src/math/riscv32/fminf.c b/src/math/riscv32/fminf.c
new file mode 100644
index 00000000..32156e80
--- /dev/null
+++ b/src/math/riscv32/fminf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float fminf(float x, float y)
+{
+	__asm__ ("fmin.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y));
+	return x;
+}
+
+#else
+
+#include "../fminf.c"
+
+#endif
diff --git a/src/math/riscv32/sqrt.c b/src/math/riscv32/sqrt.c
new file mode 100644
index 00000000..867a504c
--- /dev/null
+++ b/src/math/riscv32/sqrt.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 64
+
+double sqrt(double x)
+{
+	__asm__ ("fsqrt.d %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../sqrt.c"
+
+#endif
diff --git a/src/math/riscv32/sqrtf.c b/src/math/riscv32/sqrtf.c
new file mode 100644
index 00000000..610c2cf8
--- /dev/null
+++ b/src/math/riscv32/sqrtf.c
@@ -0,0 +1,15 @@
+#include <math.h>
+
+#if __riscv_flen >= 32
+
+float sqrtf(float x)
+{
+	__asm__ ("fsqrt.s %0, %1" : "=f"(x) : "f"(x));
+	return x;
+}
+
+#else
+
+#include "../sqrtf.c"
+
+#endif
diff --git a/src/math/sinh.c b/src/math/sinh.c
index 00022c4e..a01951ae 100644
--- a/src/math/sinh.c
+++ b/src/math/sinh.c
@@ -34,6 +34,6 @@ double sinh(double x)
 
 	/* |x| > log(DBL_MAX) or nan */
 	/* note: the result is stored to handle overflow */
-	t = 2*h*__expo2(absx);
+	t = __expo2(absx, 2*h);
 	return t;
 }
diff --git a/src/math/sinhf.c b/src/math/sinhf.c
index 6ad19ea2..b9caa793 100644
--- a/src/math/sinhf.c
+++ b/src/math/sinhf.c
@@ -26,6 +26,6 @@ float sinhf(float x)
 	}
 
 	/* |x| > logf(FLT_MAX) or nan */
-	t = 2*h*__expo2f(absx);
+	t = __expo2f(absx, 2*h);
 	return t;
 }
diff --git a/src/math/sqrt.c b/src/math/sqrt.c
index f1f6d76c..5ba26559 100644
--- a/src/math/sqrt.c
+++ b/src/math/sqrt.c
@@ -1,184 +1,158 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrt.c */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-/* sqrt(x)
- * Return correctly rounded sqrt.
- *           ------------------------------------------
- *           |  Use the hardware sqrt if you have one |
- *           ------------------------------------------
- * Method:
- *   Bit by bit method using integer arithmetic. (Slow, but portable)
- *   1. Normalization
- *      Scale x to y in [1,4) with even powers of 2:
- *      find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
- *              sqrt(x) = 2^k * sqrt(y)
- *   2. Bit by bit computation
- *      Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
- *           i                                                   0
- *                                     i+1         2
- *          s  = 2*q , and      y  =  2   * ( y - q  ).         (1)
- *           i      i            i                 i
- *
- *      To compute q    from q , one checks whether
- *                  i+1       i
- *
- *                            -(i+1) 2
- *                      (q + 2      ) <= y.                     (2)
- *                        i
- *                                                            -(i+1)
- *      If (2) is false, then q   = q ; otherwise q   = q  + 2      .
- *                             i+1   i             i+1   i
- *
- *      With some algebric manipulation, it is not difficult to see
- *      that (2) is equivalent to
- *                             -(i+1)
- *                      s  +  2       <= y                      (3)
- *                       i                i
- *
- *      The advantage of (3) is that s  and y  can be computed by
- *                                    i      i
- *      the following recurrence formula:
- *          if (3) is false
- *
- *          s     =  s  ,       y    = y   ;                    (4)
- *           i+1      i          i+1    i
- *
- *          otherwise,
- *                         -i                     -(i+1)
- *          s     =  s  + 2  ,  y    = y  -  s  - 2             (5)
- *           i+1      i          i+1    i     i
- *
- *      One may easily use induction to prove (4) and (5).
- *      Note. Since the left hand side of (3) contain only i+2 bits,
- *            it does not necessary to do a full (53-bit) comparison
- *            in (3).
- *   3. Final rounding
- *      After generating the 53 bits result, we compute one more bit.
- *      Together with the remainder, we can decide whether the
- *      result is exact, bigger than 1/2ulp, or less than 1/2ulp
- *      (it will never equal to 1/2ulp).
- *      The rounding mode can be detected by checking whether
- *      huge + tiny is equal to huge, and whether huge - tiny is
- *      equal to huge for some floating point number "huge" and "tiny".
- *
- * Special cases:
- *      sqrt(+-0) = +-0         ... exact
- *      sqrt(inf) = inf
- *      sqrt(-ve) = NaN         ... with invalid signal
- *      sqrt(NaN) = NaN         ... with invalid signal for signaling NaN
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const double tiny = 1.0e-300;
+#define FENV_SUPPORT 1
 
-double sqrt(double x)
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	double z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix0,s0,q,m,t,i;
-	uint32_t r,t1,s1,ix1,q1;
+	return (uint64_t)a*b >> 32;
+}
 
-	EXTRACT_WORDS(ix0, ix1, x);
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
 
-	/* take care of Inf and NaN */
-	if ((ix0&0x7ff00000) == 0x7ff00000) {
-		return x*x + x;  /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
-	}
-	/* take care of zero */
-	if (ix0 <= 0) {
-		if (((ix0&~sign)|ix1) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix0 < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix0>>20;
-	if (m == 0) {  /* subnormal x */
-		while (ix0 == 0) {
-			m -= 21;
-			ix0 |= (ix1>>11);
-			ix1 <<= 21;
-		}
-		for (i=0; (ix0&0x00100000) == 0; i++)
-			ix0<<=1;
-		m -= i - 1;
-		ix0 |= ix1>>(32-i);
-		ix1 <<= i;
-	}
-	m -= 1023;    /* unbias exponent */
-	ix0 = (ix0&0x000fffff)|0x00100000;
-	if (m & 1) {  /* odd m, double x to make it even */
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-	}
-	m >>= 1;      /* m = [m/2] */
-
-	/* generate sqrt(x) bit by bit */
-	ix0 += ix0 + ((ix1&sign)>>31);
-	ix1 += ix1;
-	q = q1 = s0 = s1 = 0;  /* [q,q1] = sqrt(x) */
-	r = 0x00200000;        /* r = moving bit from right to left */
-
-	while (r != 0) {
-		t = s0 + r;
-		if (t <= ix0) {
-			s0   = t + r;
-			ix0 -= t;
-			q   += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
-	}
+double sqrt(double x)
+{
+	uint64_t ix, top, m;
 
-	r = sign;
-	while (r != 0) {
-		t1 = s1 + r;
-		t  = s0;
-		if (t < ix0 || (t == ix0 && t1 <= ix1)) {
-			s1 = t1 + r;
-			if ((t1&sign) == sign && (s1&sign) == 0)
-				s0++;
-			ix0 -= t;
-			if (ix1 < t1)
-				ix0--;
-			ix1 -= t1;
-			q1 += r;
-		}
-		ix0 += ix0 + ((ix1&sign)>>31);
-		ix1 += ix1;
-		r >>= 1;
+	/* special case handling.  */
+	ix = asuint64(x);
+	top = ix >> 52;
+	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) {
+		/* x < 0x1p-1022 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7ff0000000000000)
+			return x;
+		if (ix > 0x7ff0000000000000)
+			return __math_invalid(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint64(x * 0x1p52);
+		top = ix >> 52;
+		top -= 52;
 	}
 
-	/* use floating add to find out rounding direction */
-	if ((ix0|ix1) != 0) {
-		z = 1.0 - tiny; /* raise inexact flag */
-		if (z >= 1.0) {
-			z = 1.0 + tiny;
-			if (q1 == (uint32_t)0xffffffff) {
-				q1 = 0;
-				q++;
-			} else if (z > 1.0) {
-				if (q1 == (uint32_t)0xfffffffe)
-					q++;
-				q1 += 2;
-			} else
-				q1 += q1 & 1;
-		}
+	/* argument reduction:
+	   x = 4^e m; with integer e, and m in [1, 4)
+	   m: fixed point representation [2.62]
+	   2^e is the exponent part of the result.  */
+	int even = top & 1;
+	m = (ix << 11) | 0x8000000000000000;
+	if (even) m >>= 1;
+	top = (top + 0x3ff) >> 1;
+
+	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4)
+
+	   initial estimate:
+	   7bit table lookup (1bit exponent and 6bit significand).
+
+	   iterative approximation:
+	   using 2 goldschmidt iterations with 32bit int arithmetics
+	   and a final iteration with 64bit int arithmetics.
+
+	   details:
+
+	   the relative error (e = r0 sqrt(m)-1) of a linear estimate
+	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best,
+	   a table lookup is faster and needs one less iteration
+	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8
+	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9
+	   for single and double prec 6bit is enough but for quad
+	   prec 7bit is needed (or modified iterations). to avoid
+	   one more iteration >=13bit table would be needed (16k).
+
+	   a newton-raphson iteration for r is
+	     w = r*r
+	     u = 3 - m*w
+	     r = r*u/2
+	   can use a goldschmidt iteration for s at the end or
+	     s = m*r
+
+	   first goldschmidt iteration is
+	     s = m*r
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   next goldschmidt iteration is
+	     u = 3 - s*r
+	     r = r*u/2
+	     s = s*u/2
+	   and at the end r is not computed only s.
+
+	   they use the same amount of operations and converge at the
+	   same quadratic rate, i.e. if
+	     r1 sqrt(m) - 1 = e, then
+	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3
+	   the advantage of goldschmidt is that the mul for s and r
+	   are independent (computed in parallel), however it is not
+	   "self synchronizing": it only uses the input m in the
+	   first iteration so rounding errors accumulate. at the end
+	   or when switching to larger precision arithmetics rounding
+	   errors dominate so the first iteration should be used.
+
+	   the fixed point representations are
+	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30
+	   and after switching to 64 bit
+	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */
+
+	static const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+
+	i = (ix >> 46) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1.fdp-9 */
+	s = mul32(m>>32, r);
+	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */
+	r = r << 32;
+	s = mul64(m, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	s = mul64(s, u);  /* repr: 3.61 */
+	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */
+	s = (s - 2) >> 9; /* repr: 12.52 */
+	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */
+
+	/* s < sqrt(m) < s + 0x1.09p-52,
+	   compute nearest rounded result:
+	   the nearest result to 52 bits is either s or s+0x1p-52,
+	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */
+	uint64_t d0, d1, d2;
+	double y, t;
+	d0 = (m << 42) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 63;
+	s &= 0x000fffffffffffff;
+	s |= top << 52;
+	y = asdouble(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception:
+		   only (s+1)^2 == 2^42 m case is exact otherwise
+		   add a tiny value to cause the fenv effects.  */
+		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000;
+		tiny |= (d1^d2) & 0x8000000000000000;
+		t = asdouble(tiny);
+		y = eval_as_double(y + t);
 	}
-	ix0 = (q>>1) + 0x3fe00000;
-	ix1 = q1>>1;
-	if (q&1)
-		ix1 |= sign;
-	INSERT_WORDS(z, ix0 + ((uint32_t)m << 20), ix1);
-	return z;
+	return y;
 }
diff --git a/src/math/sqrt_data.c b/src/math/sqrt_data.c
new file mode 100644
index 00000000..61bc22f4
--- /dev/null
+++ b/src/math/sqrt_data.c
@@ -0,0 +1,19 @@
+#include "sqrt_data.h"
+const uint16_t __rsqrt_tab[128] = {
+0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43,
+0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b,
+0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1,
+0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430,
+0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59,
+0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925,
+0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479,
+0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040,
+0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234,
+0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2,
+0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1,
+0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192,
+0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f,
+0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4,
+0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59,
+0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560,
+};
diff --git a/src/math/sqrt_data.h b/src/math/sqrt_data.h
new file mode 100644
index 00000000..260c7f9c
--- /dev/null
+++ b/src/math/sqrt_data.h
@@ -0,0 +1,13 @@
+#ifndef _SQRT_DATA_H
+#define _SQRT_DATA_H
+
+#include <features.h>
+#include <stdint.h>
+
+/* if x in [1,2): i = (int)(64*x);
+   if x in [2,4): i = (int)(32*x-64);
+   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error:
+   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */
+extern hidden const uint16_t __rsqrt_tab[128];
+
+#endif
diff --git a/src/math/sqrtf.c b/src/math/sqrtf.c
index d6ace38a..740d81cb 100644
--- a/src/math/sqrtf.c
+++ b/src/math/sqrtf.c
@@ -1,83 +1,83 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
+#include <stdint.h>
+#include <math.h>
 #include "libm.h"
+#include "sqrt_data.h"
 
-static const float tiny = 1.0e-30;
+#define FENV_SUPPORT 1
 
-float sqrtf(float x)
+static inline uint32_t mul32(uint32_t a, uint32_t b)
 {
-	float z;
-	int32_t sign = (int)0x80000000;
-	int32_t ix,s,q,m,t,i;
-	uint32_t r;
+	return (uint64_t)a*b >> 32;
+}
 
-	GET_FLOAT_WORD(ix, x);
+/* see sqrt.c for more detailed comments.  */
 
-	/* take care of Inf and NaN */
-	if ((ix&0x7f800000) == 0x7f800000)
-		return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */
+float sqrtf(float x)
+{
+	uint32_t ix, m, m1, m0, even, ey;
 
-	/* take care of zero */
-	if (ix <= 0) {
-		if ((ix&~sign) == 0)
-			return x;  /* sqrt(+-0) = +-0 */
-		if (ix < 0)
-			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */
-	}
-	/* normalize x */
-	m = ix>>23;
-	if (m == 0) {  /* subnormal x */
-		for (i = 0; (ix&0x00800000) == 0; i++)
-			ix<<=1;
-		m -= i - 1;
+	ix = asuint(x);
+	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) {
+		/* x < 0x1p-126 or inf or nan.  */
+		if (ix * 2 == 0)
+			return x;
+		if (ix == 0x7f800000)
+			return x;
+		if (ix > 0x7f800000)
+			return __math_invalidf(x);
+		/* x is subnormal, normalize it.  */
+		ix = asuint(x * 0x1p23f);
+		ix -= 23 << 23;
 	}
-	m -= 127;  /* unbias exponent */
-	ix = (ix&0x007fffff)|0x00800000;
-	if (m&1)  /* odd m, double x to make it even */
-		ix += ix;
-	m >>= 1;  /* m = [m/2] */
 
-	/* generate sqrt(x) bit by bit */
-	ix += ix;
-	q = s = 0;       /* q = sqrt(x) */
-	r = 0x01000000;  /* r = moving bit from right to left */
+	/* x = 4^e m; with int e and m in [1, 4).  */
+	even = ix & 0x00800000;
+	m1 = (ix << 8) | 0x80000000;
+	m0 = (ix << 7) & 0x7fffffff;
+	m = even ? m0 : m1;
 
-	while (r != 0) {
-		t = s + r;
-		if (t <= ix) {
-			s = t+r;
-			ix -= t;
-			q += r;
-		}
-		ix += ix;
-		r >>= 1;
-	}
+	/* 2^e is the exponent part of the return value.  */
+	ey = ix >> 1;
+	ey += 0x3f800000 >> 1;
+	ey &= 0x7f800000;
+
+	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  */
+	static const uint32_t three = 0xc0000000;
+	uint32_t r, s, d, u, i;
+	i = (ix >> 17) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r*sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(m, r);
+	/* |s/sqrt(m) - 1| < 0x1p-8 */
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(r, u) << 1;
+	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */
+	s = mul32(s, u) << 1;
+	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */
+	d = mul32(s, r);
+	u = three - d;
+	s = mul32(s, u);
+	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */
+	s = (s - 1)>>6;
+	/* s < sqrt(m) < s + 0x1.08p-23 */
 
-	/* use floating add to find out rounding direction */
-	if (ix != 0) {
-		z = 1.0f - tiny; /* raise inexact flag */
-		if (z >= 1.0f) {
-			z = 1.0f + tiny;
-			if (z > 1.0f)
-				q += 2;
-			else
-				q += q & 1;
-		}
+	/* compute nearest rounded result.  */
+	uint32_t d0, d1, d2;
+	float y, t;
+	d0 = (m << 16) - s*s;
+	d1 = s - d0;
+	d2 = d1 + s + 1;
+	s += d1 >> 31;
+	s &= 0x007fffff;
+	s |= ey;
+	y = asfloat(s);
+	if (FENV_SUPPORT) {
+		/* handle rounding and inexact exception. */
+		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000;
+		tiny |= (d1^d2) & 0x80000000;
+		t = asfloat(tiny);
+		y = eval_as_float(y + t);
 	}
-	ix = (q>>1) + 0x3f000000;
-	SET_FLOAT_WORD(z, ix + ((uint32_t)m << 23));
-	return z;
+	return y;
 }
diff --git a/src/math/sqrtl.c b/src/math/sqrtl.c
index 83a8f80c..a231b3f2 100644
--- a/src/math/sqrtl.c
+++ b/src/math/sqrtl.c
@@ -1,7 +1,259 @@
+#include <stdint.h>
 #include <math.h>
+#include <float.h>
+#include "libm.h"
 
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
 long double sqrtl(long double x)
 {
-	/* FIXME: implement in C, this is for LDBL_MANT_DIG == 64 only */
 	return sqrt(x);
 }
+#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384
+#include "sqrt_data.h"
+
+#define FENV_SUPPORT 1
+
+typedef struct {
+	uint64_t hi;
+	uint64_t lo;
+} u128;
+
+/* top: 16 bit sign+exponent, x: significand.  */
+static inline long double mkldbl(uint64_t top, u128 x)
+{
+	union ldshape u;
+#if LDBL_MANT_DIG == 113
+	u.i2.hi = x.hi;
+	u.i2.lo = x.lo;
+	u.i2.hi &= 0x0000ffffffffffff;
+	u.i2.hi |= top << 48;
+#elif LDBL_MANT_DIG == 64
+	u.i.se = top;
+	u.i.m = x.lo;
+	/* force the top bit on non-zero (and non-subnormal) results.  */
+	if (top & 0x7fff)
+		u.i.m |= 0x8000000000000000;
+#endif
+	return u.f;
+}
+
+/* return: top 16 bit is sign+exp and following bits are the significand.  */
+static inline u128 asu128(long double x)
+{
+	union ldshape u = {.f=x};
+	u128 r;
+#if LDBL_MANT_DIG == 113
+	r.hi = u.i2.hi;
+	r.lo = u.i2.lo;
+#elif LDBL_MANT_DIG == 64
+	r.lo = u.i.m<<49;
+	/* ignore the top bit: pseudo numbers are not handled. */
+	r.hi = u.i.m>>15;
+	r.hi &= 0x0000ffffffffffff;
+	r.hi |= (uint64_t)u.i.se << 48;
+#endif
+	return r;
+}
+
+/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */
+static inline uint32_t mul32(uint32_t a, uint32_t b)
+{
+	return (uint64_t)a*b >> 32;
+}
+
+/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */
+static inline uint64_t mul64(uint64_t a, uint64_t b)
+{
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32);
+}
+
+static inline u128 add64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo + b;
+	r.hi = a.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 add128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo + b.lo;
+	r.hi = a.hi + b.hi;
+	if (r.lo < a.lo)
+		r.hi++;
+	return r;
+}
+
+static inline u128 sub64(u128 a, uint64_t b)
+{
+	u128 r;
+	r.lo = a.lo - b;
+	r.hi = a.hi;
+	if (a.lo < b)
+		r.hi--;
+	return r;
+}
+
+static inline u128 sub128(u128 a, u128 b)
+{
+	u128 r;
+	r.lo = a.lo - b.lo;
+	r.hi = a.hi - b.hi;
+	if (a.lo < b.lo)
+		r.hi--;
+	return r;
+}
+
+/* a<<n, 0 <= n <= 127 */
+static inline u128 lsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.hi = a.lo<<(n-64);
+		a.lo = 0;
+	} else {
+		a.hi = (a.hi<<n) | (a.lo>>(64-n));
+		a.lo = a.lo<<n;
+	}
+	return a;
+}
+
+/* a>>n, 0 <= n <= 127 */
+static inline u128 rsh(u128 a, int n)
+{
+	if (n == 0)
+		return a;
+	if (n >= 64) {
+		a.lo = a.hi>>(n-64);
+		a.hi = 0;
+	} else {
+		a.lo = (a.lo>>n) | (a.hi<<(64-n));
+		a.hi = a.hi>>n;
+	}
+	return a;
+}
+
+/* returns a*b exactly.  */
+static inline u128 mul64_128(uint64_t a, uint64_t b)
+{
+	u128 r;
+	uint64_t ahi = a>>32;
+	uint64_t alo = a&0xffffffff;
+	uint64_t bhi = b>>32;
+	uint64_t blo = b&0xffffffff;
+	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32);
+	uint64_t lo2 = (alo*blo)&0xffffffff;
+	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32);
+	r.lo = (lo1<<32) + lo2;
+	return r;
+}
+
+/* returns a*b*2^-128 - e, with error 0 <= e < 7.  */
+static inline u128 mul128(u128 a, u128 b)
+{
+	u128 hi = mul64_128(a.hi, b.hi);
+	uint64_t m1 = mul64(a.hi, b.lo);
+	uint64_t m2 = mul64(a.lo, b.hi);
+	return add64(add64(hi, m1), m2);
+}
+
+/* returns a*b % 2^128.  */
+static inline u128 mul128_tail(u128 a, u128 b)
+{
+	u128 lo = mul64_128(a.lo, b.lo);
+	lo.hi += a.hi*b.lo + a.lo*b.hi;
+	return lo;
+}
+
+
+/* see sqrt.c for detailed comments.  */
+
+long double sqrtl(long double x)
+{
+	u128 ix, ml;
+	uint64_t top;
+
+	ix = asu128(x);
+	top = ix.hi >> 48;
+	if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) {
+		/* x < 0x1p-16382 or inf or nan.  */
+		if (2*ix.hi == 0 && ix.lo == 0)
+			return x;
+		if (ix.hi == 0x7fff000000000000 && ix.lo == 0)
+			return x;
+		if (top >= 0x7fff)
+			return __math_invalidl(x);
+		/* x is subnormal, normalize it.  */
+		ix = asu128(x * 0x1p112);
+		top = ix.hi >> 48;
+		top -= 112;
+	}
+
+	/* x = 4^e m; with int e and m in [1, 4) */
+	int even = top & 1;
+	ml = lsh(ix, 15);
+	ml.hi |= 0x8000000000000000;
+	if (even) ml = rsh(ml, 1);
+	top = (top + 0x3fff) >> 1;
+
+	/* r ~ 1/sqrt(m) */
+	const uint64_t three = 0xc0000000;
+	uint64_t r, s, d, u, i;
+	i = (ix.hi >> 42) % 128;
+	r = (uint32_t)__rsqrt_tab[i] << 16;
+	/* |r sqrt(m) - 1| < 0x1p-8 */
+	s = mul32(ml.hi>>32, r);
+	d = mul32(s, r);
+	u = three - d;
+	r = mul32(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */
+	r = r<<32;
+	s = mul64(ml.hi, r);
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.a5p-31 */
+	s = mul64(u, s) << 1;
+	d = mul64(s, r);
+	u = (three<<32) - d;
+	r = mul64(u, r) << 1;
+	/* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */
+
+	const u128 threel = {.hi=three<<32, .lo=0};
+	u128 rl, sl, dl, ul;
+	rl.hi = r;
+	rl.lo = 0;
+	sl = mul128(ml, rl);
+	dl = mul128(sl, rl);
+	ul = sub128(threel, dl);
+	sl = mul128(ul, sl); /* repr: 3.125 */
+	/* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */
+	sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1));
+	/* s < sqrt(m) < s + 1 ULP + tiny */
+
+	long double y;
+	u128 d2, d1, d0;
+	d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl));
+	d1 = sub128(sl, d0);
+	d2 = add128(add64(sl, 1), d1);
+	sl = add64(sl, d1.hi >> 63);
+	y = mkldbl(top, sl);
+	if (FENV_SUPPORT) {
+		/* handle rounding modes and inexact exception.  */
+		top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1;
+		top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48;
+		y += mkldbl(top, (u128){0});
+	}
+	return y;
+}
+#else
+#error unsupported long double format
+#endif
diff --git a/src/math/x86_64/fabs.c b/src/math/x86_64/fabs.c
new file mode 100644
index 00000000..16562477
--- /dev/null
+++ b/src/math/x86_64/fabs.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+double fabs(double x)
+{
+	double t;
+	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0
+	__asm__ ("psrlq   $1, %0" : "+x"(t));          // t >>= 1
+	__asm__ ("andps   %1, %0" : "+x"(x) : "x"(t)); // x &= t
+	return x;
+}
diff --git a/src/math/x86_64/fabs.s b/src/math/x86_64/fabs.s
deleted file mode 100644
index 5715005e..00000000
--- a/src/math/x86_64/fabs.s
+++ /dev/null
@@ -1,9 +0,0 @@
-.global fabs
-.type fabs,@function
-fabs:
-	xor %eax,%eax
-	dec %rax
-	shr %rax
-	movq %rax,%xmm1
-	andpd %xmm1,%xmm0
-	ret
diff --git a/src/math/x86_64/fabsf.c b/src/math/x86_64/fabsf.c
new file mode 100644
index 00000000..36ea7481
--- /dev/null
+++ b/src/math/x86_64/fabsf.c
@@ -0,0 +1,10 @@
+#include <math.h>
+
+float fabsf(float x)
+{
+	float t;
+	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0
+	__asm__ ("psrld   $1, %0" : "+x"(t));          // t >>= 1
+	__asm__ ("andps   %1, %0" : "+x"(x) : "x"(t)); // x &= t
+	return x;
+}
diff --git a/src/math/x86_64/fabsf.s b/src/math/x86_64/fabsf.s
deleted file mode 100644
index 501a1f17..00000000
--- a/src/math/x86_64/fabsf.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global fabsf
-.type fabsf,@function
-fabsf:
-	mov $0x7fffffff,%eax
-	movq %rax,%xmm1
-	andps %xmm1,%xmm0
-	ret
diff --git a/src/math/x86_64/fabsl.c b/src/math/x86_64/fabsl.c
new file mode 100644
index 00000000..cc1c9ed9
--- /dev/null
+++ b/src/math/x86_64/fabsl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double fabsl(long double x)
+{
+	__asm__ ("fabs" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/fabsl.s b/src/math/x86_64/fabsl.s
deleted file mode 100644
index 4e7ab525..00000000
--- a/src/math/x86_64/fabsl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global fabsl
-.type fabsl,@function
-fabsl:
-	fldt 8(%rsp)
-	fabs
-	ret
diff --git a/src/math/x86_64/fmodl.c b/src/math/x86_64/fmodl.c
new file mode 100644
index 00000000..3daeab06
--- /dev/null
+++ b/src/math/x86_64/fmodl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double fmodl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/x86_64/fmodl.s b/src/math/x86_64/fmodl.s
deleted file mode 100644
index ea07b402..00000000
--- a/src/math/x86_64/fmodl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global fmodl
-.type fmodl,@function
-fmodl:
-	fldt 24(%rsp)
-	fldt 8(%rsp)
-1:	fprem
-	fnstsw %ax
-	testb $4,%ah
-	jnz 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/x86_64/llrint.c b/src/math/x86_64/llrint.c
new file mode 100644
index 00000000..dd38a722
--- /dev/null
+++ b/src/math/x86_64/llrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrint(double x)
+{
+	long long r;
+	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/llrint.s b/src/math/x86_64/llrint.s
deleted file mode 100644
index bf476498..00000000
--- a/src/math/x86_64/llrint.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global llrint
-.type llrint,@function
-llrint:
-	cvtsd2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/llrintf.c b/src/math/x86_64/llrintf.c
new file mode 100644
index 00000000..fc8625e8
--- /dev/null
+++ b/src/math/x86_64/llrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintf(float x)
+{
+	long long r;
+	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/llrintf.s b/src/math/x86_64/llrintf.s
deleted file mode 100644
index d7204ac0..00000000
--- a/src/math/x86_64/llrintf.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global llrintf
-.type llrintf,@function
-llrintf:
-	cvtss2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/llrintl.c b/src/math/x86_64/llrintl.c
new file mode 100644
index 00000000..c439ef28
--- /dev/null
+++ b/src/math/x86_64/llrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long long llrintl(long double x)
+{
+	long long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/x86_64/llrintl.s b/src/math/x86_64/llrintl.s
deleted file mode 100644
index 1ec0817d..00000000
--- a/src/math/x86_64/llrintl.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global llrintl
-.type llrintl,@function
-llrintl:
-	fldt 8(%rsp)
-	fistpll 8(%rsp)
-	mov 8(%rsp),%rax
-	ret
diff --git a/src/math/x86_64/lrint.c b/src/math/x86_64/lrint.c
new file mode 100644
index 00000000..a742fec6
--- /dev/null
+++ b/src/math/x86_64/lrint.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrint(double x)
+{
+	long r;
+	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/lrint.s b/src/math/x86_64/lrint.s
deleted file mode 100644
index 15fc2454..00000000
--- a/src/math/x86_64/lrint.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global lrint
-.type lrint,@function
-lrint:
-	cvtsd2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/lrintf.c b/src/math/x86_64/lrintf.c
new file mode 100644
index 00000000..2ba5639d
--- /dev/null
+++ b/src/math/x86_64/lrintf.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintf(float x)
+{
+	long r;
+	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x));
+	return r;
+}
diff --git a/src/math/x86_64/lrintf.s b/src/math/x86_64/lrintf.s
deleted file mode 100644
index 488423d2..00000000
--- a/src/math/x86_64/lrintf.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global lrintf
-.type lrintf,@function
-lrintf:
-	cvtss2si %xmm0,%rax
-	ret
diff --git a/src/math/x86_64/lrintl.c b/src/math/x86_64/lrintl.c
new file mode 100644
index 00000000..068e2e4d
--- /dev/null
+++ b/src/math/x86_64/lrintl.c
@@ -0,0 +1,8 @@
+#include <math.h>
+
+long lrintl(long double x)
+{
+	long r;
+	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st");
+	return r;
+}
diff --git a/src/math/x86_64/lrintl.s b/src/math/x86_64/lrintl.s
deleted file mode 100644
index d587b12b..00000000
--- a/src/math/x86_64/lrintl.s
+++ /dev/null
@@ -1,7 +0,0 @@
-.global lrintl
-.type lrintl,@function
-lrintl:
-	fldt 8(%rsp)
-	fistpll 8(%rsp)
-	mov 8(%rsp),%rax
-	ret
diff --git a/src/math/x86_64/remainderl.c b/src/math/x86_64/remainderl.c
new file mode 100644
index 00000000..8cf75071
--- /dev/null
+++ b/src/math/x86_64/remainderl.c
@@ -0,0 +1,9 @@
+#include <math.h>
+
+long double remainderl(long double x, long double y)
+{
+	unsigned short fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	return x;
+}
diff --git a/src/math/x86_64/remainderl.s b/src/math/x86_64/remainderl.s
deleted file mode 100644
index cb3857b4..00000000
--- a/src/math/x86_64/remainderl.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.global remainderl
-.type remainderl,@function
-remainderl:
-	fldt 24(%rsp)
-	fldt 8(%rsp)
-1:	fprem1
-	fnstsw %ax
-	testb $4,%ah
-	jnz 1b
-	fstp %st(1)
-	ret
diff --git a/src/math/x86_64/remquol.c b/src/math/x86_64/remquol.c
new file mode 100644
index 00000000..60eef089
--- /dev/null
+++ b/src/math/x86_64/remquol.c
@@ -0,0 +1,32 @@
+#include <math.h>
+
+long double remquol(long double x, long double y, int *quo)
+{
+	signed char *cx = (void *)&x, *cy = (void *)&y;
+	/* By ensuring that addresses of x and y cannot be discarded,
+	 * this empty asm guides GCC into representing extraction of
+	 * their sign bits as memory loads rather than making x and y
+	 * not-address-taken internally and using bitfield operations,
+	 * which in the end wouldn't work out, as extraction from FPU
+	 * registers needs to go through memory anyway. This way GCC
+	 * should manage to use incoming stack slots without spills. */
+	__asm__ ("" :: "X"(cx), "X"(cy));
+
+	long double t = x;
+	unsigned fpsr;
+	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(t), "=a"(fpsr) : "u"(y));
+	while (fpsr & 0x400);
+	/* C0, C1, C3 flags in x87 status word carry low bits of quotient:
+	 * 15 14 13 12 11 10  9  8
+	 *  . C3  .  .  . C2 C1 C0
+	 *  . b1  .  .  .  0 b0 b2 */
+	unsigned char i = fpsr >> 8;
+	i = i>>4 | i<<4;
+	/* i[5:2] is now {b0 b2 ? b1}. Retrieve {0 b2 b1 b0} via
+	 * in-register table lookup. */
+	unsigned qbits = 0x7575313164642020 >> (i & 60);
+	qbits &= 7;
+
+	*quo = (cx[9]^cy[9]) < 0 ? -qbits : qbits;
+	return t;
+}
diff --git a/src/math/x86_64/rintl.c b/src/math/x86_64/rintl.c
new file mode 100644
index 00000000..e1a92077
--- /dev/null
+++ b/src/math/x86_64/rintl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double rintl(long double x)
+{
+	__asm__ ("frndint" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/rintl.s b/src/math/x86_64/rintl.s
deleted file mode 100644
index 64e663cd..00000000
--- a/src/math/x86_64/rintl.s
+++ /dev/null
@@ -1,6 +0,0 @@
-.global rintl
-.type rintl,@function
-rintl:
-	fldt 8(%rsp)
-	frndint
-	ret
diff --git a/src/math/x86_64/sqrt.c b/src/math/x86_64/sqrt.c
new file mode 100644
index 00000000..657e09e3
--- /dev/null
+++ b/src/math/x86_64/sqrt.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+double sqrt(double x)
+{
+	__asm__ ("sqrtsd %1, %0" : "=x"(x) : "x"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrt.s b/src/math/x86_64/sqrt.s
deleted file mode 100644
index d3c609f9..00000000
--- a/src/math/x86_64/sqrt.s
+++ /dev/null
@@ -1,4 +0,0 @@
-.global sqrt
-.type sqrt,@function
-sqrt:	sqrtsd %xmm0, %xmm0
-	ret
diff --git a/src/math/x86_64/sqrtf.c b/src/math/x86_64/sqrtf.c
new file mode 100644
index 00000000..720baec6
--- /dev/null
+++ b/src/math/x86_64/sqrtf.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+float sqrtf(float x)
+{
+	__asm__ ("sqrtss %1, %0" : "=x"(x) : "x"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrtf.s b/src/math/x86_64/sqrtf.s
deleted file mode 100644
index eec48c60..00000000
--- a/src/math/x86_64/sqrtf.s
+++ /dev/null
@@ -1,4 +0,0 @@
-.global sqrtf
-.type sqrtf,@function
-sqrtf:  sqrtss %xmm0, %xmm0
-	ret
diff --git a/src/math/x86_64/sqrtl.c b/src/math/x86_64/sqrtl.c
new file mode 100644
index 00000000..864cfcc4
--- /dev/null
+++ b/src/math/x86_64/sqrtl.c
@@ -0,0 +1,7 @@
+#include <math.h>
+
+long double sqrtl(long double x)
+{
+	__asm__ ("fsqrt" : "+t"(x));
+	return x;
+}
diff --git a/src/math/x86_64/sqrtl.s b/src/math/x86_64/sqrtl.s
deleted file mode 100644
index 23cd687d..00000000
--- a/src/math/x86_64/sqrtl.s
+++ /dev/null
@@ -1,5 +0,0 @@
-.global sqrtl
-.type sqrtl,@function
-sqrtl:	fldt 8(%rsp)
-	fsqrt
-	ret