diff options
Diffstat (limited to 'src/math')
123 files changed, 1150 insertions, 642 deletions
diff --git a/src/math/__expo2.c b/src/math/__expo2.c index 740ac680..248f052b 100644 --- a/src/math/__expo2.c +++ b/src/math/__expo2.c @@ -5,12 +5,13 @@ static const int k = 2043; static const double kln2 = 0x1.62066151add8bp+10; /* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */ -double __expo2(double x) +double __expo2(double x, double sign) { double scale; /* note that k is odd and scale*scale overflows */ INSERT_WORDS(scale, (uint32_t)(0x3ff + k/2) << 20, 0); /* exp(x - k ln2) * 2**(k-1) */ - return exp(x - kln2) * scale * scale; + /* in directed rounding correct sign before rounding or overflow is important */ + return exp(x - kln2) * (sign * scale) * scale; } diff --git a/src/math/__expo2f.c b/src/math/__expo2f.c index 5163e418..538eb09c 100644 --- a/src/math/__expo2f.c +++ b/src/math/__expo2f.c @@ -5,12 +5,13 @@ static const int k = 235; static const float kln2 = 0x1.45c778p+7f; /* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */ -float __expo2f(float x) +float __expo2f(float x, float sign) { float scale; /* note that k is odd and scale*scale overflows */ SET_FLOAT_WORD(scale, (uint32_t)(0x7f + k/2) << 23); /* exp(x - k ln2) * 2**(k-1) */ - return expf(x - kln2) * scale * scale; + /* in directed rounding correct sign before rounding or overflow is important */ + return expf(x - kln2) * (sign * scale) * scale; } diff --git a/src/math/__math_invalidl.c b/src/math/__math_invalidl.c new file mode 100644 index 00000000..1fca99de --- /dev/null +++ b/src/math/__math_invalidl.c @@ -0,0 +1,9 @@ +#include <float.h> +#include "libm.h" + +#if LDBL_MANT_DIG != DBL_MANT_DIG +long double __math_invalidl(long double x) +{ + return (x - x) / (x - x); +} +#endif diff --git a/src/math/__rem_pio2.c b/src/math/__rem_pio2.c index d403f81c..dcf672fb 100644 --- a/src/math/__rem_pio2.c +++ b/src/math/__rem_pio2.c @@ -36,6 +36,7 @@ */ static const double toint = 1.5/EPS, +pio4 = 0x1.921fb54442d18p-1, invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */ pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */ pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */ @@ -117,11 +118,23 @@ int __rem_pio2(double x, double *y) } if (ix < 0x413921fb) { /* |x| ~< 2^20*(pi/2), medium size */ medium: - /* rint(x/(pi/2)), Assume round-to-nearest. */ + /* rint(x/(pi/2)) */ fn = (double_t)x*invpio2 + toint - toint; n = (int32_t)fn; r = x - fn*pio2_1; w = fn*pio2_1t; /* 1st round, good to 85 bits */ + /* Matters with directed rounding. */ + if (predict_false(r - w < -pio4)) { + n--; + fn--; + r = x - fn*pio2_1; + w = fn*pio2_1t; + } else if (predict_false(r - w > pio4)) { + n++; + fn++; + r = x - fn*pio2_1; + w = fn*pio2_1t; + } y[0] = r - w; u.f = y[0]; ey = u.i>>52 & 0x7ff; diff --git a/src/math/__rem_pio2f.c b/src/math/__rem_pio2f.c index 4473c1c4..e6765643 100644 --- a/src/math/__rem_pio2f.c +++ b/src/math/__rem_pio2f.c @@ -35,6 +35,7 @@ */ static const double toint = 1.5/EPS, +pio4 = 0x1.921fb6p-1, invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */ pio2_1 = 1.57079631090164184570e+00, /* 0x3FF921FB, 0x50000000 */ pio2_1t = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */ @@ -50,10 +51,20 @@ int __rem_pio2f(float x, double *y) ix = u.i & 0x7fffffff; /* 25+53 bit pi is good enough for medium size */ if (ix < 0x4dc90fdb) { /* |x| ~< 2^28*(pi/2), medium size */ - /* Use a specialized rint() to get fn. Assume round-to-nearest. */ + /* Use a specialized rint() to get fn. */ fn = (double_t)x*invpio2 + toint - toint; n = (int32_t)fn; *y = x - fn*pio2_1 - fn*pio2_1t; + /* Matters with directed rounding. */ + if (predict_false(*y < -pio4)) { + n--; + fn--; + *y = x - fn*pio2_1 - fn*pio2_1t; + } else if (predict_false(*y > pio4)) { + n++; + fn++; + *y = x - fn*pio2_1 - fn*pio2_1t; + } return n; } if(ix>=0x7f800000) { /* x is inf or NaN */ diff --git a/src/math/__rem_pio2l.c b/src/math/__rem_pio2l.c index 77255bd8..236b2def 100644 --- a/src/math/__rem_pio2l.c +++ b/src/math/__rem_pio2l.c @@ -44,6 +44,7 @@ pio2_1 = 1.57079632679597125389e+00, /* 0x3FF921FB, 0x54444000 */ pio2_2 = -1.07463465549783099519e-12, /* -0x12e7b967674000.0p-92 */ pio2_3 = 6.36831716351370313614e-25; /* 0x18a2e037074000.0p-133 */ static const long double +pio4 = 0x1.921fb54442d1846ap-1L, invpio2 = 6.36619772367581343076e-01L, /* 0xa2f9836e4e44152a.0p-64 */ pio2_1t = -1.07463465549719416346e-12L, /* -0x973dcb3b399d747f.0p-103 */ pio2_2t = 6.36831716351095013979e-25L, /* 0xc51701b839a25205.0p-144 */ @@ -57,6 +58,7 @@ pio2_3t = -2.75299651904407171810e-37L; /* -0xbb5bf6c7ddd660ce.0p-185 */ #define NX 5 #define NY 3 static const long double +pio4 = 0x1.921fb54442d18469898cc51701b8p-1L, invpio2 = 6.3661977236758134307553505349005747e-01L, /* 0x145f306dc9c882a53f84eafa3ea6a.0p-113 */ pio2_1 = 1.5707963267948966192292994253909555e+00L, /* 0x1921fb54442d18469800000000000.0p-112 */ pio2_1t = 2.0222662487959507323996846200947577e-21L, /* 0x13198a2e03707344a4093822299f3.0p-181 */ @@ -76,11 +78,23 @@ int __rem_pio2l(long double x, long double *y) u.f = x; ex = u.i.se & 0x7fff; if (SMALL(u)) { - /* rint(x/(pi/2)), Assume round-to-nearest. */ + /* rint(x/(pi/2)) */ fn = x*invpio2 + toint - toint; n = QUOBITS(fn); r = x-fn*pio2_1; w = fn*pio2_1t; /* 1st round good to 102/180 bits (ld80/ld128) */ + /* Matters with directed rounding. */ + if (predict_false(r - w < -pio4)) { + n--; + fn--; + r = x - fn*pio2_1; + w = fn*pio2_1t; + } else if (predict_false(r - w > pio4)) { + n++; + fn++; + r = x - fn*pio2_1; + w = fn*pio2_1t; + } y[0] = r-w; u.f = y[0]; ey = u.i.se & 0x7fff; diff --git a/src/math/acoshf.c b/src/math/acoshf.c index 8a4ec4d5..b773d48e 100644 --- a/src/math/acoshf.c +++ b/src/math/acoshf.c @@ -15,12 +15,12 @@ float acoshf(float x) uint32_t a = u.i & 0x7fffffff; if (a < 0x3f800000+(1<<23)) - /* |x| < 2, invalid if x < 1 or nan */ + /* |x| < 2, invalid if x < 1 */ /* up to 2ulp error in [1,1.125] */ return log1pf(x-1 + sqrtf((x-1)*(x-1)+2*(x-1))); - if (a < 0x3f800000+(12<<23)) - /* |x| < 0x1p12 */ + if (u.i < 0x3f800000+(12<<23)) + /* 2 <= x < 0x1p12 */ return logf(2*x - 1/(x+sqrtf(x*x-1))); - /* x >= 0x1p12 */ + /* x >= 0x1p12 or x <= -2 or nan */ return logf(x) + 0.693147180559945309417232121458176568f; } diff --git a/src/math/acoshl.c b/src/math/acoshl.c index 8d4b43f6..943cec17 100644 --- a/src/math/acoshl.c +++ b/src/math/acoshl.c @@ -10,14 +10,18 @@ long double acoshl(long double x) long double acoshl(long double x) { union ldshape u = {x}; - int e = u.i.se & 0x7fff; + int e = u.i.se; if (e < 0x3fff + 1) - /* |x| < 2, invalid if x < 1 or nan */ + /* 0 <= x < 2, invalid if x < 1 */ return log1pl(x-1 + sqrtl((x-1)*(x-1)+2*(x-1))); if (e < 0x3fff + 32) - /* |x| < 0x1p32 */ + /* 2 <= x < 0x1p32 */ return logl(2*x - 1/(x+sqrtl(x*x-1))); + if (e & 0x8000) + /* x < 0 or x = -0, invalid */ + return (x - x) / (x - x); + /* 0x1p32 <= x or nan */ return logl(x) + 0.693147180559945309417232121458176568L; } #elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384 diff --git a/src/math/arm/fabs.c b/src/math/arm/fabs.c index f890520a..6e1d367d 100644 --- a/src/math/arm/fabs.c +++ b/src/math/arm/fabs.c @@ -1,6 +1,6 @@ #include <math.h> -#if __ARM_PCS_VFP +#if __ARM_PCS_VFP && __ARM_FP&8 double fabs(double x) { diff --git a/src/math/arm/sqrt.c b/src/math/arm/sqrt.c index 874af960..567e2e91 100644 --- a/src/math/arm/sqrt.c +++ b/src/math/arm/sqrt.c @@ -1,6 +1,6 @@ #include <math.h> -#if __ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__) +#if (__ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)) && (__ARM_FP&8) double sqrt(double x) { diff --git a/src/math/cosh.c b/src/math/cosh.c index 100f8231..490c15fb 100644 --- a/src/math/cosh.c +++ b/src/math/cosh.c @@ -35,6 +35,6 @@ double cosh(double x) /* |x| > log(DBL_MAX) or nan */ /* note: the result is stored to handle overflow */ - t = __expo2(x); + t = __expo2(x, 1.0); return t; } diff --git a/src/math/coshf.c b/src/math/coshf.c index b09f2ee5..e739cff9 100644 --- a/src/math/coshf.c +++ b/src/math/coshf.c @@ -28,6 +28,6 @@ float coshf(float x) } /* |x| > log(FLT_MAX) or nan */ - t = __expo2f(x); + t = __expo2f(x, 1.0f); return t; } diff --git a/src/math/expm1f.c b/src/math/expm1f.c index 297e0b44..09a41afe 100644 --- a/src/math/expm1f.c +++ b/src/math/expm1f.c @@ -16,7 +16,6 @@ #include "libm.h" static const float -o_threshold = 8.8721679688e+01, /* 0x42b17180 */ ln2_hi = 6.9313812256e-01, /* 0x3f317180 */ ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */ invln2 = 1.4426950216e+00, /* 0x3fb8aa3b */ @@ -41,7 +40,7 @@ float expm1f(float x) return x; if (sign) return -1; - if (x > o_threshold) { + if (hx > 0x42b17217) { /* x > log(FLT_MAX) */ x *= 0x1p127f; return x; } diff --git a/src/math/fma.c b/src/math/fma.c index 0c6f90c9..adfadca8 100644 --- a/src/math/fma.c +++ b/src/math/fma.c @@ -53,7 +53,7 @@ double fma(double x, double y, double z) return x*y + z; if (nz.e >= ZEROINFNAN) { if (nz.e > ZEROINFNAN) /* z==0 */ - return x*y + z; + return x*y; return z; } diff --git a/src/math/fmaf.c b/src/math/fmaf.c index 80f5cd8a..7c65acf1 100644 --- a/src/math/fmaf.c +++ b/src/math/fmaf.c @@ -77,17 +77,16 @@ float fmaf(float x, float y, float z) * If result is inexact, and exactly halfway between two float values, * we need to adjust the low-order bit in the direction of the error. */ -#ifdef FE_TOWARDZERO - fesetround(FE_TOWARDZERO); -#endif - volatile double vxy = xy; /* XXX work around gcc CSE bug */ - double adjusted_result = vxy + z; - fesetround(FE_TONEAREST); - if (result == adjusted_result) { - u.f = adjusted_result; + double err; + int neg = u.i >> 63; + if (neg == (z > xy)) + err = xy - result + z; + else + err = z - result + xy; + if (neg == (err < 0)) u.i++; - adjusted_result = u.f; - } - z = adjusted_result; + else + u.i--; + z = u.f; return z; } diff --git a/src/math/i386/exp.c b/src/math/i386/exp.c deleted file mode 100644 index 11282284..00000000 --- a/src/math/i386/exp.c +++ /dev/null @@ -1 +0,0 @@ -// see exp_ld.s diff --git a/src/math/i386/exp2.s b/src/math/i386/exp2.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/exp2.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/exp2f.s b/src/math/i386/exp2f.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/exp2f.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/exp2l.s b/src/math/i386/exp2l.s index f335a3e5..8125761d 100644 --- a/src/math/i386/exp2l.s +++ b/src/math/i386/exp2l.s @@ -1 +1 @@ -# see exp.s +# see exp_ld.s diff --git a/src/math/i386/exp_ld.s b/src/math/i386/exp_ld.s index df87c497..99cba01f 100644 --- a/src/math/i386/exp_ld.s +++ b/src/math/i386/exp_ld.s @@ -1,35 +1,8 @@ -.global expm1f -.type expm1f,@function -expm1f: - flds 4(%esp) - mov 4(%esp),%eax - add %eax,%eax - cmp $0x01000000,%eax - jae 1f - # subnormal x, return x with underflow - fld %st(0) - fmul %st(1) - fstps 4(%esp) - ret - .global expm1l .type expm1l,@function expm1l: fldt 4(%esp) - jmp 1f - -.global expm1 -.type expm1,@function -expm1: - fldl 4(%esp) - mov 8(%esp),%eax - add %eax,%eax - cmp $0x00200000,%eax - jae 1f - # subnormal x, return x with underflow - fsts 4(%esp) - ret -1: fldl2e + fldl2e fmulp mov $0xc2820000,%eax push %eax @@ -59,12 +32,6 @@ expm1: fsubrp ret -.global exp2f -.type exp2f,@function -exp2f: - flds 4(%esp) - jmp 1f - .global exp2l .global __exp2l .hidden __exp2l @@ -72,26 +39,6 @@ exp2f: exp2l: __exp2l: fldt 4(%esp) - jmp 1f - -.global expf -.type expf,@function -expf: - flds 4(%esp) - jmp 2f - -.global exp -.type exp,@function -exp: - fldl 4(%esp) -2: fldl2e - fmulp - jmp 1f - -.global exp2 -.type exp2,@function -exp2: - fldl 4(%esp) 1: sub $12,%esp fld %st(0) fstpt (%esp) diff --git a/src/math/i386/expf.s b/src/math/i386/expf.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/expf.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/expm1.s b/src/math/i386/expm1.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/expm1.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/expm1f.s b/src/math/i386/expm1f.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/expm1f.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/expm1l.s b/src/math/i386/expm1l.s index f335a3e5..8125761d 100644 --- a/src/math/i386/expm1l.s +++ b/src/math/i386/expm1l.s @@ -1 +1 @@ -# see exp.s +# see exp_ld.s diff --git a/src/math/i386/fabs.c b/src/math/i386/fabs.c new file mode 100644 index 00000000..39672786 --- /dev/null +++ b/src/math/i386/fabs.c @@ -0,0 +1,7 @@ +#include <math.h> + +double fabs(double x) +{ + __asm__ ("fabs" : "+t"(x)); + return x; +} diff --git a/src/math/i386/fabs.s b/src/math/i386/fabs.s deleted file mode 100644 index d66ea9a1..00000000 --- a/src/math/i386/fabs.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabs -.type fabs,@function -fabs: - fldl 4(%esp) - fabs - ret diff --git a/src/math/i386/fabsf.c b/src/math/i386/fabsf.c new file mode 100644 index 00000000..d882eee3 --- /dev/null +++ b/src/math/i386/fabsf.c @@ -0,0 +1,7 @@ +#include <math.h> + +float fabsf(float x) +{ + __asm__ ("fabs" : "+t"(x)); + return x; +} diff --git a/src/math/i386/fabsf.s b/src/math/i386/fabsf.s deleted file mode 100644 index a981c422..00000000 --- a/src/math/i386/fabsf.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabsf -.type fabsf,@function -fabsf: - flds 4(%esp) - fabs - ret diff --git a/src/math/i386/fabsl.c b/src/math/i386/fabsl.c new file mode 100644 index 00000000..cc1c9ed9 --- /dev/null +++ b/src/math/i386/fabsl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double fabsl(long double x) +{ + __asm__ ("fabs" : "+t"(x)); + return x; +} diff --git a/src/math/i386/fabsl.s b/src/math/i386/fabsl.s deleted file mode 100644 index ceef9e4c..00000000 --- a/src/math/i386/fabsl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabsl -.type fabsl,@function -fabsl: - fldt 4(%esp) - fabs - ret diff --git a/src/math/i386/fmod.c b/src/math/i386/fmod.c new file mode 100644 index 00000000..ea0c58d9 --- /dev/null +++ b/src/math/i386/fmod.c @@ -0,0 +1,10 @@ +#include <math.h> + +double fmod(double x, double y) +{ + unsigned short fpsr; + // fprem does not introduce excess precision into x + do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} diff --git a/src/math/i386/fmod.s b/src/math/i386/fmod.s deleted file mode 100644 index 2113b3c5..00000000 --- a/src/math/i386/fmod.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmod -.type fmod,@function -fmod: - fldl 12(%esp) - fldl 4(%esp) -1: fprem - fnstsw %ax - sahf - jp 1b - fstp %st(1) - ret diff --git a/src/math/i386/fmodf.c b/src/math/i386/fmodf.c new file mode 100644 index 00000000..90b56ab0 --- /dev/null +++ b/src/math/i386/fmodf.c @@ -0,0 +1,10 @@ +#include <math.h> + +float fmodf(float x, float y) +{ + unsigned short fpsr; + // fprem does not introduce excess precision into x + do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} diff --git a/src/math/i386/fmodf.s b/src/math/i386/fmodf.s deleted file mode 100644 index e04e2a56..00000000 --- a/src/math/i386/fmodf.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmodf -.type fmodf,@function -fmodf: - flds 8(%esp) - flds 4(%esp) -1: fprem - fnstsw %ax - sahf - jp 1b - fstp %st(1) - ret diff --git a/src/math/i386/fmodl.c b/src/math/i386/fmodl.c new file mode 100644 index 00000000..3daeab06 --- /dev/null +++ b/src/math/i386/fmodl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double fmodl(long double x, long double y) +{ + unsigned short fpsr; + do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} diff --git a/src/math/i386/fmodl.s b/src/math/i386/fmodl.s deleted file mode 100644 index 0cb3fe9b..00000000 --- a/src/math/i386/fmodl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmodl -.type fmodl,@function -fmodl: - fldt 16(%esp) - fldt 4(%esp) -1: fprem - fnstsw %ax - sahf - jp 1b - fstp %st(1) - ret diff --git a/src/math/i386/llrint.c b/src/math/i386/llrint.c new file mode 100644 index 00000000..aa400817 --- /dev/null +++ b/src/math/i386/llrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrint(double x) +{ + long long r; + __asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/i386/llrint.s b/src/math/i386/llrint.s deleted file mode 100644 index 8e89cd91..00000000 --- a/src/math/i386/llrint.s +++ /dev/null @@ -1,8 +0,0 @@ -.global llrint -.type llrint,@function -llrint: - fldl 4(%esp) - fistpll 4(%esp) - mov 4(%esp),%eax - mov 8(%esp),%edx - ret diff --git a/src/math/i386/llrintf.c b/src/math/i386/llrintf.c new file mode 100644 index 00000000..c41a317b --- /dev/null +++ b/src/math/i386/llrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrintf(float x) +{ + long long r; + __asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/i386/llrintf.s b/src/math/i386/llrintf.s deleted file mode 100644 index aa850c6c..00000000 --- a/src/math/i386/llrintf.s +++ /dev/null @@ -1,9 +0,0 @@ -.global llrintf -.type llrintf,@function -llrintf: - sub $8,%esp - flds 12(%esp) - fistpll (%esp) - pop %eax - pop %edx - ret diff --git a/src/math/i386/llrintl.c b/src/math/i386/llrintl.c new file mode 100644 index 00000000..c439ef28 --- /dev/null +++ b/src/math/i386/llrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrintl(long double x) +{ + long long r; + __asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/i386/llrintl.s b/src/math/i386/llrintl.s deleted file mode 100644 index 1cfb56f1..00000000 --- a/src/math/i386/llrintl.s +++ /dev/null @@ -1,8 +0,0 @@ -.global llrintl -.type llrintl,@function -llrintl: - fldt 4(%esp) - fistpll 4(%esp) - mov 4(%esp),%eax - mov 8(%esp),%edx - ret diff --git a/src/math/i386/lrint.c b/src/math/i386/lrint.c new file mode 100644 index 00000000..89563ab2 --- /dev/null +++ b/src/math/i386/lrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrint(double x) +{ + long r; + __asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/i386/lrint.s b/src/math/i386/lrint.s deleted file mode 100644 index 02b83d9f..00000000 --- a/src/math/i386/lrint.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrint -.type lrint,@function -lrint: - fldl 4(%esp) - fistpl 4(%esp) - mov 4(%esp),%eax - ret diff --git a/src/math/i386/lrintf.c b/src/math/i386/lrintf.c new file mode 100644 index 00000000..0bbf29de --- /dev/null +++ b/src/math/i386/lrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintf(float x) +{ + long r; + __asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/i386/lrintf.s b/src/math/i386/lrintf.s deleted file mode 100644 index 907aac29..00000000 --- a/src/math/i386/lrintf.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrintf -.type lrintf,@function -lrintf: - flds 4(%esp) - fistpl 4(%esp) - mov 4(%esp),%eax - ret diff --git a/src/math/i386/lrintl.c b/src/math/i386/lrintl.c new file mode 100644 index 00000000..eb8c0902 --- /dev/null +++ b/src/math/i386/lrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintl(long double x) +{ + long r; + __asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/i386/lrintl.s b/src/math/i386/lrintl.s deleted file mode 100644 index 3ae05aac..00000000 --- a/src/math/i386/lrintl.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrintl -.type lrintl,@function -lrintl: - fldt 4(%esp) - fistpl 4(%esp) - mov 4(%esp),%eax - ret diff --git a/src/math/i386/remainder.c b/src/math/i386/remainder.c new file mode 100644 index 00000000..c083df90 --- /dev/null +++ b/src/math/i386/remainder.c @@ -0,0 +1,12 @@ +#include <math.h> + +double remainder(double x, double y) +{ + unsigned short fpsr; + // fprem1 does not introduce excess precision into x + do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} + +weak_alias(remainder, drem); diff --git a/src/math/i386/remainder.s b/src/math/i386/remainder.s deleted file mode 100644 index ab1da95d..00000000 --- a/src/math/i386/remainder.s +++ /dev/null @@ -1,14 +0,0 @@ -.global remainder -.type remainder,@function -remainder: -.weak drem -.type drem,@function -drem: - fldl 12(%esp) - fldl 4(%esp) -1: fprem1 - fnstsw %ax - sahf - jp 1b - fstp %st(1) - ret diff --git a/src/math/i386/remainderf.c b/src/math/i386/remainderf.c new file mode 100644 index 00000000..280207d2 --- /dev/null +++ b/src/math/i386/remainderf.c @@ -0,0 +1,12 @@ +#include <math.h> + +float remainderf(float x, float y) +{ + unsigned short fpsr; + // fprem1 does not introduce excess precision into x + do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} + +weak_alias(remainderf, dremf); diff --git a/src/math/i386/remainderf.s b/src/math/i386/remainderf.s deleted file mode 100644 index 6a7378a3..00000000 --- a/src/math/i386/remainderf.s +++ /dev/null @@ -1,14 +0,0 @@ -.global remainderf -.type remainderf,@function -remainderf: -.weak dremf -.type dremf,@function -dremf: - flds 8(%esp) - flds 4(%esp) -1: fprem1 - fnstsw %ax - sahf - jp 1b - fstp %st(1) - ret diff --git a/src/math/i386/remainderl.c b/src/math/i386/remainderl.c new file mode 100644 index 00000000..8cf75071 --- /dev/null +++ b/src/math/i386/remainderl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double remainderl(long double x, long double y) +{ + unsigned short fpsr; + do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} diff --git a/src/math/i386/remainderl.s b/src/math/i386/remainderl.s deleted file mode 100644 index b41518ed..00000000 --- a/src/math/i386/remainderl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global remainderl -.type remainderl,@function -remainderl: - fldt 16(%esp) - fldt 4(%esp) -1: fprem1 - fnstsw %ax - sahf - jp 1b - fstp %st(1) - ret diff --git a/src/math/i386/rint.c b/src/math/i386/rint.c new file mode 100644 index 00000000..a5276a60 --- /dev/null +++ b/src/math/i386/rint.c @@ -0,0 +1,7 @@ +#include <math.h> + +double rint(double x) +{ + __asm__ ("frndint" : "+t"(x)); + return x; +} diff --git a/src/math/i386/rint.s b/src/math/i386/rint.s deleted file mode 100644 index bb99a11c..00000000 --- a/src/math/i386/rint.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rint -.type rint,@function -rint: - fldl 4(%esp) - frndint - ret diff --git a/src/math/i386/rintf.c b/src/math/i386/rintf.c new file mode 100644 index 00000000..bb4121a4 --- /dev/null +++ b/src/math/i386/rintf.c @@ -0,0 +1,7 @@ +#include <math.h> + +float rintf(float x) +{ + __asm__ ("frndint" : "+t"(x)); + return x; +} diff --git a/src/math/i386/rintf.s b/src/math/i386/rintf.s deleted file mode 100644 index bce4c5a6..00000000 --- a/src/math/i386/rintf.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rintf -.type rintf,@function -rintf: - flds 4(%esp) - frndint - ret diff --git a/src/math/i386/rintl.c b/src/math/i386/rintl.c new file mode 100644 index 00000000..e1a92077 --- /dev/null +++ b/src/math/i386/rintl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double rintl(long double x) +{ + __asm__ ("frndint" : "+t"(x)); + return x; +} diff --git a/src/math/i386/rintl.s b/src/math/i386/rintl.s deleted file mode 100644 index cd2bf9a9..00000000 --- a/src/math/i386/rintl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rintl -.type rintl,@function -rintl: - fldt 4(%esp) - frndint - ret diff --git a/src/math/i386/sqrt.c b/src/math/i386/sqrt.c new file mode 100644 index 00000000..934fbcca --- /dev/null +++ b/src/math/i386/sqrt.c @@ -0,0 +1,15 @@ +#include "libm.h" + +double sqrt(double x) +{ + union ldshape ux; + unsigned fpsr; + __asm__ ("fsqrt; fnstsw %%ax": "=t"(ux.f), "=a"(fpsr) : "0"(x)); + if ((ux.i.m & 0x7ff) != 0x400) + return (double)ux.f; + /* Rounding to double would have encountered an exact halfway case. + Adjust mantissa downwards if fsqrt rounded up, else upwards. + (result of fsqrt could not have been exact) */ + ux.i.m ^= (fpsr & 0x200) + 0x300; + return (double)ux.f; +} diff --git a/src/math/i386/sqrt.s b/src/math/i386/sqrt.s deleted file mode 100644 index 57837e25..00000000 --- a/src/math/i386/sqrt.s +++ /dev/null @@ -1,21 +0,0 @@ -.global sqrt -.type sqrt,@function -sqrt: fldl 4(%esp) - fsqrt - fnstsw %ax - sub $12,%esp - fld %st(0) - fstpt (%esp) - mov (%esp),%ecx - and $0x7ff,%ecx - cmp $0x400,%ecx - jnz 1f - and $0x200,%eax - sub $0x100,%eax - sub %eax,(%esp) - fstp %st(0) - fldt (%esp) -1: add $12,%esp - fstpl 4(%esp) - fldl 4(%esp) - ret diff --git a/src/math/i386/sqrtf.c b/src/math/i386/sqrtf.c new file mode 100644 index 00000000..41c65c2b --- /dev/null +++ b/src/math/i386/sqrtf.c @@ -0,0 +1,12 @@ +#include <math.h> + +float sqrtf(float x) +{ + long double t; + /* The long double result has sufficient precision so that + * second rounding to float still keeps the returned value + * correctly rounded, see Pierre Roux, "Innocuous Double + * Rounding of Basic Arithmetic Operations". */ + __asm__ ("fsqrt" : "=t"(t) : "0"(x)); + return (float)t; +} diff --git a/src/math/i386/sqrtf.s b/src/math/i386/sqrtf.s deleted file mode 100644 index 9e944f45..00000000 --- a/src/math/i386/sqrtf.s +++ /dev/null @@ -1,7 +0,0 @@ -.global sqrtf -.type sqrtf,@function -sqrtf: flds 4(%esp) - fsqrt - fstps 4(%esp) - flds 4(%esp) - ret diff --git a/src/math/i386/sqrtl.c b/src/math/i386/sqrtl.c new file mode 100644 index 00000000..864cfcc4 --- /dev/null +++ b/src/math/i386/sqrtl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double sqrtl(long double x) +{ + __asm__ ("fsqrt" : "+t"(x)); + return x; +} diff --git a/src/math/i386/sqrtl.s b/src/math/i386/sqrtl.s deleted file mode 100644 index e0d42616..00000000 --- a/src/math/i386/sqrtl.s +++ /dev/null @@ -1,5 +0,0 @@ -.global sqrtl -.type sqrtl,@function -sqrtl: fldt 4(%esp) - fsqrt - ret diff --git a/src/math/logf.c b/src/math/logf.c index 7ee5d7fe..e4c2237c 100644 --- a/src/math/logf.c +++ b/src/math/logf.c @@ -53,7 +53,7 @@ float logf(float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t)tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t)asfloat(iz); diff --git a/src/math/m68k/sqrtl.c b/src/math/m68k/sqrtl.c new file mode 100644 index 00000000..b1c303c7 --- /dev/null +++ b/src/math/m68k/sqrtl.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __HAVE_68881__ + +long double sqrtl(long double x) +{ + __asm__ ("fsqrt.x %1,%0" : "=f"(x) : "fm"(x)); + return x; +} + +#else + +#include "../sqrtl.c" + +#endif diff --git a/src/math/powerpc/fabs.c b/src/math/powerpc/fabs.c index 0efc21ef..9453a3aa 100644 --- a/src/math/powerpc/fabs.c +++ b/src/math/powerpc/fabs.c @@ -1,6 +1,6 @@ #include <math.h> -#if defined(_SOFT_FLOAT) || defined(BROKEN_PPC_D_ASM) +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM) #include "../fabs.c" diff --git a/src/math/powerpc/fabsf.c b/src/math/powerpc/fabsf.c index d88b5911..2e9da588 100644 --- a/src/math/powerpc/fabsf.c +++ b/src/math/powerpc/fabsf.c @@ -1,6 +1,6 @@ #include <math.h> -#ifdef _SOFT_FLOAT +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) #include "../fabsf.c" diff --git a/src/math/powerpc/fma.c b/src/math/powerpc/fma.c index 135c9903..0eb2ba1e 100644 --- a/src/math/powerpc/fma.c +++ b/src/math/powerpc/fma.c @@ -1,6 +1,6 @@ #include <math.h> -#if defined(_SOFT_FLOAT) || defined(BROKEN_PPC_D_ASM) +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM) #include "../fma.c" diff --git a/src/math/powerpc/fmaf.c b/src/math/powerpc/fmaf.c index a99a2a3b..dc1a749d 100644 --- a/src/math/powerpc/fmaf.c +++ b/src/math/powerpc/fmaf.c @@ -1,6 +1,6 @@ #include <math.h> -#ifdef _SOFT_FLOAT +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) #include "../fmaf.c" diff --git a/src/math/powl.c b/src/math/powl.c index 5b6da07b..6f64ea71 100644 --- a/src/math/powl.c +++ b/src/math/powl.c @@ -212,25 +212,33 @@ long double powl(long double x, long double y) } if (x == 1.0) return 1.0; /* 1**y = 1, even if y is nan */ - if (x == -1.0 && !isfinite(y)) - return 1.0; /* -1**inf = 1 */ if (y == 0.0) return 1.0; /* x**0 = 1, even if x is nan */ if (y == 1.0) return x; - if (y >= LDBL_MAX) { - if (x > 1.0 || x < -1.0) - return INFINITY; - if (x != 0.0) - return 0.0; - } - if (y <= -LDBL_MAX) { - if (x > 1.0 || x < -1.0) + /* if y*log2(x) < log2(LDBL_TRUE_MIN)-1 then x^y uflows to 0 + if y*log2(x) > -log2(LDBL_TRUE_MIN)+1 > LDBL_MAX_EXP then x^y oflows + if |x|!=1 then |log2(x)| > |log(x)| > LDBL_EPSILON/2 so + x^y oflows/uflows if |y|*LDBL_EPSILON/2 > -log2(LDBL_TRUE_MIN)+1 */ + if (fabsl(y) > 2*(-LDBL_MIN_EXP+LDBL_MANT_DIG+1)/LDBL_EPSILON) { + /* y is not an odd int */ + if (x == -1.0) + return 1.0; + if (y == INFINITY) { + if (x > 1.0 || x < -1.0) + return INFINITY; return 0.0; - if (x != 0.0 || y == -INFINITY) + } + if (y == -INFINITY) { + if (x > 1.0 || x < -1.0) + return 0.0; return INFINITY; + } + if ((x > 1.0 || x < -1.0) == (y > 0)) + return huge * huge; + return twom10000 * twom10000; } - if (x >= LDBL_MAX) { + if (x == INFINITY) { if (y > 0.0) return INFINITY; return 0.0; @@ -253,7 +261,7 @@ long double powl(long double x, long double y) yoddint = 1; } - if (x <= -LDBL_MAX) { + if (x == -INFINITY) { if (y > 0.0) { if (yoddint) return -INFINITY; diff --git a/src/math/riscv32/copysign.c b/src/math/riscv32/copysign.c new file mode 100644 index 00000000..c7854178 --- /dev/null +++ b/src/math/riscv32/copysign.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double copysign(double x, double y) +{ + __asm__ ("fsgnj.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); + return x; +} + +#else + +#include "../copysign.c" + +#endif diff --git a/src/math/riscv32/copysignf.c b/src/math/riscv32/copysignf.c new file mode 100644 index 00000000..a125611a --- /dev/null +++ b/src/math/riscv32/copysignf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float copysignf(float x, float y) +{ + __asm__ ("fsgnj.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); + return x; +} + +#else + +#include "../copysignf.c" + +#endif diff --git a/src/math/riscv32/fabs.c b/src/math/riscv32/fabs.c new file mode 100644 index 00000000..5290b6f0 --- /dev/null +++ b/src/math/riscv32/fabs.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fabs(double x) +{ + __asm__ ("fabs.d %0, %1" : "=f"(x) : "f"(x)); + return x; +} + +#else + +#include "../fabs.c" + +#endif diff --git a/src/math/riscv32/fabsf.c b/src/math/riscv32/fabsf.c new file mode 100644 index 00000000..f5032e35 --- /dev/null +++ b/src/math/riscv32/fabsf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fabsf(float x) +{ + __asm__ ("fabs.s %0, %1" : "=f"(x) : "f"(x)); + return x; +} + +#else + +#include "../fabsf.c" + +#endif diff --git a/src/math/riscv32/fma.c b/src/math/riscv32/fma.c new file mode 100644 index 00000000..99b05713 --- /dev/null +++ b/src/math/riscv32/fma.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fma(double x, double y, double z) +{ + __asm__ ("fmadd.d %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z)); + return x; +} + +#else + +#include "../fma.c" + +#endif diff --git a/src/math/riscv32/fmaf.c b/src/math/riscv32/fmaf.c new file mode 100644 index 00000000..f9dc47ed --- /dev/null +++ b/src/math/riscv32/fmaf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fmaf(float x, float y, float z) +{ + __asm__ ("fmadd.s %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z)); + return x; +} + +#else + +#include "../fmaf.c" + +#endif diff --git a/src/math/riscv32/fmax.c b/src/math/riscv32/fmax.c new file mode 100644 index 00000000..023709cd --- /dev/null +++ b/src/math/riscv32/fmax.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fmax(double x, double y) +{ + __asm__ ("fmax.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); + return x; +} + +#else + +#include "../fmax.c" + +#endif diff --git a/src/math/riscv32/fmaxf.c b/src/math/riscv32/fmaxf.c new file mode 100644 index 00000000..863d2bd1 --- /dev/null +++ b/src/math/riscv32/fmaxf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fmaxf(float x, float y) +{ + __asm__ ("fmax.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); + return x; +} + +#else + +#include "../fmaxf.c" + +#endif diff --git a/src/math/riscv32/fmin.c b/src/math/riscv32/fmin.c new file mode 100644 index 00000000..a4e3b067 --- /dev/null +++ b/src/math/riscv32/fmin.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fmin(double x, double y) +{ + __asm__ ("fmin.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); + return x; +} + +#else + +#include "../fmin.c" + +#endif diff --git a/src/math/riscv32/fminf.c b/src/math/riscv32/fminf.c new file mode 100644 index 00000000..32156e80 --- /dev/null +++ b/src/math/riscv32/fminf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fminf(float x, float y) +{ + __asm__ ("fmin.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); + return x; +} + +#else + +#include "../fminf.c" + +#endif diff --git a/src/math/riscv32/sqrt.c b/src/math/riscv32/sqrt.c new file mode 100644 index 00000000..867a504c --- /dev/null +++ b/src/math/riscv32/sqrt.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double sqrt(double x) +{ + __asm__ ("fsqrt.d %0, %1" : "=f"(x) : "f"(x)); + return x; +} + +#else + +#include "../sqrt.c" + +#endif diff --git a/src/math/riscv32/sqrtf.c b/src/math/riscv32/sqrtf.c new file mode 100644 index 00000000..610c2cf8 --- /dev/null +++ b/src/math/riscv32/sqrtf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float sqrtf(float x) +{ + __asm__ ("fsqrt.s %0, %1" : "=f"(x) : "f"(x)); + return x; +} + +#else + +#include "../sqrtf.c" + +#endif diff --git a/src/math/sinh.c b/src/math/sinh.c index 00022c4e..a01951ae 100644 --- a/src/math/sinh.c +++ b/src/math/sinh.c @@ -34,6 +34,6 @@ double sinh(double x) /* |x| > log(DBL_MAX) or nan */ /* note: the result is stored to handle overflow */ - t = 2*h*__expo2(absx); + t = __expo2(absx, 2*h); return t; } diff --git a/src/math/sinhf.c b/src/math/sinhf.c index 6ad19ea2..b9caa793 100644 --- a/src/math/sinhf.c +++ b/src/math/sinhf.c @@ -26,6 +26,6 @@ float sinhf(float x) } /* |x| > logf(FLT_MAX) or nan */ - t = 2*h*__expo2f(absx); + t = __expo2f(absx, 2*h); return t; } diff --git a/src/math/sqrt.c b/src/math/sqrt.c index f1f6d76c..5ba26559 100644 --- a/src/math/sqrt.c +++ b/src/math/sqrt.c @@ -1,184 +1,158 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrt.c */ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ -/* sqrt(x) - * Return correctly rounded sqrt. - * ------------------------------------------ - * | Use the hardware sqrt if you have one | - * ------------------------------------------ - * Method: - * Bit by bit method using integer arithmetic. (Slow, but portable) - * 1. Normalization - * Scale x to y in [1,4) with even powers of 2: - * find an integer k such that 1 <= (y=x*2^(2k)) < 4, then - * sqrt(x) = 2^k * sqrt(y) - * 2. Bit by bit computation - * Let q = sqrt(y) truncated to i bit after binary point (q = 1), - * i 0 - * i+1 2 - * s = 2*q , and y = 2 * ( y - q ). (1) - * i i i i - * - * To compute q from q , one checks whether - * i+1 i - * - * -(i+1) 2 - * (q + 2 ) <= y. (2) - * i - * -(i+1) - * If (2) is false, then q = q ; otherwise q = q + 2 . - * i+1 i i+1 i - * - * With some algebric manipulation, it is not difficult to see - * that (2) is equivalent to - * -(i+1) - * s + 2 <= y (3) - * i i - * - * The advantage of (3) is that s and y can be computed by - * i i - * the following recurrence formula: - * if (3) is false - * - * s = s , y = y ; (4) - * i+1 i i+1 i - * - * otherwise, - * -i -(i+1) - * s = s + 2 , y = y - s - 2 (5) - * i+1 i i+1 i i - * - * One may easily use induction to prove (4) and (5). - * Note. Since the left hand side of (3) contain only i+2 bits, - * it does not necessary to do a full (53-bit) comparison - * in (3). - * 3. Final rounding - * After generating the 53 bits result, we compute one more bit. - * Together with the remainder, we can decide whether the - * result is exact, bigger than 1/2ulp, or less than 1/2ulp - * (it will never equal to 1/2ulp). - * The rounding mode can be detected by checking whether - * huge + tiny is equal to huge, and whether huge - tiny is - * equal to huge for some floating point number "huge" and "tiny". - * - * Special cases: - * sqrt(+-0) = +-0 ... exact - * sqrt(inf) = inf - * sqrt(-ve) = NaN ... with invalid signal - * sqrt(NaN) = NaN ... with invalid signal for signaling NaN - */ - +#include <stdint.h> +#include <math.h> #include "libm.h" +#include "sqrt_data.h" -static const double tiny = 1.0e-300; +#define FENV_SUPPORT 1 -double sqrt(double x) +/* returns a*b*2^-32 - e, with error 0 <= e < 1. */ +static inline uint32_t mul32(uint32_t a, uint32_t b) { - double z; - int32_t sign = (int)0x80000000; - int32_t ix0,s0,q,m,t,i; - uint32_t r,t1,s1,ix1,q1; + return (uint64_t)a*b >> 32; +} - EXTRACT_WORDS(ix0, ix1, x); +/* returns a*b*2^-64 - e, with error 0 <= e < 3. */ +static inline uint64_t mul64(uint64_t a, uint64_t b) +{ + uint64_t ahi = a>>32; + uint64_t alo = a&0xffffffff; + uint64_t bhi = b>>32; + uint64_t blo = b&0xffffffff; + return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); +} - /* take care of Inf and NaN */ - if ((ix0&0x7ff00000) == 0x7ff00000) { - return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */ - } - /* take care of zero */ - if (ix0 <= 0) { - if (((ix0&~sign)|ix1) == 0) - return x; /* sqrt(+-0) = +-0 */ - if (ix0 < 0) - return (x-x)/(x-x); /* sqrt(-ve) = sNaN */ - } - /* normalize x */ - m = ix0>>20; - if (m == 0) { /* subnormal x */ - while (ix0 == 0) { - m -= 21; - ix0 |= (ix1>>11); - ix1 <<= 21; - } - for (i=0; (ix0&0x00100000) == 0; i++) - ix0<<=1; - m -= i - 1; - ix0 |= ix1>>(32-i); - ix1 <<= i; - } - m -= 1023; /* unbias exponent */ - ix0 = (ix0&0x000fffff)|0x00100000; - if (m & 1) { /* odd m, double x to make it even */ - ix0 += ix0 + ((ix1&sign)>>31); - ix1 += ix1; - } - m >>= 1; /* m = [m/2] */ - - /* generate sqrt(x) bit by bit */ - ix0 += ix0 + ((ix1&sign)>>31); - ix1 += ix1; - q = q1 = s0 = s1 = 0; /* [q,q1] = sqrt(x) */ - r = 0x00200000; /* r = moving bit from right to left */ - - while (r != 0) { - t = s0 + r; - if (t <= ix0) { - s0 = t + r; - ix0 -= t; - q += r; - } - ix0 += ix0 + ((ix1&sign)>>31); - ix1 += ix1; - r >>= 1; - } +double sqrt(double x) +{ + uint64_t ix, top, m; - r = sign; - while (r != 0) { - t1 = s1 + r; - t = s0; - if (t < ix0 || (t == ix0 && t1 <= ix1)) { - s1 = t1 + r; - if ((t1&sign) == sign && (s1&sign) == 0) - s0++; - ix0 -= t; - if (ix1 < t1) - ix0--; - ix1 -= t1; - q1 += r; - } - ix0 += ix0 + ((ix1&sign)>>31); - ix1 += ix1; - r >>= 1; + /* special case handling. */ + ix = asuint64(x); + top = ix >> 52; + if (predict_false(top - 0x001 >= 0x7ff - 0x001)) { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return x; + if (ix == 0x7ff0000000000000) + return x; + if (ix > 0x7ff0000000000000) + return __math_invalid(x); + /* x is subnormal, normalize it. */ + ix = asuint64(x * 0x1p52); + top = ix >> 52; + top -= 52; } - /* use floating add to find out rounding direction */ - if ((ix0|ix1) != 0) { - z = 1.0 - tiny; /* raise inexact flag */ - if (z >= 1.0) { - z = 1.0 + tiny; - if (q1 == (uint32_t)0xffffffff) { - q1 = 0; - q++; - } else if (z > 1.0) { - if (q1 == (uint32_t)0xfffffffe) - q++; - q1 += 2; - } else - q1 += q1 & 1; - } + /* argument reduction: + x = 4^e m; with integer e, and m in [1, 4) + m: fixed point representation [2.62] + 2^e is the exponent part of the result. */ + int even = top & 1; + m = (ix << 11) | 0x8000000000000000; + if (even) m >>= 1; + top = (top + 0x3ff) >> 1; + + /* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4) + + initial estimate: + 7bit table lookup (1bit exponent and 6bit significand). + + iterative approximation: + using 2 goldschmidt iterations with 32bit int arithmetics + and a final iteration with 64bit int arithmetics. + + details: + + the relative error (e = r0 sqrt(m)-1) of a linear estimate + (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best, + a table lookup is faster and needs one less iteration + 6 bit lookup table (128b) gives |e| < 0x1.f9p-8 + 7 bit lookup table (256b) gives |e| < 0x1.fdp-9 + for single and double prec 6bit is enough but for quad + prec 7bit is needed (or modified iterations). to avoid + one more iteration >=13bit table would be needed (16k). + + a newton-raphson iteration for r is + w = r*r + u = 3 - m*w + r = r*u/2 + can use a goldschmidt iteration for s at the end or + s = m*r + + first goldschmidt iteration is + s = m*r + u = 3 - s*r + r = r*u/2 + s = s*u/2 + next goldschmidt iteration is + u = 3 - s*r + r = r*u/2 + s = s*u/2 + and at the end r is not computed only s. + + they use the same amount of operations and converge at the + same quadratic rate, i.e. if + r1 sqrt(m) - 1 = e, then + r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3 + the advantage of goldschmidt is that the mul for s and r + are independent (computed in parallel), however it is not + "self synchronizing": it only uses the input m in the + first iteration so rounding errors accumulate. at the end + or when switching to larger precision arithmetics rounding + errors dominate so the first iteration should be used. + + the fixed point representations are + m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30 + and after switching to 64 bit + m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62 */ + + static const uint64_t three = 0xc0000000; + uint64_t r, s, d, u, i; + + i = (ix >> 46) % 128; + r = (uint32_t)__rsqrt_tab[i] << 16; + /* |r sqrt(m) - 1| < 0x1.fdp-9 */ + s = mul32(m>>32, r); + /* |s/sqrt(m) - 1| < 0x1.fdp-9 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r sqrt(m) - 1| < 0x1.7bp-16 */ + s = mul32(s, u) << 1; + /* |s/sqrt(m) - 1| < 0x1.7bp-16 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */ + r = r << 32; + s = mul64(m, r); + d = mul64(s, r); + u = (three<<32) - d; + s = mul64(s, u); /* repr: 3.61 */ + /* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */ + s = (s - 2) >> 9; /* repr: 12.52 */ + /* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */ + + /* s < sqrt(m) < s + 0x1.09p-52, + compute nearest rounded result: + the nearest result to 52 bits is either s or s+0x1p-52, + we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m. */ + uint64_t d0, d1, d2; + double y, t; + d0 = (m << 42) - s*s; + d1 = s - d0; + d2 = d1 + s + 1; + s += d1 >> 63; + s &= 0x000fffffffffffff; + s |= top << 52; + y = asdouble(s); + if (FENV_SUPPORT) { + /* handle rounding modes and inexact exception: + only (s+1)^2 == 2^42 m case is exact otherwise + add a tiny value to cause the fenv effects. */ + uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000; + tiny |= (d1^d2) & 0x8000000000000000; + t = asdouble(tiny); + y = eval_as_double(y + t); } - ix0 = (q>>1) + 0x3fe00000; - ix1 = q1>>1; - if (q&1) - ix1 |= sign; - INSERT_WORDS(z, ix0 + ((uint32_t)m << 20), ix1); - return z; + return y; } diff --git a/src/math/sqrt_data.c b/src/math/sqrt_data.c new file mode 100644 index 00000000..61bc22f4 --- /dev/null +++ b/src/math/sqrt_data.c @@ -0,0 +1,19 @@ +#include "sqrt_data.h" +const uint16_t __rsqrt_tab[128] = { +0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43, +0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b, +0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1, +0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430, +0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59, +0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925, +0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479, +0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040, +0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234, +0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2, +0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1, +0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192, +0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f, +0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4, +0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59, +0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560, +}; diff --git a/src/math/sqrt_data.h b/src/math/sqrt_data.h new file mode 100644 index 00000000..260c7f9c --- /dev/null +++ b/src/math/sqrt_data.h @@ -0,0 +1,13 @@ +#ifndef _SQRT_DATA_H +#define _SQRT_DATA_H + +#include <features.h> +#include <stdint.h> + +/* if x in [1,2): i = (int)(64*x); + if x in [2,4): i = (int)(32*x-64); + __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error: + |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < -0x1.fdp-9 < 2^-8 */ +extern hidden const uint16_t __rsqrt_tab[128]; + +#endif diff --git a/src/math/sqrtf.c b/src/math/sqrtf.c index d6ace38a..740d81cb 100644 --- a/src/math/sqrtf.c +++ b/src/math/sqrtf.c @@ -1,83 +1,83 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */ -/* - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. - */ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - +#include <stdint.h> +#include <math.h> #include "libm.h" +#include "sqrt_data.h" -static const float tiny = 1.0e-30; +#define FENV_SUPPORT 1 -float sqrtf(float x) +static inline uint32_t mul32(uint32_t a, uint32_t b) { - float z; - int32_t sign = (int)0x80000000; - int32_t ix,s,q,m,t,i; - uint32_t r; + return (uint64_t)a*b >> 32; +} - GET_FLOAT_WORD(ix, x); +/* see sqrt.c for more detailed comments. */ - /* take care of Inf and NaN */ - if ((ix&0x7f800000) == 0x7f800000) - return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */ +float sqrtf(float x) +{ + uint32_t ix, m, m1, m0, even, ey; - /* take care of zero */ - if (ix <= 0) { - if ((ix&~sign) == 0) - return x; /* sqrt(+-0) = +-0 */ - if (ix < 0) - return (x-x)/(x-x); /* sqrt(-ve) = sNaN */ - } - /* normalize x */ - m = ix>>23; - if (m == 0) { /* subnormal x */ - for (i = 0; (ix&0x00800000) == 0; i++) - ix<<=1; - m -= i - 1; + ix = asuint(x); + if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return x; + if (ix == 0x7f800000) + return x; + if (ix > 0x7f800000) + return __math_invalidf(x); + /* x is subnormal, normalize it. */ + ix = asuint(x * 0x1p23f); + ix -= 23 << 23; } - m -= 127; /* unbias exponent */ - ix = (ix&0x007fffff)|0x00800000; - if (m&1) /* odd m, double x to make it even */ - ix += ix; - m >>= 1; /* m = [m/2] */ - /* generate sqrt(x) bit by bit */ - ix += ix; - q = s = 0; /* q = sqrt(x) */ - r = 0x01000000; /* r = moving bit from right to left */ + /* x = 4^e m; with int e and m in [1, 4). */ + even = ix & 0x00800000; + m1 = (ix << 8) | 0x80000000; + m0 = (ix << 7) & 0x7fffffff; + m = even ? m0 : m1; - while (r != 0) { - t = s + r; - if (t <= ix) { - s = t+r; - ix -= t; - q += r; - } - ix += ix; - r >>= 1; - } + /* 2^e is the exponent part of the return value. */ + ey = ix >> 1; + ey += 0x3f800000 >> 1; + ey &= 0x7f800000; + + /* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations. */ + static const uint32_t three = 0xc0000000; + uint32_t r, s, d, u, i; + i = (ix >> 17) % 128; + r = (uint32_t)__rsqrt_tab[i] << 16; + /* |r*sqrt(m) - 1| < 0x1p-8 */ + s = mul32(m, r); + /* |s/sqrt(m) - 1| < 0x1p-8 */ + d = mul32(s, r); + u = three - d; + r = mul32(r, u) << 1; + /* |r*sqrt(m) - 1| < 0x1.7bp-16 */ + s = mul32(s, u) << 1; + /* |s/sqrt(m) - 1| < 0x1.7bp-16 */ + d = mul32(s, r); + u = three - d; + s = mul32(s, u); + /* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */ + s = (s - 1)>>6; + /* s < sqrt(m) < s + 0x1.08p-23 */ - /* use floating add to find out rounding direction */ - if (ix != 0) { - z = 1.0f - tiny; /* raise inexact flag */ - if (z >= 1.0f) { - z = 1.0f + tiny; - if (z > 1.0f) - q += 2; - else - q += q & 1; - } + /* compute nearest rounded result. */ + uint32_t d0, d1, d2; + float y, t; + d0 = (m << 16) - s*s; + d1 = s - d0; + d2 = d1 + s + 1; + s += d1 >> 31; + s &= 0x007fffff; + s |= ey; + y = asfloat(s); + if (FENV_SUPPORT) { + /* handle rounding and inexact exception. */ + uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000; + tiny |= (d1^d2) & 0x80000000; + t = asfloat(tiny); + y = eval_as_float(y + t); } - ix = (q>>1) + 0x3f000000; - SET_FLOAT_WORD(z, ix + ((uint32_t)m << 23)); - return z; + return y; } diff --git a/src/math/sqrtl.c b/src/math/sqrtl.c index 83a8f80c..a231b3f2 100644 --- a/src/math/sqrtl.c +++ b/src/math/sqrtl.c @@ -1,7 +1,259 @@ +#include <stdint.h> #include <math.h> +#include <float.h> +#include "libm.h" +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 long double sqrtl(long double x) { - /* FIXME: implement in C, this is for LDBL_MANT_DIG == 64 only */ return sqrt(x); } +#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384 +#include "sqrt_data.h" + +#define FENV_SUPPORT 1 + +typedef struct { + uint64_t hi; + uint64_t lo; +} u128; + +/* top: 16 bit sign+exponent, x: significand. */ +static inline long double mkldbl(uint64_t top, u128 x) +{ + union ldshape u; +#if LDBL_MANT_DIG == 113 + u.i2.hi = x.hi; + u.i2.lo = x.lo; + u.i2.hi &= 0x0000ffffffffffff; + u.i2.hi |= top << 48; +#elif LDBL_MANT_DIG == 64 + u.i.se = top; + u.i.m = x.lo; + /* force the top bit on non-zero (and non-subnormal) results. */ + if (top & 0x7fff) + u.i.m |= 0x8000000000000000; +#endif + return u.f; +} + +/* return: top 16 bit is sign+exp and following bits are the significand. */ +static inline u128 asu128(long double x) +{ + union ldshape u = {.f=x}; + u128 r; +#if LDBL_MANT_DIG == 113 + r.hi = u.i2.hi; + r.lo = u.i2.lo; +#elif LDBL_MANT_DIG == 64 + r.lo = u.i.m<<49; + /* ignore the top bit: pseudo numbers are not handled. */ + r.hi = u.i.m>>15; + r.hi &= 0x0000ffffffffffff; + r.hi |= (uint64_t)u.i.se << 48; +#endif + return r; +} + +/* returns a*b*2^-32 - e, with error 0 <= e < 1. */ +static inline uint32_t mul32(uint32_t a, uint32_t b) +{ + return (uint64_t)a*b >> 32; +} + +/* returns a*b*2^-64 - e, with error 0 <= e < 3. */ +static inline uint64_t mul64(uint64_t a, uint64_t b) +{ + uint64_t ahi = a>>32; + uint64_t alo = a&0xffffffff; + uint64_t bhi = b>>32; + uint64_t blo = b&0xffffffff; + return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); +} + +static inline u128 add64(u128 a, uint64_t b) +{ + u128 r; + r.lo = a.lo + b; + r.hi = a.hi; + if (r.lo < a.lo) + r.hi++; + return r; +} + +static inline u128 add128(u128 a, u128 b) +{ + u128 r; + r.lo = a.lo + b.lo; + r.hi = a.hi + b.hi; + if (r.lo < a.lo) + r.hi++; + return r; +} + +static inline u128 sub64(u128 a, uint64_t b) +{ + u128 r; + r.lo = a.lo - b; + r.hi = a.hi; + if (a.lo < b) + r.hi--; + return r; +} + +static inline u128 sub128(u128 a, u128 b) +{ + u128 r; + r.lo = a.lo - b.lo; + r.hi = a.hi - b.hi; + if (a.lo < b.lo) + r.hi--; + return r; +} + +/* a<<n, 0 <= n <= 127 */ +static inline u128 lsh(u128 a, int n) +{ + if (n == 0) + return a; + if (n >= 64) { + a.hi = a.lo<<(n-64); + a.lo = 0; + } else { + a.hi = (a.hi<<n) | (a.lo>>(64-n)); + a.lo = a.lo<<n; + } + return a; +} + +/* a>>n, 0 <= n <= 127 */ +static inline u128 rsh(u128 a, int n) +{ + if (n == 0) + return a; + if (n >= 64) { + a.lo = a.hi>>(n-64); + a.hi = 0; + } else { + a.lo = (a.lo>>n) | (a.hi<<(64-n)); + a.hi = a.hi>>n; + } + return a; +} + +/* returns a*b exactly. */ +static inline u128 mul64_128(uint64_t a, uint64_t b) +{ + u128 r; + uint64_t ahi = a>>32; + uint64_t alo = a&0xffffffff; + uint64_t bhi = b>>32; + uint64_t blo = b&0xffffffff; + uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32); + uint64_t lo2 = (alo*blo)&0xffffffff; + r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32); + r.lo = (lo1<<32) + lo2; + return r; +} + +/* returns a*b*2^-128 - e, with error 0 <= e < 7. */ +static inline u128 mul128(u128 a, u128 b) +{ + u128 hi = mul64_128(a.hi, b.hi); + uint64_t m1 = mul64(a.hi, b.lo); + uint64_t m2 = mul64(a.lo, b.hi); + return add64(add64(hi, m1), m2); +} + +/* returns a*b % 2^128. */ +static inline u128 mul128_tail(u128 a, u128 b) +{ + u128 lo = mul64_128(a.lo, b.lo); + lo.hi += a.hi*b.lo + a.lo*b.hi; + return lo; +} + + +/* see sqrt.c for detailed comments. */ + +long double sqrtl(long double x) +{ + u128 ix, ml; + uint64_t top; + + ix = asu128(x); + top = ix.hi >> 48; + if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) { + /* x < 0x1p-16382 or inf or nan. */ + if (2*ix.hi == 0 && ix.lo == 0) + return x; + if (ix.hi == 0x7fff000000000000 && ix.lo == 0) + return x; + if (top >= 0x7fff) + return __math_invalidl(x); + /* x is subnormal, normalize it. */ + ix = asu128(x * 0x1p112); + top = ix.hi >> 48; + top -= 112; + } + + /* x = 4^e m; with int e and m in [1, 4) */ + int even = top & 1; + ml = lsh(ix, 15); + ml.hi |= 0x8000000000000000; + if (even) ml = rsh(ml, 1); + top = (top + 0x3fff) >> 1; + + /* r ~ 1/sqrt(m) */ + const uint64_t three = 0xc0000000; + uint64_t r, s, d, u, i; + i = (ix.hi >> 42) % 128; + r = (uint32_t)__rsqrt_tab[i] << 16; + /* |r sqrt(m) - 1| < 0x1p-8 */ + s = mul32(ml.hi>>32, r); + d = mul32(s, r); + u = three - d; + r = mul32(u, r) << 1; + /* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */ + r = r<<32; + s = mul64(ml.hi, r); + d = mul64(s, r); + u = (three<<32) - d; + r = mul64(u, r) << 1; + /* |r sqrt(m) - 1| < 0x1.a5p-31 */ + s = mul64(u, s) << 1; + d = mul64(s, r); + u = (three<<32) - d; + r = mul64(u, r) << 1; + /* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */ + + const u128 threel = {.hi=three<<32, .lo=0}; + u128 rl, sl, dl, ul; + rl.hi = r; + rl.lo = 0; + sl = mul128(ml, rl); + dl = mul128(sl, rl); + ul = sub128(threel, dl); + sl = mul128(ul, sl); /* repr: 3.125 */ + /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */ + sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1)); + /* s < sqrt(m) < s + 1 ULP + tiny */ + + long double y; + u128 d2, d1, d0; + d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl)); + d1 = sub128(sl, d0); + d2 = add128(add64(sl, 1), d1); + sl = add64(sl, d1.hi >> 63); + y = mkldbl(top, sl); + if (FENV_SUPPORT) { + /* handle rounding modes and inexact exception. */ + top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1; + top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48; + y += mkldbl(top, (u128){0}); + } + return y; +} +#else +#error unsupported long double format +#endif diff --git a/src/math/x86_64/fabs.c b/src/math/x86_64/fabs.c new file mode 100644 index 00000000..16562477 --- /dev/null +++ b/src/math/x86_64/fabs.c @@ -0,0 +1,10 @@ +#include <math.h> + +double fabs(double x) +{ + double t; + __asm__ ("pcmpeqd %0, %0" : "=x"(t)); // t = ~0 + __asm__ ("psrlq $1, %0" : "+x"(t)); // t >>= 1 + __asm__ ("andps %1, %0" : "+x"(x) : "x"(t)); // x &= t + return x; +} diff --git a/src/math/x86_64/fabs.s b/src/math/x86_64/fabs.s deleted file mode 100644 index 5715005e..00000000 --- a/src/math/x86_64/fabs.s +++ /dev/null @@ -1,9 +0,0 @@ -.global fabs -.type fabs,@function -fabs: - xor %eax,%eax - dec %rax - shr %rax - movq %rax,%xmm1 - andpd %xmm1,%xmm0 - ret diff --git a/src/math/x86_64/fabsf.c b/src/math/x86_64/fabsf.c new file mode 100644 index 00000000..36ea7481 --- /dev/null +++ b/src/math/x86_64/fabsf.c @@ -0,0 +1,10 @@ +#include <math.h> + +float fabsf(float x) +{ + float t; + __asm__ ("pcmpeqd %0, %0" : "=x"(t)); // t = ~0 + __asm__ ("psrld $1, %0" : "+x"(t)); // t >>= 1 + __asm__ ("andps %1, %0" : "+x"(x) : "x"(t)); // x &= t + return x; +} diff --git a/src/math/x86_64/fabsf.s b/src/math/x86_64/fabsf.s deleted file mode 100644 index 501a1f17..00000000 --- a/src/math/x86_64/fabsf.s +++ /dev/null @@ -1,7 +0,0 @@ -.global fabsf -.type fabsf,@function -fabsf: - mov $0x7fffffff,%eax - movq %rax,%xmm1 - andps %xmm1,%xmm0 - ret diff --git a/src/math/x86_64/fabsl.c b/src/math/x86_64/fabsl.c new file mode 100644 index 00000000..cc1c9ed9 --- /dev/null +++ b/src/math/x86_64/fabsl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double fabsl(long double x) +{ + __asm__ ("fabs" : "+t"(x)); + return x; +} diff --git a/src/math/x86_64/fabsl.s b/src/math/x86_64/fabsl.s deleted file mode 100644 index 4e7ab525..00000000 --- a/src/math/x86_64/fabsl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabsl -.type fabsl,@function -fabsl: - fldt 8(%rsp) - fabs - ret diff --git a/src/math/x86_64/fmodl.c b/src/math/x86_64/fmodl.c new file mode 100644 index 00000000..3daeab06 --- /dev/null +++ b/src/math/x86_64/fmodl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double fmodl(long double x, long double y) +{ + unsigned short fpsr; + do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} diff --git a/src/math/x86_64/fmodl.s b/src/math/x86_64/fmodl.s deleted file mode 100644 index ea07b402..00000000 --- a/src/math/x86_64/fmodl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmodl -.type fmodl,@function -fmodl: - fldt 24(%rsp) - fldt 8(%rsp) -1: fprem - fnstsw %ax - testb $4,%ah - jnz 1b - fstp %st(1) - ret diff --git a/src/math/x86_64/llrint.c b/src/math/x86_64/llrint.c new file mode 100644 index 00000000..dd38a722 --- /dev/null +++ b/src/math/x86_64/llrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrint(double x) +{ + long long r; + __asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x)); + return r; +} diff --git a/src/math/x86_64/llrint.s b/src/math/x86_64/llrint.s deleted file mode 100644 index bf476498..00000000 --- a/src/math/x86_64/llrint.s +++ /dev/null @@ -1,5 +0,0 @@ -.global llrint -.type llrint,@function -llrint: - cvtsd2si %xmm0,%rax - ret diff --git a/src/math/x86_64/llrintf.c b/src/math/x86_64/llrintf.c new file mode 100644 index 00000000..fc8625e8 --- /dev/null +++ b/src/math/x86_64/llrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrintf(float x) +{ + long long r; + __asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x)); + return r; +} diff --git a/src/math/x86_64/llrintf.s b/src/math/x86_64/llrintf.s deleted file mode 100644 index d7204ac0..00000000 --- a/src/math/x86_64/llrintf.s +++ /dev/null @@ -1,5 +0,0 @@ -.global llrintf -.type llrintf,@function -llrintf: - cvtss2si %xmm0,%rax - ret diff --git a/src/math/x86_64/llrintl.c b/src/math/x86_64/llrintl.c new file mode 100644 index 00000000..c439ef28 --- /dev/null +++ b/src/math/x86_64/llrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrintl(long double x) +{ + long long r; + __asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/x86_64/llrintl.s b/src/math/x86_64/llrintl.s deleted file mode 100644 index 1ec0817d..00000000 --- a/src/math/x86_64/llrintl.s +++ /dev/null @@ -1,7 +0,0 @@ -.global llrintl -.type llrintl,@function -llrintl: - fldt 8(%rsp) - fistpll 8(%rsp) - mov 8(%rsp),%rax - ret diff --git a/src/math/x86_64/lrint.c b/src/math/x86_64/lrint.c new file mode 100644 index 00000000..a742fec6 --- /dev/null +++ b/src/math/x86_64/lrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrint(double x) +{ + long r; + __asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x)); + return r; +} diff --git a/src/math/x86_64/lrint.s b/src/math/x86_64/lrint.s deleted file mode 100644 index 15fc2454..00000000 --- a/src/math/x86_64/lrint.s +++ /dev/null @@ -1,5 +0,0 @@ -.global lrint -.type lrint,@function -lrint: - cvtsd2si %xmm0,%rax - ret diff --git a/src/math/x86_64/lrintf.c b/src/math/x86_64/lrintf.c new file mode 100644 index 00000000..2ba5639d --- /dev/null +++ b/src/math/x86_64/lrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintf(float x) +{ + long r; + __asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x)); + return r; +} diff --git a/src/math/x86_64/lrintf.s b/src/math/x86_64/lrintf.s deleted file mode 100644 index 488423d2..00000000 --- a/src/math/x86_64/lrintf.s +++ /dev/null @@ -1,5 +0,0 @@ -.global lrintf -.type lrintf,@function -lrintf: - cvtss2si %xmm0,%rax - ret diff --git a/src/math/x86_64/lrintl.c b/src/math/x86_64/lrintl.c new file mode 100644 index 00000000..068e2e4d --- /dev/null +++ b/src/math/x86_64/lrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintl(long double x) +{ + long r; + __asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); + return r; +} diff --git a/src/math/x86_64/lrintl.s b/src/math/x86_64/lrintl.s deleted file mode 100644 index d587b12b..00000000 --- a/src/math/x86_64/lrintl.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrintl -.type lrintl,@function -lrintl: - fldt 8(%rsp) - fistpll 8(%rsp) - mov 8(%rsp),%rax - ret diff --git a/src/math/x86_64/remainderl.c b/src/math/x86_64/remainderl.c new file mode 100644 index 00000000..8cf75071 --- /dev/null +++ b/src/math/x86_64/remainderl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double remainderl(long double x, long double y) +{ + unsigned short fpsr; + do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + return x; +} diff --git a/src/math/x86_64/remainderl.s b/src/math/x86_64/remainderl.s deleted file mode 100644 index cb3857b4..00000000 --- a/src/math/x86_64/remainderl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global remainderl -.type remainderl,@function -remainderl: - fldt 24(%rsp) - fldt 8(%rsp) -1: fprem1 - fnstsw %ax - testb $4,%ah - jnz 1b - fstp %st(1) - ret diff --git a/src/math/x86_64/remquol.c b/src/math/x86_64/remquol.c new file mode 100644 index 00000000..60eef089 --- /dev/null +++ b/src/math/x86_64/remquol.c @@ -0,0 +1,32 @@ +#include <math.h> + +long double remquol(long double x, long double y, int *quo) +{ + signed char *cx = (void *)&x, *cy = (void *)&y; + /* By ensuring that addresses of x and y cannot be discarded, + * this empty asm guides GCC into representing extraction of + * their sign bits as memory loads rather than making x and y + * not-address-taken internally and using bitfield operations, + * which in the end wouldn't work out, as extraction from FPU + * registers needs to go through memory anyway. This way GCC + * should manage to use incoming stack slots without spills. */ + __asm__ ("" :: "X"(cx), "X"(cy)); + + long double t = x; + unsigned fpsr; + do __asm__ ("fprem1; fnstsw %%ax" : "+t"(t), "=a"(fpsr) : "u"(y)); + while (fpsr & 0x400); + /* C0, C1, C3 flags in x87 status word carry low bits of quotient: + * 15 14 13 12 11 10 9 8 + * . C3 . . . C2 C1 C0 + * . b1 . . . 0 b0 b2 */ + unsigned char i = fpsr >> 8; + i = i>>4 | i<<4; + /* i[5:2] is now {b0 b2 ? b1}. Retrieve {0 b2 b1 b0} via + * in-register table lookup. */ + unsigned qbits = 0x7575313164642020 >> (i & 60); + qbits &= 7; + + *quo = (cx[9]^cy[9]) < 0 ? -qbits : qbits; + return t; +} diff --git a/src/math/x86_64/rintl.c b/src/math/x86_64/rintl.c new file mode 100644 index 00000000..e1a92077 --- /dev/null +++ b/src/math/x86_64/rintl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double rintl(long double x) +{ + __asm__ ("frndint" : "+t"(x)); + return x; +} diff --git a/src/math/x86_64/rintl.s b/src/math/x86_64/rintl.s deleted file mode 100644 index 64e663cd..00000000 --- a/src/math/x86_64/rintl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rintl -.type rintl,@function -rintl: - fldt 8(%rsp) - frndint - ret diff --git a/src/math/x86_64/sqrt.c b/src/math/x86_64/sqrt.c new file mode 100644 index 00000000..657e09e3 --- /dev/null +++ b/src/math/x86_64/sqrt.c @@ -0,0 +1,7 @@ +#include <math.h> + +double sqrt(double x) +{ + __asm__ ("sqrtsd %1, %0" : "=x"(x) : "x"(x)); + return x; +} diff --git a/src/math/x86_64/sqrt.s b/src/math/x86_64/sqrt.s deleted file mode 100644 index d3c609f9..00000000 --- a/src/math/x86_64/sqrt.s +++ /dev/null @@ -1,4 +0,0 @@ -.global sqrt -.type sqrt,@function -sqrt: sqrtsd %xmm0, %xmm0 - ret diff --git a/src/math/x86_64/sqrtf.c b/src/math/x86_64/sqrtf.c new file mode 100644 index 00000000..720baec6 --- /dev/null +++ b/src/math/x86_64/sqrtf.c @@ -0,0 +1,7 @@ +#include <math.h> + +float sqrtf(float x) +{ + __asm__ ("sqrtss %1, %0" : "=x"(x) : "x"(x)); + return x; +} diff --git a/src/math/x86_64/sqrtf.s b/src/math/x86_64/sqrtf.s deleted file mode 100644 index eec48c60..00000000 --- a/src/math/x86_64/sqrtf.s +++ /dev/null @@ -1,4 +0,0 @@ -.global sqrtf -.type sqrtf,@function -sqrtf: sqrtss %xmm0, %xmm0 - ret diff --git a/src/math/x86_64/sqrtl.c b/src/math/x86_64/sqrtl.c new file mode 100644 index 00000000..864cfcc4 --- /dev/null +++ b/src/math/x86_64/sqrtl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double sqrtl(long double x) +{ + __asm__ ("fsqrt" : "+t"(x)); + return x; +} diff --git a/src/math/x86_64/sqrtl.s b/src/math/x86_64/sqrtl.s deleted file mode 100644 index 23cd687d..00000000 --- a/src/math/x86_64/sqrtl.s +++ /dev/null @@ -1,5 +0,0 @@ -.global sqrtl -.type sqrtl,@function -sqrtl: fldt 8(%rsp) - fsqrt - ret |