diff options
Diffstat (limited to 'src/math')
194 files changed, 3959 insertions, 2262 deletions
| diff --git a/src/math/__expo2.c b/src/math/__expo2.c index 740ac680..248f052b 100644 --- a/src/math/__expo2.c +++ b/src/math/__expo2.c @@ -5,12 +5,13 @@ static const int k = 2043;  static const double kln2 = 0x1.62066151add8bp+10;  /* exp(x)/2 for x >= log(DBL_MAX), slightly better than 0.5*exp(x/2)*exp(x/2) */ -double __expo2(double x) +double __expo2(double x, double sign)  {  	double scale;  	/* note that k is odd and scale*scale overflows */  	INSERT_WORDS(scale, (uint32_t)(0x3ff + k/2) << 20, 0);  	/* exp(x - k ln2) * 2**(k-1) */ -	return exp(x - kln2) * scale * scale; +	/* in directed rounding correct sign before rounding or overflow is important */ +	return exp(x - kln2) * (sign * scale) * scale;  } diff --git a/src/math/__expo2f.c b/src/math/__expo2f.c index 5163e418..538eb09c 100644 --- a/src/math/__expo2f.c +++ b/src/math/__expo2f.c @@ -5,12 +5,13 @@ static const int k = 235;  static const float kln2 = 0x1.45c778p+7f;  /* expf(x)/2 for x >= log(FLT_MAX), slightly better than 0.5f*expf(x/2)*expf(x/2) */ -float __expo2f(float x) +float __expo2f(float x, float sign)  {  	float scale;  	/* note that k is odd and scale*scale overflows */  	SET_FLOAT_WORD(scale, (uint32_t)(0x7f + k/2) << 23);  	/* exp(x - k ln2) * 2**(k-1) */ -	return expf(x - kln2) * scale * scale; +	/* in directed rounding correct sign before rounding or overflow is important */ +	return expf(x - kln2) * (sign * scale) * scale;  } diff --git a/src/math/__math_divzero.c b/src/math/__math_divzero.c new file mode 100644 index 00000000..59d21350 --- /dev/null +++ b/src/math/__math_divzero.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_divzero(uint32_t sign) +{ +	return fp_barrier(sign ? -1.0 : 1.0) / 0.0; +} diff --git a/src/math/__math_divzerof.c b/src/math/__math_divzerof.c new file mode 100644 index 00000000..ce046f3e --- /dev/null +++ b/src/math/__math_divzerof.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_divzerof(uint32_t sign) +{ +	return fp_barrierf(sign ? 
-1.0f : 1.0f) / 0.0f; +} diff --git a/src/math/__math_invalid.c b/src/math/__math_invalid.c new file mode 100644 index 00000000..17740490 --- /dev/null +++ b/src/math/__math_invalid.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_invalid(double x) +{ +	return (x - x) / (x - x); +} diff --git a/src/math/__math_invalidf.c b/src/math/__math_invalidf.c new file mode 100644 index 00000000..357d4b12 --- /dev/null +++ b/src/math/__math_invalidf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_invalidf(float x) +{ +	return (x - x) / (x - x); +} diff --git a/src/math/__math_invalidl.c b/src/math/__math_invalidl.c new file mode 100644 index 00000000..1fca99de --- /dev/null +++ b/src/math/__math_invalidl.c @@ -0,0 +1,9 @@ +#include <float.h> +#include "libm.h" + +#if LDBL_MANT_DIG != DBL_MANT_DIG +long double __math_invalidl(long double x) +{ +	return (x - x) / (x - x); +} +#endif diff --git a/src/math/__math_oflow.c b/src/math/__math_oflow.c new file mode 100644 index 00000000..c85dbf98 --- /dev/null +++ b/src/math/__math_oflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_oflow(uint32_t sign) +{ +	return __math_xflow(sign, 0x1p769); +} diff --git a/src/math/__math_oflowf.c b/src/math/__math_oflowf.c new file mode 100644 index 00000000..fa7d0620 --- /dev/null +++ b/src/math/__math_oflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_oflowf(uint32_t sign) +{ +	return __math_xflowf(sign, 0x1p97f); +} diff --git a/src/math/__math_uflow.c b/src/math/__math_uflow.c new file mode 100644 index 00000000..b90594ae --- /dev/null +++ b/src/math/__math_uflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_uflow(uint32_t sign) +{ +	return __math_xflow(sign, 0x1p-767); +} diff --git a/src/math/__math_uflowf.c b/src/math/__math_uflowf.c new file mode 100644 index 00000000..94d50f2b --- /dev/null +++ b/src/math/__math_uflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_uflowf(uint32_t sign) +{ +	return __math_xflowf(sign, 0x1p-95f); +} diff --git 
a/src/math/__math_xflow.c b/src/math/__math_xflow.c new file mode 100644 index 00000000..744203c4 --- /dev/null +++ b/src/math/__math_xflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_xflow(uint32_t sign, double y) +{ +	return eval_as_double(fp_barrier(sign ? -y : y) * y); +} diff --git a/src/math/__math_xflowf.c b/src/math/__math_xflowf.c new file mode 100644 index 00000000..f2c84784 --- /dev/null +++ b/src/math/__math_xflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_xflowf(uint32_t sign, float y) +{ +	return eval_as_float(fp_barrierf(sign ? -y : y) * y); +} diff --git a/src/math/__rem_pio2.c b/src/math/__rem_pio2.c index d403f81c..dcf672fb 100644 --- a/src/math/__rem_pio2.c +++ b/src/math/__rem_pio2.c @@ -36,6 +36,7 @@   */  static const double  toint   = 1.5/EPS, +pio4    = 0x1.921fb54442d18p-1,  invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */  pio2_1  = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */  pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */ @@ -117,11 +118,23 @@ int __rem_pio2(double x, double *y)  	}  	if (ix < 0x413921fb) {  /* |x| ~< 2^20*(pi/2), medium size */  medium: -		/* rint(x/(pi/2)), Assume round-to-nearest. */ +		/* rint(x/(pi/2)) */  		fn = (double_t)x*invpio2 + toint - toint;  		n = (int32_t)fn;  		r = x - fn*pio2_1;  		w = fn*pio2_1t;  /* 1st round, good to 85 bits */ +		/* Matters with directed rounding. 
*/ +		if (predict_false(r - w < -pio4)) { +			n--; +			fn--; +			r = x - fn*pio2_1; +			w = fn*pio2_1t; +		} else if (predict_false(r - w > pio4)) { +			n++; +			fn++; +			r = x - fn*pio2_1; +			w = fn*pio2_1t; +		}  		y[0] = r - w;  		u.f = y[0];  		ey = u.i>>52 & 0x7ff; diff --git a/src/math/__rem_pio2f.c b/src/math/__rem_pio2f.c index 4473c1c4..e6765643 100644 --- a/src/math/__rem_pio2f.c +++ b/src/math/__rem_pio2f.c @@ -35,6 +35,7 @@   */  static const double  toint   = 1.5/EPS, +pio4    = 0x1.921fb6p-1,  invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */  pio2_1  = 1.57079631090164184570e+00, /* 0x3FF921FB, 0x50000000 */  pio2_1t = 1.58932547735281966916e-08; /* 0x3E5110b4, 0x611A6263 */ @@ -50,10 +51,20 @@ int __rem_pio2f(float x, double *y)  	ix = u.i & 0x7fffffff;  	/* 25+53 bit pi is good enough for medium size */  	if (ix < 0x4dc90fdb) {  /* |x| ~< 2^28*(pi/2), medium size */ -		/* Use a specialized rint() to get fn.  Assume round-to-nearest. */ +		/* Use a specialized rint() to get fn. */  		fn = (double_t)x*invpio2 + toint - toint;  		n  = (int32_t)fn;  		*y = x - fn*pio2_1 - fn*pio2_1t; +		/* Matters with directed rounding. 
*/ +		if (predict_false(*y < -pio4)) { +			n--; +			fn--; +			*y = x - fn*pio2_1 - fn*pio2_1t; +		} else if (predict_false(*y > pio4)) { +			n++; +			fn++; +			*y = x - fn*pio2_1 - fn*pio2_1t; +		}  		return n;  	}  	if(ix>=0x7f800000) {  /* x is inf or NaN */ diff --git a/src/math/__rem_pio2l.c b/src/math/__rem_pio2l.c index 77255bd8..236b2def 100644 --- a/src/math/__rem_pio2l.c +++ b/src/math/__rem_pio2l.c @@ -44,6 +44,7 @@ pio2_1 =  1.57079632679597125389e+00, /* 0x3FF921FB, 0x54444000 */  pio2_2 = -1.07463465549783099519e-12, /* -0x12e7b967674000.0p-92 */  pio2_3 =  6.36831716351370313614e-25; /*  0x18a2e037074000.0p-133 */  static const long double +pio4    =  0x1.921fb54442d1846ap-1L,  invpio2 =  6.36619772367581343076e-01L, /*  0xa2f9836e4e44152a.0p-64 */  pio2_1t = -1.07463465549719416346e-12L, /* -0x973dcb3b399d747f.0p-103 */  pio2_2t =  6.36831716351095013979e-25L, /*  0xc51701b839a25205.0p-144 */ @@ -57,6 +58,7 @@ pio2_3t = -2.75299651904407171810e-37L; /* -0xbb5bf6c7ddd660ce.0p-185 */  #define NX 5  #define NY 3  static const long double +pio4    =  0x1.921fb54442d18469898cc51701b8p-1L,  invpio2 =  6.3661977236758134307553505349005747e-01L,	/*  0x145f306dc9c882a53f84eafa3ea6a.0p-113 */  pio2_1  =  1.5707963267948966192292994253909555e+00L,	/*  0x1921fb54442d18469800000000000.0p-112 */  pio2_1t =  2.0222662487959507323996846200947577e-21L,	/*  0x13198a2e03707344a4093822299f3.0p-181 */ @@ -76,11 +78,23 @@ int __rem_pio2l(long double x, long double *y)  	u.f = x;  	ex = u.i.se & 0x7fff;  	if (SMALL(u)) { -		/* rint(x/(pi/2)), Assume round-to-nearest. */ +		/* rint(x/(pi/2)) */  		fn = x*invpio2 + toint - toint;  		n = QUOBITS(fn);  		r = x-fn*pio2_1;  		w = fn*pio2_1t;  /* 1st round good to 102/180 bits (ld80/ld128) */ +		/* Matters with directed rounding. 
*/ +		if (predict_false(r - w < -pio4)) { +			n--; +			fn--; +			r = x - fn*pio2_1; +			w = fn*pio2_1t; +		} else if (predict_false(r - w > pio4)) { +			n++; +			fn++; +			r = x - fn*pio2_1; +			w = fn*pio2_1t; +		}  		y[0] = r-w;  		u.f = y[0];  		ey = u.i.se & 0x7fff; diff --git a/src/math/acoshf.c b/src/math/acoshf.c index 8a4ec4d5..b773d48e 100644 --- a/src/math/acoshf.c +++ b/src/math/acoshf.c @@ -15,12 +15,12 @@ float acoshf(float x)  	uint32_t a = u.i & 0x7fffffff;  	if (a < 0x3f800000+(1<<23)) -		/* |x| < 2, invalid if x < 1 or nan */ +		/* |x| < 2, invalid if x < 1 */  		/* up to 2ulp error in [1,1.125] */  		return log1pf(x-1 + sqrtf((x-1)*(x-1)+2*(x-1))); -	if (a < 0x3f800000+(12<<23)) -		/* |x| < 0x1p12 */ +	if (u.i < 0x3f800000+(12<<23)) +		/* 2 <= x < 0x1p12 */  		return logf(2*x - 1/(x+sqrtf(x*x-1))); -	/* x >= 0x1p12 */ +	/* x >= 0x1p12 or x <= -2 or nan */  	return logf(x) + 0.693147180559945309417232121458176568f;  } diff --git a/src/math/acoshl.c b/src/math/acoshl.c index 8d4b43f6..943cec17 100644 --- a/src/math/acoshl.c +++ b/src/math/acoshl.c @@ -10,14 +10,18 @@ long double acoshl(long double x)  long double acoshl(long double x)  {  	union ldshape u = {x}; -	int e = u.i.se & 0x7fff; +	int e = u.i.se;  	if (e < 0x3fff + 1) -		/* |x| < 2, invalid if x < 1 or nan */ +		/* 0 <= x < 2, invalid if x < 1 */  		return log1pl(x-1 + sqrtl((x-1)*(x-1)+2*(x-1)));  	if (e < 0x3fff + 32) -		/* |x| < 0x1p32 */ +		/* 2 <= x < 0x1p32 */  		return logl(2*x - 1/(x+sqrtl(x*x-1))); +	if (e & 0x8000) +		/* x < 0 or x = -0, invalid */ +		return (x - x) / (x - x); +	/* 0x1p32 <= x or nan */  	return logl(x) + 0.693147180559945309417232121458176568L;  }  #elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384 diff --git a/src/math/arm/fabs.c b/src/math/arm/fabs.c index f890520a..6e1d367d 100644 --- a/src/math/arm/fabs.c +++ b/src/math/arm/fabs.c @@ -1,6 +1,6 @@  #include <math.h> -#if __ARM_PCS_VFP +#if __ARM_PCS_VFP && __ARM_FP&8  double fabs(double x)  { diff --git 
a/src/math/arm/sqrt.c b/src/math/arm/sqrt.c index 874af960..567e2e91 100644 --- a/src/math/arm/sqrt.c +++ b/src/math/arm/sqrt.c @@ -1,6 +1,6 @@  #include <math.h> -#if __ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__) +#if (__ARM_PCS_VFP || (__VFP_FP__ && !__SOFTFP__)) && (__ARM_FP&8)  double sqrt(double x)  { diff --git a/src/math/atanl.c b/src/math/atanl.c index 79a3edb8..c3b0c926 100644 --- a/src/math/atanl.c +++ b/src/math/atanl.c @@ -70,21 +70,21 @@ static long double T_odd(long double x)  #elif LDBL_MANT_DIG == 113  #define EXPMAN(u) ((u.i.se & 0x7fff)<<8 | u.i.top>>8) -const long double atanhi[] = { +static const long double atanhi[] = {  	 4.63647609000806116214256231461214397e-01L,  	 7.85398163397448309615660845819875699e-01L,  	 9.82793723247329067985710611014666038e-01L,  	 1.57079632679489661923132169163975140e+00L,  }; -const long double atanlo[] = { +static const long double atanlo[] = {  	 4.89509642257333492668618435220297706e-36L,  	 2.16795253253094525619926100651083806e-35L,  	-2.31288434538183565909319952098066272e-35L,  	 4.33590506506189051239852201302167613e-35L,  }; -const long double aT[] = { +static const long double aT[] = {  	 3.33333333333333333333333333333333125e-01L,  	-1.99999999999999999999999999999180430e-01L,  	 1.42857142857142857142857142125269827e-01L, diff --git a/src/math/cosh.c b/src/math/cosh.c index 100f8231..490c15fb 100644 --- a/src/math/cosh.c +++ b/src/math/cosh.c @@ -35,6 +35,6 @@ double cosh(double x)  	/* |x| > log(DBL_MAX) or nan */  	/* note: the result is stored to handle overflow */ -	t = __expo2(x); +	t = __expo2(x, 1.0);  	return t;  } diff --git a/src/math/coshf.c b/src/math/coshf.c index b09f2ee5..e739cff9 100644 --- a/src/math/coshf.c +++ b/src/math/coshf.c @@ -28,6 +28,6 @@ float coshf(float x)  	}  	/* |x| > log(FLT_MAX) or nan */ -	t = __expo2f(x); +	t = __expo2f(x, 1.0f);  	return t;  } diff --git a/src/math/exp.c b/src/math/exp.c index 9ea672fa..b764d73c 100644 --- a/src/math/exp.c +++ b/src/math/exp.c @@ 
-1,134 +1,134 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_exp.c */  /* - * ==================================================== - * Copyright (C) 2004 by Sun Microsystems, Inc. All rights reserved. + * Double-precision e^x function.   * - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ -/* exp(x) - * Returns the exponential of x. - * - * Method - *   1. Argument reduction: - *      Reduce x to an r so that |r| <= 0.5*ln2 ~ 0.34658. - *      Given x, find r and integer k such that - * - *               x = k*ln2 + r,  |r| <= 0.5*ln2. - * - *      Here r will be represented as r = hi-lo for better - *      accuracy. - * - *   2. Approximation of exp(r) by a special rational function on - *      the interval [0,0.34658]: - *      Write - *          R(r**2) = r*(exp(r)+1)/(exp(r)-1) = 2 + r*r/6 - r**4/360 + ... - *      We use a special Remez algorithm on [0,0.34658] to generate - *      a polynomial of degree 5 to approximate R. The maximum error - *      of this polynomial approximation is bounded by 2**-59. In - *      other words, - *          R(z) ~ 2.0 + P1*z + P2*z**2 + P3*z**3 + P4*z**4 + P5*z**5 - *      (where z=r*r, and the values of P1 to P5 are listed below) - *      and - *          |                  5          |     -59 - *          | 2.0+P1*z+...+P5*z   -  R(z) | <= 2 - *          |                             | - *      The computation of exp(r) thus becomes - *                              2*r - *              exp(r) = 1 + ---------- - *                            R(r) - r - *                                 r*c(r) - *                     = 1 + r + ----------- (for better accuracy) - *                                2 - c(r) - *      where - *                              2       4             10 - *              c(r) = r - (P1*r  + P2*r  + ... + P5*r   ). - * - *   3. 
Scale back to obtain exp(x): - *      From step 1, we have - *         exp(x) = 2^k * exp(r) - * - * Special cases: - *      exp(INF) is INF, exp(NaN) is NaN; - *      exp(-INF) is 0, and - *      for finite argument, only exp(0)=1 is exact. - * - * Accuracy: - *      according to an error analysis, the error is always less than - *      1 ulp (unit in the last place). - * - * Misc. info. - *      For IEEE double - *          if x >  709.782712893383973096 then exp(x) overflows - *          if x < -745.133219101941108420 then exp(x) underflows + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT   */ +#include <math.h> +#include <stdint.h>  #include "libm.h" +#include "exp_data.h" -static const double -half[2] = {0.5,-0.5}, -ln2hi = 6.93147180369123816490e-01, /* 0x3fe62e42, 0xfee00000 */ -ln2lo = 1.90821492927058770002e-10, /* 0x3dea39ef, 0x35793c76 */ -invln2 = 1.44269504088896338700e+00, /* 0x3ff71547, 0x652b82fe */ -P1   =  1.66666666666666019037e-01, /* 0x3FC55555, 0x5555553E */ -P2   = -2.77777777770155933842e-03, /* 0xBF66C16C, 0x16BEBD93 */ -P3   =  6.61375632143793436117e-05, /* 0x3F11566A, 0xAF25DE2C */ -P4   = -1.65339022054652515390e-06, /* 0xBEBBBD41, 0xC5D26BF1 */ -P5   =  4.13813679705723846039e-08; /* 0x3E663769, 0x72BEA4D0 */ +#define N (1 << EXP_TABLE_BITS) +#define InvLn2N __exp_data.invln2N +#define NegLn2hiN __exp_data.negln2hiN +#define NegLn2loN __exp_data.negln2loN +#define Shift __exp_data.shift +#define T __exp_data.tab +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER] +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER] +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER] +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER] -double exp(double x) +/* Handle cases that may overflow or underflow when computing the result that +   is scale*(1+TMP) without intermediate rounding.  
The bit representation of +   scale is in SBITS, however it has a computed exponent that may have +   overflown into the sign bit so that needs to be adjusted before using it as +   a double.  (int32_t)KI is the k used in the argument reduction and exponent +   adjustment of scale, positive k here means the result may overflow and +   negative k means the result may underflow.  */ +static inline double specialcase(double_t tmp, uint64_t sbits, uint64_t ki)  { -	double_t hi, lo, c, xx, y; -	int k, sign; -	uint32_t hx; - -	GET_HIGH_WORD(hx, x); -	sign = hx>>31; -	hx &= 0x7fffffff;  /* high word of |x| */ +	double_t scale, y; -	/* special cases */ -	if (hx >= 0x4086232b) {  /* if |x| >= 708.39... */ -		if (isnan(x)) -			return x; -		if (x > 709.782712893383973096) { -			/* overflow if x!=inf */ -			x *= 0x1p1023; -			return x; -		} -		if (x < -708.39641853226410622) { -			/* underflow if x!=-inf */ -			FORCE_EVAL((float)(-0x1p-149/x)); -			if (x < -745.13321910194110842) -				return 0; -		} +	if ((ki & 0x80000000) == 0) { +		/* k > 0, the exponent of scale might have overflowed by <= 460.  */ +		sbits -= 1009ull << 52; +		scale = asdouble(sbits); +		y = 0x1p1009 * (scale + scale * tmp); +		return eval_as_double(y); +	} +	/* k < 0, need special care in the subnormal range.  */ +	sbits += 1022ull << 52; +	scale = asdouble(sbits); +	y = scale + scale * tmp; +	if (y < 1.0) { +		/* Round y to the right precision before scaling it into the subnormal +		 range to avoid double rounding that can cause 0.5+E/2 ulp error where +		 E is the worst-case ulp error outside the subnormal range.  So this +		 is only useful if the goal is better than 1 ulp worst-case error.  */ +		double_t hi, lo; +		lo = scale - y + scale * tmp; +		hi = 1.0 + y; +		lo = 1.0 - hi + y + lo; +		y = eval_as_double(hi + lo) - 1.0; +		/* Avoid -0.0 with downward rounding.  */ +		if (WANT_ROUNDING && y == 0.0) +			y = 0.0; +		/* The underflow exception needs to be signaled explicitly.  
*/ +		fp_force_eval(fp_barrier(0x1p-1022) * 0x1p-1022);  	} +	y = 0x1p-1022 * y; +	return eval_as_double(y); +} -	/* argument reduction */ -	if (hx > 0x3fd62e42) {  /* if |x| > 0.5 ln2 */ -		if (hx >= 0x3ff0a2b2)  /* if |x| >= 1.5 ln2 */ -			k = (int)(invln2*x + half[sign]); -		else -			k = 1 - sign - sign; -		hi = x - k*ln2hi;  /* k*ln2hi is exact here */ -		lo = k*ln2lo; -		x = hi - lo; -	} else if (hx > 0x3e300000)  {  /* if |x| > 2**-28 */ -		k = 0; -		hi = x; -		lo = 0; -	} else { -		/* inexact if x!=0 */ -		FORCE_EVAL(0x1p1023 + x); -		return 1 + x; +/* Top 12 bits of a double (sign and exponent bits).  */ +static inline uint32_t top12(double x) +{ +	return asuint64(x) >> 52; +} + +double exp(double x) +{ +	uint32_t abstop; +	uint64_t ki, idx, top, sbits; +	double_t kd, z, r, r2, scale, tail, tmp; + +	abstop = top12(x) & 0x7ff; +	if (predict_false(abstop - top12(0x1p-54) >= top12(512.0) - top12(0x1p-54))) { +		if (abstop - top12(0x1p-54) >= 0x80000000) +			/* Avoid spurious underflow for tiny x.  */ +			/* Note: 0 is common input.  */ +			return WANT_ROUNDING ? 1.0 + x : 1.0; +		if (abstop >= top12(1024.0)) { +			if (asuint64(x) == asuint64(-INFINITY)) +				return 0.0; +			if (abstop >= top12(INFINITY)) +				return 1.0 + x; +			if (asuint64(x) >> 63) +				return __math_uflow(0); +			else +				return __math_oflow(0); +		} +		/* Large x is special cased below.  */ +		abstop = 0;  	} -	/* x is now in primary range */ -	xx = x*x; -	c = x - xx*(P1+xx*(P2+xx*(P3+xx*(P4+xx*P5)))); -	y = 1 + (x*c/(2-c) - lo + hi); -	if (k == 0) -		return y; -	return scalbn(y, k); +	/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */ +	/* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */ +	z = InvLn2N * x; +#if TOINT_INTRINSICS +	kd = roundtoint(z); +	ki = converttoint(z); +#elif EXP_USE_TOINT_NARROW +	/* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes.  
*/ +	kd = eval_as_double(z + Shift); +	ki = asuint64(kd) >> 16; +	kd = (double_t)(int32_t)ki; +#else +	/* z - kd is in [-1, 1] in non-nearest rounding modes.  */ +	kd = eval_as_double(z + Shift); +	ki = asuint64(kd); +	kd -= Shift; +#endif +	r = x + kd * NegLn2hiN + kd * NegLn2loN; +	/* 2^(k/N) ~= scale * (1 + tail).  */ +	idx = 2 * (ki % N); +	top = ki << (52 - EXP_TABLE_BITS); +	tail = asdouble(T[idx]); +	/* This is only a valid scale when -1023*N < k < 1024*N.  */ +	sbits = T[idx + 1] + top; +	/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */ +	/* Evaluation is optimized assuming superscalar pipelined execution.  */ +	r2 = r * r; +	/* Without fma the worst case error is 0.25/N ulp larger.  */ +	/* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */ +	tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); +	if (predict_false(abstop == 0)) +		return specialcase(tmp, sbits, ki); +	scale = asdouble(sbits); +	/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there +	   is no spurious underflow here even without fma.  */ +	return eval_as_double(scale + scale * tmp);  } diff --git a/src/math/exp2.c b/src/math/exp2.c index e14adba5..e0ff54bd 100644 --- a/src/math/exp2.c +++ b/src/math/exp2.c @@ -1,375 +1,121 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/s_exp2.c */ -/*- - * Copyright (c) 2005 David Schultz <das@FreeBSD.ORG> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. +/* + * Double-precision 2^x function.   
* - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT   */ +#include <math.h> +#include <stdint.h>  #include "libm.h" +#include "exp_data.h" -#define TBLSIZE 256 +#define N (1 << EXP_TABLE_BITS) +#define Shift __exp_data.exp2_shift +#define T __exp_data.tab +#define C1 __exp_data.exp2_poly[0] +#define C2 __exp_data.exp2_poly[1] +#define C3 __exp_data.exp2_poly[2] +#define C4 __exp_data.exp2_poly[3] +#define C5 __exp_data.exp2_poly[4] -static const double -redux = 0x1.8p52 / TBLSIZE, -P1    = 0x1.62e42fefa39efp-1, -P2    = 0x1.ebfbdff82c575p-3, -P3    = 0x1.c6b08d704a0a6p-5, -P4    = 0x1.3b2ab88f70400p-7, -P5    = 0x1.5d88003875c74p-10; +/* Handle cases that may overflow or underflow when computing the result that +   is scale*(1+TMP) without intermediate rounding.  The bit representation of +   scale is in SBITS, however it has a computed exponent that may have +   overflown into the sign bit so that needs to be adjusted before using it as +   a double.  (int32_t)KI is the k used in the argument reduction and exponent +   adjustment of scale, positive k here means the result may overflow and +   negative k means the result may underflow.  
*/ +static inline double specialcase(double_t tmp, uint64_t sbits, uint64_t ki) +{ +	double_t scale, y; -static const double tbl[TBLSIZE * 2] = { -/*  exp2(z + eps)          eps     */ -  0x1.6a09e667f3d5dp-1,  0x1.9880p-44, -  0x1.6b052fa751744p-1,  0x1.8000p-50, -  0x1.6c012750bd9fep-1, -0x1.8780p-45, -  0x1.6cfdcddd476bfp-1,  0x1.ec00p-46, -  0x1.6dfb23c651a29p-1, -0x1.8000p-50, -  0x1.6ef9298593ae3p-1, -0x1.c000p-52, -  0x1.6ff7df9519386p-1, -0x1.fd80p-45, -  0x1.70f7466f42da3p-1, -0x1.c880p-45, -  0x1.71f75e8ec5fc3p-1,  0x1.3c00p-46, -  0x1.72f8286eacf05p-1, -0x1.8300p-44, -  0x1.73f9a48a58152p-1, -0x1.0c00p-47, -  0x1.74fbd35d7ccfcp-1,  0x1.f880p-45, -  0x1.75feb564267f1p-1,  0x1.3e00p-47, -  0x1.77024b1ab6d48p-1, -0x1.7d00p-45, -  0x1.780694fde5d38p-1, -0x1.d000p-50, -  0x1.790b938ac1d00p-1,  0x1.3000p-49, -  0x1.7a11473eb0178p-1, -0x1.d000p-49, -  0x1.7b17b0976d060p-1,  0x1.0400p-45, -  0x1.7c1ed0130c133p-1,  0x1.0000p-53, -  0x1.7d26a62ff8636p-1, -0x1.6900p-45, -  0x1.7e2f336cf4e3bp-1, -0x1.2e00p-47, -  0x1.7f3878491c3e8p-1, -0x1.4580p-45, -  0x1.80427543e1b4ep-1,  0x1.3000p-44, -  0x1.814d2add1071ap-1,  0x1.f000p-47, -  0x1.82589994ccd7ep-1, -0x1.1c00p-45, -  0x1.8364c1eb942d0p-1,  0x1.9d00p-45, -  0x1.8471a4623cab5p-1,  0x1.7100p-43, -  0x1.857f4179f5bbcp-1,  0x1.2600p-45, -  0x1.868d99b4491afp-1, -0x1.2c40p-44, -  0x1.879cad931a395p-1, -0x1.3000p-45, -  0x1.88ac7d98a65b8p-1, -0x1.a800p-45, -  0x1.89bd0a4785800p-1, -0x1.d000p-49, -  0x1.8ace5422aa223p-1,  0x1.3280p-44, -  0x1.8be05bad619fap-1,  0x1.2b40p-43, -  0x1.8cf3216b54383p-1, -0x1.ed00p-45, -  0x1.8e06a5e08664cp-1, -0x1.0500p-45, -  0x1.8f1ae99157807p-1,  0x1.8280p-45, -  0x1.902fed0282c0ep-1, -0x1.cb00p-46, -  0x1.9145b0b91ff96p-1, -0x1.5e00p-47, -  0x1.925c353aa2ff9p-1,  0x1.5400p-48, -  0x1.93737b0cdc64ap-1,  0x1.7200p-46, -  0x1.948b82b5f98aep-1, -0x1.9000p-47, -  0x1.95a44cbc852cbp-1,  0x1.5680p-45, -  0x1.96bdd9a766f21p-1, -0x1.6d00p-44, -  0x1.97d829fde4e2ap-1, -0x1.1000p-47, -  
0x1.98f33e47a23a3p-1,  0x1.d000p-45, -  0x1.9a0f170ca0604p-1, -0x1.8a40p-44, -  0x1.9b2bb4d53ff89p-1,  0x1.55c0p-44, -  0x1.9c49182a3f15bp-1,  0x1.6b80p-45, -  0x1.9d674194bb8c5p-1, -0x1.c000p-49, -  0x1.9e86319e3238ep-1,  0x1.7d00p-46, -  0x1.9fa5e8d07f302p-1,  0x1.6400p-46, -  0x1.a0c667b5de54dp-1, -0x1.5000p-48, -  0x1.a1e7aed8eb8f6p-1,  0x1.9e00p-47, -  0x1.a309bec4a2e27p-1,  0x1.ad80p-45, -  0x1.a42c980460a5dp-1, -0x1.af00p-46, -  0x1.a5503b23e259bp-1,  0x1.b600p-47, -  0x1.a674a8af46213p-1,  0x1.8880p-44, -  0x1.a799e1330b3a7p-1,  0x1.1200p-46, -  0x1.a8bfe53c12e8dp-1,  0x1.6c00p-47, -  0x1.a9e6b5579fcd2p-1, -0x1.9b80p-45, -  0x1.ab0e521356fb8p-1,  0x1.b700p-45, -  0x1.ac36bbfd3f381p-1,  0x1.9000p-50, -  0x1.ad5ff3a3c2780p-1,  0x1.4000p-49, -  0x1.ae89f995ad2a3p-1, -0x1.c900p-45, -  0x1.afb4ce622f367p-1,  0x1.6500p-46, -  0x1.b0e07298db790p-1,  0x1.fd40p-45, -  0x1.b20ce6c9a89a9p-1,  0x1.2700p-46, -  0x1.b33a2b84f1a4bp-1,  0x1.d470p-43, -  0x1.b468415b747e7p-1, -0x1.8380p-44, -  0x1.b59728de5593ap-1,  0x1.8000p-54, -  0x1.b6c6e29f1c56ap-1,  0x1.ad00p-47, -  0x1.b7f76f2fb5e50p-1,  0x1.e800p-50, -  0x1.b928cf22749b2p-1, -0x1.4c00p-47, -  0x1.ba5b030a10603p-1, -0x1.d700p-47, -  0x1.bb8e0b79a6f66p-1,  0x1.d900p-47, -  0x1.bcc1e904bc1ffp-1,  0x1.2a00p-47, -  0x1.bdf69c3f3a16fp-1, -0x1.f780p-46, -  0x1.bf2c25bd71db8p-1, -0x1.0a00p-46, -  0x1.c06286141b2e9p-1, -0x1.1400p-46, -  0x1.c199bdd8552e0p-1,  0x1.be00p-47, -  0x1.c2d1cd9fa64eep-1, -0x1.9400p-47, -  0x1.c40ab5fffd02fp-1, -0x1.ed00p-47, -  0x1.c544778fafd15p-1,  0x1.9660p-44, -  0x1.c67f12e57d0cbp-1, -0x1.a100p-46, -  0x1.c7ba88988c1b6p-1, -0x1.8458p-42, -  0x1.c8f6d9406e733p-1, -0x1.a480p-46, -  0x1.ca3405751c4dfp-1,  0x1.b000p-51, -  0x1.cb720dcef9094p-1,  0x1.1400p-47, -  0x1.ccb0f2e6d1689p-1,  0x1.0200p-48, -  0x1.cdf0b555dc412p-1,  0x1.3600p-48, -  0x1.cf3155b5bab3bp-1, -0x1.6900p-47, -  0x1.d072d4a0789bcp-1,  0x1.9a00p-47, -  0x1.d1b532b08c8fap-1, -0x1.5e00p-46, -  0x1.d2f87080d8a85p-1,  0x1.d280p-46, -  
0x1.d43c8eacaa203p-1,  0x1.1a00p-47, -  0x1.d5818dcfba491p-1,  0x1.f000p-50, -  0x1.d6c76e862e6a1p-1, -0x1.3a00p-47, -  0x1.d80e316c9834ep-1, -0x1.cd80p-47, -  0x1.d955d71ff6090p-1,  0x1.4c00p-48, -  0x1.da9e603db32aep-1,  0x1.f900p-48, -  0x1.dbe7cd63a8325p-1,  0x1.9800p-49, -  0x1.dd321f301b445p-1, -0x1.5200p-48, -  0x1.de7d5641c05bfp-1, -0x1.d700p-46, -  0x1.dfc97337b9aecp-1, -0x1.6140p-46, -  0x1.e11676b197d5ep-1,  0x1.b480p-47, -  0x1.e264614f5a3e7p-1,  0x1.0ce0p-43, -  0x1.e3b333b16ee5cp-1,  0x1.c680p-47, -  0x1.e502ee78b3fb4p-1, -0x1.9300p-47, -  0x1.e653924676d68p-1, -0x1.5000p-49, -  0x1.e7a51fbc74c44p-1, -0x1.7f80p-47, -  0x1.e8f7977cdb726p-1, -0x1.3700p-48, -  0x1.ea4afa2a490e8p-1,  0x1.5d00p-49, -  0x1.eb9f4867ccae4p-1,  0x1.61a0p-46, -  0x1.ecf482d8e680dp-1,  0x1.5500p-48, -  0x1.ee4aaa2188514p-1,  0x1.6400p-51, -  0x1.efa1bee615a13p-1, -0x1.e800p-49, -  0x1.f0f9c1cb64106p-1, -0x1.a880p-48, -  0x1.f252b376bb963p-1, -0x1.c900p-45, -  0x1.f3ac948dd7275p-1,  0x1.a000p-53, -  0x1.f50765b6e4524p-1, -0x1.4f00p-48, -  0x1.f6632798844fdp-1,  0x1.a800p-51, -  0x1.f7bfdad9cbe38p-1,  0x1.abc0p-48, -  0x1.f91d802243c82p-1, -0x1.4600p-50, -  0x1.fa7c1819e908ep-1, -0x1.b0c0p-47, -  0x1.fbdba3692d511p-1, -0x1.0e00p-51, -  0x1.fd3c22b8f7194p-1, -0x1.0de8p-46, -  0x1.fe9d96b2a23eep-1,  0x1.e430p-49, -  0x1.0000000000000p+0,  0x0.0000p+0, -  0x1.00b1afa5abcbep+0, -0x1.3400p-52, -  0x1.0163da9fb3303p+0, -0x1.2170p-46, -  0x1.02168143b0282p+0,  0x1.a400p-52, -  0x1.02c9a3e77806cp+0,  0x1.f980p-49, -  0x1.037d42e11bbcap+0, -0x1.7400p-51, -  0x1.04315e86e7f89p+0,  0x1.8300p-50, -  0x1.04e5f72f65467p+0, -0x1.a3f0p-46, -  0x1.059b0d315855ap+0, -0x1.2840p-47, -  0x1.0650a0e3c1f95p+0,  0x1.1600p-48, -  0x1.0706b29ddf71ap+0,  0x1.5240p-46, -  0x1.07bd42b72a82dp+0, -0x1.9a00p-49, -  0x1.0874518759bd0p+0,  0x1.6400p-49, -  0x1.092bdf66607c8p+0, -0x1.0780p-47, -  0x1.09e3ecac6f383p+0, -0x1.8000p-54, -  0x1.0a9c79b1f3930p+0,  0x1.fa00p-48, -  0x1.0b5586cf988fcp+0, -0x1.ac80p-48, -  
0x1.0c0f145e46c8ap+0,  0x1.9c00p-50, -  0x1.0cc922b724816p+0,  0x1.5200p-47, -  0x1.0d83b23395dd8p+0, -0x1.ad00p-48, -  0x1.0e3ec32d3d1f3p+0,  0x1.bac0p-46, -  0x1.0efa55fdfa9a6p+0, -0x1.4e80p-47, -  0x1.0fb66affed2f0p+0, -0x1.d300p-47, -  0x1.1073028d7234bp+0,  0x1.1500p-48, -  0x1.11301d0125b5bp+0,  0x1.c000p-49, -  0x1.11edbab5e2af9p+0,  0x1.6bc0p-46, -  0x1.12abdc06c31d5p+0,  0x1.8400p-49, -  0x1.136a814f2047dp+0, -0x1.ed00p-47, -  0x1.1429aaea92de9p+0,  0x1.8e00p-49, -  0x1.14e95934f3138p+0,  0x1.b400p-49, -  0x1.15a98c8a58e71p+0,  0x1.5300p-47, -  0x1.166a45471c3dfp+0,  0x1.3380p-47, -  0x1.172b83c7d5211p+0,  0x1.8d40p-45, -  0x1.17ed48695bb9fp+0, -0x1.5d00p-47, -  0x1.18af9388c8d93p+0, -0x1.c880p-46, -  0x1.1972658375d66p+0,  0x1.1f00p-46, -  0x1.1a35beb6fcba7p+0,  0x1.0480p-46, -  0x1.1af99f81387e3p+0, -0x1.7390p-43, -  0x1.1bbe084045d54p+0,  0x1.4e40p-45, -  0x1.1c82f95281c43p+0, -0x1.a200p-47, -  0x1.1d4873168b9b2p+0,  0x1.3800p-49, -  0x1.1e0e75eb44031p+0,  0x1.ac00p-49, -  0x1.1ed5022fcd938p+0,  0x1.1900p-47, -  0x1.1f9c18438cdf7p+0, -0x1.b780p-46, -  0x1.2063b88628d8fp+0,  0x1.d940p-45, -  0x1.212be3578a81ep+0,  0x1.8000p-50, -  0x1.21f49917ddd41p+0,  0x1.b340p-45, -  0x1.22bdda2791323p+0,  0x1.9f80p-46, -  0x1.2387a6e7561e7p+0, -0x1.9c80p-46, -  0x1.2451ffb821427p+0,  0x1.2300p-47, -  0x1.251ce4fb2a602p+0, -0x1.3480p-46, -  0x1.25e85711eceb0p+0,  0x1.2700p-46, -  0x1.26b4565e27d16p+0,  0x1.1d00p-46, -  0x1.2780e341de00fp+0,  0x1.1ee0p-44, -  0x1.284dfe1f5633ep+0, -0x1.4c00p-46, -  0x1.291ba7591bb30p+0, -0x1.3d80p-46, -  0x1.29e9df51fdf09p+0,  0x1.8b00p-47, -  0x1.2ab8a66d10e9bp+0, -0x1.27c0p-45, -  0x1.2b87fd0dada3ap+0,  0x1.a340p-45, -  0x1.2c57e39771af9p+0, -0x1.0800p-46, -  0x1.2d285a6e402d9p+0, -0x1.ed00p-47, -  0x1.2df961f641579p+0, -0x1.4200p-48, -  0x1.2ecafa93e2ecfp+0, -0x1.4980p-45, -  0x1.2f9d24abd8822p+0, -0x1.6300p-46, -  0x1.306fe0a31b625p+0, -0x1.2360p-44, -  0x1.31432edeea50bp+0, -0x1.0df8p-40, -  0x1.32170fc4cd7b8p+0, -0x1.2480p-45, -  
0x1.32eb83ba8e9a2p+0, -0x1.5980p-45, -  0x1.33c08b2641766p+0,  0x1.ed00p-46, -  0x1.3496266e3fa27p+0, -0x1.c000p-50, -  0x1.356c55f929f0fp+0, -0x1.0d80p-44, -  0x1.36431a2de88b9p+0,  0x1.2c80p-45, -  0x1.371a7373aaa39p+0,  0x1.0600p-45, -  0x1.37f26231e74fep+0, -0x1.6600p-46, -  0x1.38cae6d05d838p+0, -0x1.ae00p-47, -  0x1.39a401b713ec3p+0, -0x1.4720p-43, -  0x1.3a7db34e5a020p+0,  0x1.8200p-47, -  0x1.3b57fbfec6e95p+0,  0x1.e800p-44, -  0x1.3c32dc313a8f2p+0,  0x1.f800p-49, -  0x1.3d0e544ede122p+0, -0x1.7a00p-46, -  0x1.3dea64c1234bbp+0,  0x1.6300p-45, -  0x1.3ec70df1c4eccp+0, -0x1.8a60p-43, -  0x1.3fa4504ac7e8cp+0, -0x1.cdc0p-44, -  0x1.40822c367a0bbp+0,  0x1.5b80p-45, -  0x1.4160a21f72e95p+0,  0x1.ec00p-46, -  0x1.423fb27094646p+0, -0x1.3600p-46, -  0x1.431f5d950a920p+0,  0x1.3980p-45, -  0x1.43ffa3f84b9ebp+0,  0x1.a000p-48, -  0x1.44e0860618919p+0, -0x1.6c00p-48, -  0x1.45c2042a7d201p+0, -0x1.bc00p-47, -  0x1.46a41ed1d0016p+0, -0x1.2800p-46, -  0x1.4786d668b3326p+0,  0x1.0e00p-44, -  0x1.486a2b5c13c00p+0, -0x1.d400p-45, -  0x1.494e1e192af04p+0,  0x1.c200p-47, -  0x1.4a32af0d7d372p+0, -0x1.e500p-46, -  0x1.4b17dea6db801p+0,  0x1.7800p-47, -  0x1.4bfdad53629e1p+0, -0x1.3800p-46, -  0x1.4ce41b817c132p+0,  0x1.0800p-47, -  0x1.4dcb299fddddbp+0,  0x1.c700p-45, -  0x1.4eb2d81d8ab96p+0, -0x1.ce00p-46, -  0x1.4f9b2769d2d02p+0,  0x1.9200p-46, -  0x1.508417f4531c1p+0, -0x1.8c00p-47, -  0x1.516daa2cf662ap+0, -0x1.a000p-48, -  0x1.5257de83f51eap+0,  0x1.a080p-43, -  0x1.5342b569d4edap+0, -0x1.6d80p-45, -  0x1.542e2f4f6ac1ap+0, -0x1.2440p-44, -  0x1.551a4ca5d94dbp+0,  0x1.83c0p-43, -  0x1.56070dde9116bp+0,  0x1.4b00p-45, -  0x1.56f4736b529dep+0,  0x1.15a0p-43, -  0x1.57e27dbe2c40ep+0, -0x1.9e00p-45, -  0x1.58d12d497c76fp+0, -0x1.3080p-45, -  0x1.59c0827ff0b4cp+0,  0x1.dec0p-43, -  0x1.5ab07dd485427p+0, -0x1.4000p-51, -  0x1.5ba11fba87af4p+0,  0x1.0080p-44, -  0x1.5c9268a59460bp+0, -0x1.6c80p-45, -  0x1.5d84590998e3fp+0,  0x1.69a0p-43, -  0x1.5e76f15ad20e1p+0, -0x1.b400p-46, -  
0x1.5f6a320dcebcap+0,  0x1.7700p-46, -  0x1.605e1b976dcb8p+0,  0x1.6f80p-45, -  0x1.6152ae6cdf715p+0,  0x1.1000p-47, -  0x1.6247eb03a5531p+0, -0x1.5d00p-46, -  0x1.633dd1d1929b5p+0, -0x1.2d00p-46, -  0x1.6434634ccc313p+0, -0x1.a800p-49, -  0x1.652b9febc8efap+0, -0x1.8600p-45, -  0x1.6623882553397p+0,  0x1.1fe0p-40, -  0x1.671c1c708328ep+0, -0x1.7200p-44, -  0x1.68155d44ca97ep+0,  0x1.6800p-49, -  0x1.690f4b19e9471p+0, -0x1.9780p-45, -}; +	if ((ki & 0x80000000) == 0) { +		/* k > 0, the exponent of scale might have overflowed by 1.  */ +		sbits -= 1ull << 52; +		scale = asdouble(sbits); +		y = 2 * (scale + scale * tmp); +		return eval_as_double(y); +	} +	/* k < 0, need special care in the subnormal range.  */ +	sbits += 1022ull << 52; +	scale = asdouble(sbits); +	y = scale + scale * tmp; +	if (y < 1.0) { +		/* Round y to the right precision before scaling it into the subnormal +		   range to avoid double rounding that can cause 0.5+E/2 ulp error where +		   E is the worst-case ulp error outside the subnormal range.  So this +		   is only useful if the goal is better than 1 ulp worst-case error.  */ +		double_t hi, lo; +		lo = scale - y + scale * tmp; +		hi = 1.0 + y; +		lo = 1.0 - hi + y + lo; +		y = eval_as_double(hi + lo) - 1.0; +		/* Avoid -0.0 with downward rounding.  */ +		if (WANT_ROUNDING && y == 0.0) +			y = 0.0; +		/* The underflow exception needs to be signaled explicitly.  */ +		fp_force_eval(fp_barrier(0x1p-1022) * 0x1p-1022); +	} +	y = 0x1p-1022 * y; +	return eval_as_double(y); +} + +/* Top 12 bits of a double (sign and exponent bits).  */ +static inline uint32_t top12(double x) +{ +	return asuint64(x) >> 52; +} -/* - * exp2(x): compute the base 2 exponential of x - * - * Accuracy: Peak error < 0.503 ulp for normalized results. - * - * Method: (accurate tables) - * - *   Reduce x: - *     x = k + y, for integer k and |y| <= 1/2. - *     Thus we have exp2(x) = 2**k * exp2(y). 
- * - *   Reduce y: - *     y = i/TBLSIZE + z - eps[i] for integer i near y * TBLSIZE. - *     Thus we have exp2(y) = exp2(i/TBLSIZE) * exp2(z - eps[i]), - *     with |z - eps[i]| <= 2**-9 + 2**-39 for the table used. - * - *   We compute exp2(i/TBLSIZE) via table lookup and exp2(z - eps[i]) via - *   a degree-5 minimax polynomial with maximum error under 1.3 * 2**-61. - *   The values in exp2t[] and eps[] are chosen such that - *   exp2t[i] = exp2(i/TBLSIZE + eps[i]), and eps[i] is a small offset such - *   that exp2t[i] is accurate to 2**-64. - * - *   Note that the range of i is +-TBLSIZE/2, so we actually index the tables - *   by i0 = i + TBLSIZE/2.  For cache efficiency, exp2t[] and eps[] are - *   virtual tables, interleaved in the real table tbl[]. - * - *   This method is due to Gal, with many details due to Gal and Bachelis: - * - *      Gal, S. and Bachelis, B.  An Accurate Elementary Mathematical Library - *      for the IEEE Floating Point Standard.  TOMS 17(1), 26-46 (1991). - */  double exp2(double x)  { -	double_t r, t, z; -	uint32_t ix, i0; -	union {double f; uint64_t i;} u = {x}; -	union {uint32_t u; int32_t i;} k; +	uint32_t abstop; +	uint64_t ki, idx, top, sbits; +	double_t kd, r, r2, scale, tail, tmp; -	/* Filter out exceptional cases. */ -	ix = u.i>>32 & 0x7fffffff; -	if (ix >= 0x408ff000) {  /* |x| >= 1022 or nan */ -		if (ix >= 0x40900000 && u.i>>63 == 0) {  /* x >= 1024 or nan */ -			/* overflow */ -			x *= 0x1p1023; -			return x; -		} -		if (ix >= 0x7ff00000)  /* -inf or -nan */ -			return -1/x; -		if (u.i>>63) {  /* x <= -1022 */ -			/* underflow */ -			if (x <= -1075 || x - 0x1p52 + 0x1p52 != x) -				FORCE_EVAL((float)(-0x1p-149/x)); -			if (x <= -1075) -				return 0; +	abstop = top12(x) & 0x7ff; +	if (predict_false(abstop - top12(0x1p-54) >= top12(512.0) - top12(0x1p-54))) { +		if (abstop - top12(0x1p-54) >= 0x80000000) +			/* Avoid spurious underflow for tiny x.  */ +			/* Note: 0 is common input.  */ +			return WANT_ROUNDING ? 
1.0 + x : 1.0; +		if (abstop >= top12(1024.0)) { +			if (asuint64(x) == asuint64(-INFINITY)) +				return 0.0; +			if (abstop >= top12(INFINITY)) +				return 1.0 + x; +			if (!(asuint64(x) >> 63)) +				return __math_oflow(0); +			else if (asuint64(x) >= asuint64(-1075.0)) +				return __math_uflow(0);  		} -	} else if (ix < 0x3c900000) {  /* |x| < 0x1p-54 */ -		return 1.0 + x; +		if (2 * asuint64(x) > 2 * asuint64(928.0)) +			/* Large x is special cased below.  */ +			abstop = 0;  	} -	/* Reduce x, computing z, i0, and k. */ -	u.f = x + redux; -	i0 = u.i; -	i0 += TBLSIZE / 2; -	k.u = i0 / TBLSIZE * TBLSIZE; -	k.i /= TBLSIZE; -	i0 %= TBLSIZE; -	u.f -= redux; -	z = x - u.f; - -	/* Compute r = exp2(y) = exp2t[i0] * p(z - eps[i]). */ -	t = tbl[2*i0];       /* exp2t[i0] */ -	z -= tbl[2*i0 + 1];  /* eps[i0]   */ -	r = t + t * z * (P1 + z * (P2 + z * (P3 + z * (P4 + z * P5)))); - -	return scalbn(r, k.i); +	/* exp2(x) = 2^(k/N) * 2^r, with 2^r in [2^(-1/2N),2^(1/2N)].  */ +	/* x = k/N + r, with int k and r in [-1/2N, 1/2N].  */ +	kd = eval_as_double(x + Shift); +	ki = asuint64(kd); /* k.  */ +	kd -= Shift; /* k/N for int k.  */ +	r = x - kd; +	/* 2^(k/N) ~= scale * (1 + tail).  */ +	idx = 2 * (ki % N); +	top = ki << (52 - EXP_TABLE_BITS); +	tail = asdouble(T[idx]); +	/* This is only a valid scale when -1023*N < k < 1024*N.  */ +	sbits = T[idx + 1] + top; +	/* exp2(x) = 2^(k/N) * 2^r ~= scale + scale * (tail + 2^r - 1).  */ +	/* Evaluation is optimized assuming superscalar pipelined execution.  */ +	r2 = r * r; +	/* Without fma the worst case error is 0.5/N ulp larger.  */ +	/* Worst case error is less than 0.5+0.86/N+(abs poly error * 2^53) ulp.  */ +	tmp = tail + r * C1 + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); +	if (predict_false(abstop == 0)) +		return specialcase(tmp, sbits, ki); +	scale = asdouble(sbits); +	/* Note: tmp == 0 or |tmp| > 2^-65 and scale > 2^-928, so there +	   is no spurious underflow here even without fma.  
*/ +	return eval_as_double(scale + scale * tmp);  } diff --git a/src/math/exp2f.c b/src/math/exp2f.c index 296b6343..0360482c 100644 --- a/src/math/exp2f.c +++ b/src/math/exp2f.c @@ -1,126 +1,69 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/s_exp2f.c */ -/*- - * Copyright (c) 2005 David Schultz <das@FreeBSD.ORG> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - *    notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - *    notice, this list of conditions and the following disclaimer in the - *    documentation and/or other materials provided with the distribution. +/* + * Single-precision 2^x function.   * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * Copyright (c) 2017-2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT   */ +#include <math.h> +#include <stdint.h>  #include "libm.h" +#include "exp2f_data.h" -#define TBLSIZE 16 +/* +EXP2F_TABLE_BITS = 5 +EXP2F_POLY_ORDER = 3 -static const float -redux = 0x1.8p23f / TBLSIZE, -P1    = 0x1.62e430p-1f, -P2    = 0x1.ebfbe0p-3f, -P3    = 0x1.c6b348p-5f, -P4    = 0x1.3b2c9cp-7f; +ULP error: 0.502 (nearest rounding.) +Relative error: 1.69 * 2^-34 in [-1/64, 1/64] (before rounding.) +Wrong count: 168353 (all nearest rounding wrong results with fma.) +Non-nearest ULP error: 1 (rounded ULP error) +*/ -static const double exp2ft[TBLSIZE] = { -  0x1.6a09e667f3bcdp-1, -  0x1.7a11473eb0187p-1, -  0x1.8ace5422aa0dbp-1, -  0x1.9c49182a3f090p-1, -  0x1.ae89f995ad3adp-1, -  0x1.c199bdd85529cp-1, -  0x1.d5818dcfba487p-1, -  0x1.ea4afa2a490dap-1, -  0x1.0000000000000p+0, -  0x1.0b5586cf9890fp+0, -  0x1.172b83c7d517bp+0, -  0x1.2387a6e756238p+0, -  0x1.306fe0a31b715p+0, -  0x1.3dea64c123422p+0, -  0x1.4bfdad5362a27p+0, -  0x1.5ab07dd485429p+0, -}; +#define N (1 << EXP2F_TABLE_BITS) +#define T __exp2f_data.tab +#define C __exp2f_data.poly +#define SHIFT __exp2f_data.shift_scaled + +static inline uint32_t top12(float x) +{ +	return asuint(x) >> 20; +} -/* - * exp2f(x): compute the base 2 exponential of x - * - * Accuracy: Peak error < 0.501 ulp; location of peak: -0.030110927. - * - * Method: (equally-spaced tables) - * - *   Reduce x: - *     x = k + y, for integer k and |y| <= 1/2. - *     Thus we have exp2f(x) = 2**k * exp2(y). - * - *   Reduce y: - *     y = i/TBLSIZE + z for integer i near y * TBLSIZE. - *     Thus we have exp2(y) = exp2(i/TBLSIZE) * exp2(z), - *     with |z| <= 2**-(TBLSIZE+1). - * - *   We compute exp2(i/TBLSIZE) via table lookup and exp2(z) via a - *   degree-4 minimax polynomial with maximum error under 1.4 * 2**-33. - *   Using double precision for everything except the reduction makes - *   roundoff error insignificant and simplifies the scaling step. 
- * - *   This method is due to Tang, but I do not use his suggested parameters: - * - *      Tang, P.  Table-driven Implementation of the Exponential Function - *      in IEEE Floating-Point Arithmetic.  TOMS 15(2), 144-157 (1989). - */  float exp2f(float x)  { -	double_t t, r, z; -	union {float f; uint32_t i;} u = {x}; -	union {double f; uint64_t i;} uk; -	uint32_t ix, i0, k; +	uint32_t abstop; +	uint64_t ki, t; +	double_t kd, xd, z, r, r2, y, s; -	/* Filter out exceptional cases. */ -	ix = u.i & 0x7fffffff; -	if (ix > 0x42fc0000) {  /* |x| > 126 */ -		if (ix > 0x7f800000) /* NaN */ -			return x; -		if (u.i >= 0x43000000 && u.i < 0x80000000) {  /* x >= 128 */ -			x *= 0x1p127f; -			return x; -		} -		if (u.i >= 0x80000000) {  /* x < -126 */ -			if (u.i >= 0xc3160000 || (u.i & 0x0000ffff)) -				FORCE_EVAL(-0x1p-149f/x); -			if (u.i >= 0xc3160000)  /* x <= -150 */ -				return 0; -		} -	} else if (ix <= 0x33000000) {  /* |x| <= 0x1p-25 */ -		return 1.0f + x; +	xd = (double_t)x; +	abstop = top12(x) & 0x7ff; +	if (predict_false(abstop >= top12(128.0f))) { +		/* |x| >= 128 or x is nan.  */ +		if (asuint(x) == asuint(-INFINITY)) +			return 0.0f; +		if (abstop >= top12(INFINITY)) +			return x + x; +		if (x > 0.0f) +			return __math_oflowf(0); +		if (x <= -150.0f) +			return __math_uflowf(0);  	} -	/* Reduce x, computing z, i0, and k. */ -	u.f = x + redux; -	i0 = u.i; -	i0 += TBLSIZE / 2; -	k = i0 / TBLSIZE; -	uk.i = (uint64_t)(0x3ff + k)<<52; -	i0 &= TBLSIZE - 1; -	u.f -= redux; -	z = x - u.f; -	/* Compute r = exp2(y) = exp2ft[i0] * p(z). */ -	r = exp2ft[i0]; -	t = r * z; -	r = r + t * (P1 + z * P2) + t * (z * z) * (P3 + z * P4); +	/* x = k/N + r with r in [-1/(2N), 1/(2N)] and int k.  */ +	kd = eval_as_double(xd + SHIFT); +	ki = asuint64(kd); +	kd -= SHIFT; /* k/N for int k.  
*/ +	r = xd - kd; -	/* Scale by 2**k */ -	return r * uk.f; +	/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ +	t = T[ki % N]; +	t += ki << (52 - EXP2F_TABLE_BITS); +	s = asdouble(t); +	z = C[0] * r + C[1]; +	r2 = r * r; +	y = C[2] * r + 1; +	y = z * r2 + y; +	y = y * s; +	return eval_as_float(y);  } diff --git a/src/math/exp2f_data.c b/src/math/exp2f_data.c new file mode 100644 index 00000000..be324727 --- /dev/null +++ b/src/math/exp2f_data.c @@ -0,0 +1,35 @@ +/* + * Shared data between expf, exp2f and powf. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "exp2f_data.h" + +#define N (1 << EXP2F_TABLE_BITS) + +const struct exp2f_data __exp2f_data = { +  /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) +     used for computing 2^(k/N) for an int |k| < 150 N as +     double(tab[k%N] + (k << 52-BITS)) */ +  .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, +  }, +  .shift_scaled = 0x1.8p+52 / N, +  .poly = { +  0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1, +  }, +  .shift = 0x1.8p+52, +  .invln2_scaled = 0x1.71547652b82fep+0 * N, +  .poly_scaled = { +  0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, +  }, +}; diff --git a/src/math/exp2f_data.h b/src/math/exp2f_data.h new file mode 100644 index 00000000..fe744f15 --- /dev/null +++ b/src/math/exp2f_data.h @@ -0,0 
+1,23 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _EXP2F_DATA_H +#define _EXP2F_DATA_H + +#include <features.h> +#include <stdint.h> + +/* Shared between expf, exp2f and powf.  */ +#define EXP2F_TABLE_BITS 5 +#define EXP2F_POLY_ORDER 3 +extern hidden const struct exp2f_data { +	uint64_t tab[1 << EXP2F_TABLE_BITS]; +	double shift_scaled; +	double poly[EXP2F_POLY_ORDER]; +	double shift; +	double invln2_scaled; +	double poly_scaled[EXP2F_POLY_ORDER]; +} __exp2f_data; + +#endif diff --git a/src/math/exp_data.c b/src/math/exp_data.c new file mode 100644 index 00000000..21be0146 --- /dev/null +++ b/src/math/exp_data.c @@ -0,0 +1,182 @@ +/* + * Shared data between exp, exp2 and pow. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "exp_data.h" + +#define N (1 << EXP_TABLE_BITS) + +const struct exp_data __exp_data = { +// N/ln2 +.invln2N = 0x1.71547652b82fep0 * N, +// -ln2/N +.negln2hiN = -0x1.62e42fefa0000p-8, +.negln2loN = -0x1.cf79abc9e3b3ap-47, +// Used for rounding when !TOINT_INTRINSICS +#if EXP_USE_TOINT_NARROW +.shift = 0x1800000000.8p0, +#else +.shift = 0x1.8p52, +#endif +// exp polynomial coefficients. +.poly = { +// abs error: 1.555*2^-66 +// ulp error: 0.509 (0.511 without fma) +// if |x| < ln2/256+eps +// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 +// abs error if |x| < ln2/128: 1.7145*2^-56 +0x1.ffffffffffdbdp-2, +0x1.555555555543cp-3, +0x1.55555cf172b91p-5, +0x1.1111167a4d017p-7, +}, +.exp2_shift = 0x1.8p52 / N, +// exp2 polynomial coefficients. 
+.exp2_poly = { +// abs error: 1.2195*2^-65 +// ulp error: 0.507 (0.511 without fma) +// if |x| < 1/256 +// abs error if |x| < 1/128: 1.9941*2^-56 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c424p-3, +0x1.c6b08d70cf4b5p-5, +0x1.3b2abd24650ccp-7, +0x1.5d7e09b4e3a84p-10, +}, +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) +// tab[2*k] = asuint64(T[k]) +// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N +.tab = { +0x0, 0x3ff0000000000000, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, 
+0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0xbc845378892be9ae, 0x3feea34634ccc320, +0xbc93cedd78565858, 0x3feea23882552225, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0xbc85ca6cd7668e4b, 
0x3feec0f170ca07ba, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0xbc82919e2040220f, 0x3fef60e316c98398, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0xbc8dae98e223747d, 0x3fefaf482d8e67f1, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0x3c842b94c3a9eb32, 0x3fefc52b376bba97, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +0x3c5305c14160cc89, 0x3feff3c22b8f71f1, +}, +}; diff --git a/src/math/exp_data.h b/src/math/exp_data.h new file mode 100644 index 00000000..3e24bac5 --- /dev/null +++ b/src/math/exp_data.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#ifndef _EXP_DATA_H +#define _EXP_DATA_H + +#include <features.h> +#include <stdint.h> + +#define EXP_TABLE_BITS 7 +#define EXP_POLY_ORDER 5 +#define EXP_USE_TOINT_NARROW 0 +#define EXP2_POLY_ORDER 5 +extern hidden const struct exp_data { +	double invln2N; +	double shift; +	double negln2hiN; +	double negln2loN; +	double poly[4]; /* Last four coefficients.  */ +	double exp2_shift; +	double exp2_poly[EXP2_POLY_ORDER]; +	uint64_t tab[2*(1 << EXP_TABLE_BITS)]; +} __exp_data; + +#endif diff --git a/src/math/expf.c b/src/math/expf.c index feee2b0e..f9fbf8e7 100644 --- a/src/math/expf.c +++ b/src/math/expf.c @@ -1,83 +1,80 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_expf.c */  /* - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. - */ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * Single-precision e^x function.   * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT   */ +#include <math.h> +#include <stdint.h>  #include "libm.h" +#include "exp2f_data.h" -static const float -half[2] = {0.5,-0.5}, -ln2hi   = 6.9314575195e-1f,  /* 0x3f317200 */ -ln2lo   = 1.4286067653e-6f,  /* 0x35bfbe8e */ -invln2  = 1.4426950216e+0f,  /* 0x3fb8aa3b */  /* - * Domain [-0.34568, 0.34568], range ~[-4.278e-9, 4.447e-9]: - * |x*(exp(x)+1)/(exp(x)-1) - p(x)| < 2**-27.74 - */ -P1 =  1.6666625440e-1f, /*  0xaaaa8f.0p-26 */ -P2 = -2.7667332906e-3f; /* -0xb55215.0p-32 */ +EXP2F_TABLE_BITS = 5 +EXP2F_POLY_ORDER = 3 -float expf(float x) +ULP error: 0.502 (nearest rounding.) +Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) 
+Wrong count: 170635 (all nearest rounding wrong results with fma.) +Non-nearest ULP error: 1 (rounded ULP error) +*/ + +#define N (1 << EXP2F_TABLE_BITS) +#define InvLn2N __exp2f_data.invln2_scaled +#define T __exp2f_data.tab +#define C __exp2f_data.poly_scaled + +static inline uint32_t top12(float x)  { -	float_t hi, lo, c, xx, y; -	int k, sign; -	uint32_t hx; +	return asuint(x) >> 20; +} -	GET_FLOAT_WORD(hx, x); -	sign = hx >> 31;   /* sign bit of x */ -	hx &= 0x7fffffff;  /* high word of |x| */ +float expf(float x) +{ +	uint32_t abstop; +	uint64_t ki, t; +	double_t kd, xd, z, r, r2, y, s; -	/* special cases */ -	if (hx >= 0x42aeac50) {  /* if |x| >= -87.33655f or NaN */ -		if (hx > 0x7f800000) /* NaN */ -			return x; -		if (hx >= 0x42b17218 && !sign) {  /* x >= 88.722839f */ -			/* overflow */ -			x *= 0x1p127f; -			return x; -		} -		if (sign) { -			/* underflow */ -			FORCE_EVAL(-0x1p-149f/x); -			if (hx >= 0x42cff1b5)  /* x <= -103.972084f */ -				return 0; -		} +	xd = (double_t)x; +	abstop = top12(x) & 0x7ff; +	if (predict_false(abstop >= top12(88.0f))) { +		/* |x| >= 88 or x is nan.  */ +		if (asuint(x) == asuint(-INFINITY)) +			return 0.0f; +		if (abstop >= top12(INFINITY)) +			return x + x; +		if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ +			return __math_oflowf(0); +		if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ +			return __math_uflowf(0);  	} -	/* argument reduction */ -	if (hx > 0x3eb17218) {  /* if |x| > 0.5 ln2 */ -		if (hx > 0x3f851592)  /* if |x| > 1.5 ln2 */ -			k = invln2*x + half[sign]; -		else -			k = 1 - sign - sign; -		hi = x - k*ln2hi;  /* k*ln2hi is exact here */ -		lo = k*ln2lo; -		x = hi - lo; -	} else if (hx > 0x39000000) {  /* |x| > 2**-14 */ -		k = 0; -		hi = x; -		lo = 0; -	} else { -		/* raise inexact */ -		FORCE_EVAL(0x1p127f + x); -		return 1 + x; -	} +	/* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k.  
*/ +	z = InvLn2N * xd; + +	/* Round and convert z to int, the result is in [-150*N, 128*N] and +	   ideally ties-to-even rule is used, otherwise the magnitude of r +	   can be bigger which gives larger approximation error.  */ +#if TOINT_INTRINSICS +	kd = roundtoint(z); +	ki = converttoint(z); +#else +# define SHIFT __exp2f_data.shift +	kd = eval_as_double(z + SHIFT); +	ki = asuint64(kd); +	kd -= SHIFT; +#endif +	r = z - kd; -	/* x is now in primary range */ -	xx = x*x; -	c = x - xx*(P1+xx*P2); -	y = 1 + (x*c/(2-c) - lo + hi); -	if (k == 0) -		return y; -	return scalbnf(y, k); +	/* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ +	t = T[ki % N]; +	t += ki << (52 - EXP2F_TABLE_BITS); +	s = asdouble(t); +	z = C[0] * r + C[1]; +	r2 = r * r; +	y = C[2] * r + 1; +	y = z * r2 + y; +	y = y * s; +	return eval_as_float(y);  } diff --git a/src/math/expm1f.c b/src/math/expm1f.c index 297e0b44..09a41afe 100644 --- a/src/math/expm1f.c +++ b/src/math/expm1f.c @@ -16,7 +16,6 @@  #include "libm.h"  static const float -o_threshold = 8.8721679688e+01, /* 0x42b17180 */  ln2_hi      = 6.9313812256e-01, /* 0x3f317180 */  ln2_lo      = 9.0580006145e-06, /* 0x3717f7d1 */  invln2      = 1.4426950216e+00, /* 0x3fb8aa3b */ @@ -41,7 +40,7 @@ float expm1f(float x)  			return x;  		if (sign)  			return -1; -		if (x > o_threshold) { +		if (hx > 0x42b17217) { /* x > log(FLT_MAX) */  			x *= 0x1p127f;  			return x;  		} diff --git a/src/math/fma.c b/src/math/fma.c index 0c6f90c9..adfadca8 100644 --- a/src/math/fma.c +++ b/src/math/fma.c @@ -53,7 +53,7 @@ double fma(double x, double y, double z)  		return x*y + z;  	if (nz.e >= ZEROINFNAN) {  		if (nz.e > ZEROINFNAN) /* z==0 */ -			return x*y + z; +			return x*y;  		return z;  	} diff --git a/src/math/fmaf.c b/src/math/fmaf.c index 80f5cd8a..7c65acf1 100644 --- a/src/math/fmaf.c +++ b/src/math/fmaf.c @@ -77,17 +77,16 @@ float fmaf(float x, float y, float z)  	 * If result is inexact, and exactly halfway between two float values,  
	 * we need to adjust the low-order bit in the direction of the error.  	 */ -#ifdef FE_TOWARDZERO -	fesetround(FE_TOWARDZERO); -#endif -	volatile double vxy = xy;  /* XXX work around gcc CSE bug */ -	double adjusted_result = vxy + z; -	fesetround(FE_TONEAREST); -	if (result == adjusted_result) { -		u.f = adjusted_result; +	double err; +	int neg = u.i >> 63; +	if (neg == (z > xy)) +		err = xy - result + z; +	else +		err = z - result + xy; +	if (neg == (err < 0))  		u.i++; -		adjusted_result = u.f; -	} -	z = adjusted_result; +	else +		u.i--; +	z = u.f;  	return z;  } diff --git a/src/math/i386/acos.s b/src/math/i386/acos.s index 47f365ef..af423a2f 100644 --- a/src/math/i386/acos.s +++ b/src/math/i386/acos.s @@ -1,22 +1,10 @@  # use acos(x) = atan2(fabs(sqrt((1-x)*(1+x))), x) -.global acosf -.type acosf,@function -acosf: -	flds 4(%esp) -	jmp 1f - -.global acosl -.type acosl,@function -acosl: -	fldt 4(%esp) -	jmp 1f -  .global acos  .type acos,@function  acos:  	fldl 4(%esp) -1:	fld %st(0) +	fld %st(0)  	fld1  	fsub %st(0),%st(1)  	fadd %st(2) @@ -25,4 +13,6 @@ acos:  	fabs         # fix sign of zero (matters in downward rounding mode)  	fxch %st(1)  	fpatan +	fstpl 4(%esp) +	fldl 4(%esp)  	ret diff --git a/src/math/i386/acosf.s b/src/math/i386/acosf.s index 6c95509f..d2cdfdbf 100644 --- a/src/math/i386/acosf.s +++ b/src/math/i386/acosf.s @@ -1 +1,16 @@ -# see acos.s +.global acosf +.type acosf,@function +acosf: +	flds 4(%esp) +	fld %st(0) +	fld1 +	fsub %st(0),%st(1) +	fadd %st(2) +	fmulp +	fsqrt +	fabs         # fix sign of zero (matters in downward rounding mode) +	fxch %st(1) +	fpatan +	fstps 4(%esp) +	flds 4(%esp) +	ret diff --git a/src/math/i386/acosl.s b/src/math/i386/acosl.s index 6c95509f..599c8230 100644 --- a/src/math/i386/acosl.s +++ b/src/math/i386/acosl.s @@ -1 +1,14 @@ -# see acos.s +.global acosl +.type acosl,@function +acosl: +	fldt 4(%esp) +	fld %st(0) +	fld1 +	fsub %st(0),%st(1) +	fadd %st(2) +	fmulp +	fsqrt +	fabs         # fix sign of zero (matters 
in downward rounding mode) +	fxch %st(1) +	fpatan +	ret diff --git a/src/math/i386/asin.s b/src/math/i386/asin.s index a9f691bf..2bc8356f 100644 --- a/src/math/i386/asin.s +++ b/src/math/i386/asin.s @@ -1,26 +1,3 @@ -.global asinf -.type asinf,@function -asinf: -	flds 4(%esp) -	mov 4(%esp),%eax -	add %eax,%eax -	cmp $0x01000000,%eax -	jae 1f -		# subnormal x, return x with underflow -	fnstsw %ax -	and $16,%ax -	jnz 2f -	fld %st(0) -	fmul %st(1) -	fstps 4(%esp) -2:	ret - -.global asinl -.type asinl,@function -asinl: -	fldt 4(%esp) -	jmp 1f -  .global asin  .type asin,@function  asin: @@ -28,18 +5,17 @@ asin:  	mov 8(%esp),%eax  	add %eax,%eax  	cmp $0x00200000,%eax -	jae 1f -		# subnormal x, return x with underflow -	fnstsw %ax -	and $16,%ax -	jnz 2f -	fsts 4(%esp) -2:	ret -1:	fld %st(0) +	jb 1f +	fld %st(0)  	fld1  	fsub %st(0),%st(1)  	fadd %st(2)  	fmulp  	fsqrt  	fpatan +	fstpl 4(%esp) +	fldl 4(%esp) +	ret +		# subnormal x, return x with underflow +1:	fsts 4(%esp)  	ret diff --git a/src/math/i386/asinf.s b/src/math/i386/asinf.s index e07bf599..05909753 100644 --- a/src/math/i386/asinf.s +++ b/src/math/i386/asinf.s @@ -1 +1,23 @@ -# see asin.s +.global asinf +.type asinf,@function +asinf: +	flds 4(%esp) +	mov 4(%esp),%eax +	add %eax,%eax +	cmp $0x01000000,%eax +	jb 1f +	fld %st(0) +	fld1 +	fsub %st(0),%st(1) +	fadd %st(2) +	fmulp +	fsqrt +	fpatan +	fstps 4(%esp) +	flds 4(%esp) +	ret +		# subnormal x, return x with underflow +1:	fld %st(0) +	fmul %st(1) +	fstps 4(%esp) +	ret diff --git a/src/math/i386/asinl.s b/src/math/i386/asinl.s index e07bf599..e973fc85 100644 --- a/src/math/i386/asinl.s +++ b/src/math/i386/asinl.s @@ -1 +1,12 @@ -# see asin.s +.global asinl +.type asinl,@function +asinl: +	fldt 4(%esp) +	fld %st(0) +	fld1 +	fsub %st(0),%st(1) +	fadd %st(2) +	fmulp +	fsqrt +	fpatan +	ret diff --git a/src/math/i386/atan.s b/src/math/i386/atan.s index d73137b2..2c57f6b3 100644 --- a/src/math/i386/atan.s +++ b/src/math/i386/atan.s @@ -8,10 +8,9 @@ atan:  	jb 1f  	
fld1  	fpatan +	fstpl 4(%esp) +	fldl 4(%esp)  	ret  		# subnormal x, return x with underflow -1:	fnstsw %ax -	and $16,%ax -	jnz 2f -	fsts 4(%esp) -2:	ret +1:	fsts 4(%esp) +	ret diff --git a/src/math/i386/atan2.s b/src/math/i386/atan2.s index a7d2979b..8bc441b1 100644 --- a/src/math/i386/atan2.s +++ b/src/math/i386/atan2.s @@ -4,14 +4,12 @@ atan2:  	fldl 4(%esp)  	fldl 12(%esp)  	fpatan -	fstl 4(%esp) +	fstpl 4(%esp) +	fldl 4(%esp)  	mov 8(%esp),%eax  	add %eax,%eax  	cmp $0x00200000,%eax  	jae 1f  		# subnormal x, return x with underflow -	fnstsw %ax -	and $16,%ax -	jnz 1f  	fsts 4(%esp)  1:	ret diff --git a/src/math/i386/atan2f.s b/src/math/i386/atan2f.s index 14b88ce5..3908c86d 100644 --- a/src/math/i386/atan2f.s +++ b/src/math/i386/atan2f.s @@ -4,15 +4,13 @@ atan2f:  	flds 4(%esp)  	flds 8(%esp)  	fpatan -	fsts 4(%esp) +	fstps 4(%esp) +	flds 4(%esp)  	mov 4(%esp),%eax  	add %eax,%eax  	cmp $0x01000000,%eax  	jae 1f  		# subnormal x, return x with underflow -	fnstsw %ax -	and $16,%ax -	jnz 1f  	fld %st(0)  	fmul %st(1)  	fstps 4(%esp) diff --git a/src/math/i386/atanf.s b/src/math/i386/atanf.s index 8caddefa..c2cbe2e0 100644 --- a/src/math/i386/atanf.s +++ b/src/math/i386/atanf.s @@ -8,12 +8,11 @@ atanf:  	jb 1f  	fld1  	fpatan +	fstps 4(%esp) +	flds 4(%esp)  	ret  		# subnormal x, return x with underflow -1:	fnstsw %ax -	and $16,%ax -	jnz 2f -	fld %st(0) +1:	fld %st(0)  	fmul %st(1)  	fstps 4(%esp) -2:	ret +	ret diff --git a/src/math/i386/exp2.s b/src/math/i386/exp2.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/exp2.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/exp2f.s b/src/math/i386/exp2f.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/exp2f.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/exp2l.s b/src/math/i386/exp2l.s index f335a3e5..8125761d 100644 --- a/src/math/i386/exp2l.s +++ b/src/math/i386/exp2l.s @@ -1 +1 @@ -# see exp.s +# see exp_ld.s diff --git 
a/src/math/i386/exp.s b/src/math/i386/exp_ld.s index c7aa5b6e..99cba01f 100644 --- a/src/math/i386/exp.s +++ b/src/math/i386/exp_ld.s @@ -1,41 +1,8 @@ -.global expm1f -.type expm1f,@function -expm1f: -	flds 4(%esp) -	mov 4(%esp),%eax -	add %eax,%eax -	cmp $0x01000000,%eax -	jae 1f -		# subnormal x, return x with underflow -	fnstsw %ax -	and $16,%ax -	jnz 2f -	fld %st(0) -	fmul %st(1) -	fstps 4(%esp) -2:	ret -  .global expm1l  .type expm1l,@function  expm1l:  	fldt 4(%esp) -	jmp 1f - -.global expm1 -.type expm1,@function -expm1: -	fldl 4(%esp) -	mov 8(%esp),%eax -	add %eax,%eax -	cmp $0x00200000,%eax -	jae 1f -		# subnormal x, return x with underflow -	fnstsw %ax -	and $16,%ax -	jnz 2f -	fsts 4(%esp) -2:	ret -1:	fldl2e +	fldl2e  	fmulp  	mov $0xc2820000,%eax  	push %eax @@ -65,12 +32,6 @@ expm1:  	fsubrp  	ret -.global exp2f -.type exp2f,@function -exp2f: -	flds 4(%esp) -	jmp 1f -  .global exp2l  .global __exp2l  .hidden __exp2l @@ -78,26 +39,6 @@ exp2f:  exp2l:  __exp2l:  	fldt 4(%esp) -	jmp 1f - -.global expf -.type expf,@function -expf: -	flds 4(%esp) -	jmp 2f - -.global exp -.type exp,@function -exp: -	fldl 4(%esp) -2:	fldl2e -	fmulp -	jmp 1f - -.global exp2 -.type exp2,@function -exp2: -	fldl 4(%esp)  1:	sub $12,%esp  	fld %st(0)  	fstpt (%esp) diff --git a/src/math/i386/expf.s b/src/math/i386/expf.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/expf.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/expm1.s b/src/math/i386/expm1.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/expm1.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/expm1f.s b/src/math/i386/expm1f.s deleted file mode 100644 index f335a3e5..00000000 --- a/src/math/i386/expm1f.s +++ /dev/null @@ -1 +0,0 @@ -# see exp.s diff --git a/src/math/i386/expm1l.s b/src/math/i386/expm1l.s index f335a3e5..8125761d 100644 --- a/src/math/i386/expm1l.s +++ b/src/math/i386/expm1l.s @@ -1 +1 @@ -# see exp.s +# see exp_ld.s diff 
--git a/src/math/i386/fabs.c b/src/math/i386/fabs.c new file mode 100644 index 00000000..39672786 --- /dev/null +++ b/src/math/i386/fabs.c @@ -0,0 +1,7 @@ +#include <math.h> + +double fabs(double x) +{ +	__asm__ ("fabs" : "+t"(x)); +	return x; +} diff --git a/src/math/i386/fabs.s b/src/math/i386/fabs.s deleted file mode 100644 index d66ea9a1..00000000 --- a/src/math/i386/fabs.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabs -.type fabs,@function -fabs: -	fldl 4(%esp) -	fabs -	ret diff --git a/src/math/i386/fabsf.c b/src/math/i386/fabsf.c new file mode 100644 index 00000000..d882eee3 --- /dev/null +++ b/src/math/i386/fabsf.c @@ -0,0 +1,7 @@ +#include <math.h> + +float fabsf(float x) +{ +	__asm__ ("fabs" : "+t"(x)); +	return x; +} diff --git a/src/math/i386/fabsf.s b/src/math/i386/fabsf.s deleted file mode 100644 index a981c422..00000000 --- a/src/math/i386/fabsf.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabsf -.type fabsf,@function -fabsf: -	flds 4(%esp) -	fabs -	ret diff --git a/src/math/i386/fabsl.c b/src/math/i386/fabsl.c new file mode 100644 index 00000000..cc1c9ed9 --- /dev/null +++ b/src/math/i386/fabsl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double fabsl(long double x) +{ +	__asm__ ("fabs" : "+t"(x)); +	return x; +} diff --git a/src/math/i386/fabsl.s b/src/math/i386/fabsl.s deleted file mode 100644 index ceef9e4c..00000000 --- a/src/math/i386/fabsl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabsl -.type fabsl,@function -fabsl: -	fldt 4(%esp) -	fabs -	ret diff --git a/src/math/i386/fmod.c b/src/math/i386/fmod.c new file mode 100644 index 00000000..ea0c58d9 --- /dev/null +++ b/src/math/i386/fmod.c @@ -0,0 +1,10 @@ +#include <math.h> + +double fmod(double x, double y) +{ +	unsigned short fpsr; +	// fprem does not introduce excess precision into x +	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} diff --git a/src/math/i386/fmod.s b/src/math/i386/fmod.s deleted file mode 100644 index 2113b3c5..00000000 --- 
a/src/math/i386/fmod.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmod -.type fmod,@function -fmod: -	fldl 12(%esp) -	fldl 4(%esp) -1:	fprem -	fnstsw %ax -	sahf -	jp 1b -	fstp %st(1) -	ret diff --git a/src/math/i386/fmodf.c b/src/math/i386/fmodf.c new file mode 100644 index 00000000..90b56ab0 --- /dev/null +++ b/src/math/i386/fmodf.c @@ -0,0 +1,10 @@ +#include <math.h> + +float fmodf(float x, float y) +{ +	unsigned short fpsr; +	// fprem does not introduce excess precision into x +	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} diff --git a/src/math/i386/fmodf.s b/src/math/i386/fmodf.s deleted file mode 100644 index e04e2a56..00000000 --- a/src/math/i386/fmodf.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmodf -.type fmodf,@function -fmodf: -	flds 8(%esp) -	flds 4(%esp) -1:	fprem -	fnstsw %ax -	sahf -	jp 1b -	fstp %st(1) -	ret diff --git a/src/math/i386/fmodl.c b/src/math/i386/fmodl.c new file mode 100644 index 00000000..3daeab06 --- /dev/null +++ b/src/math/i386/fmodl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double fmodl(long double x, long double y) +{ +	unsigned short fpsr; +	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} diff --git a/src/math/i386/fmodl.s b/src/math/i386/fmodl.s deleted file mode 100644 index 0cb3fe9b..00000000 --- a/src/math/i386/fmodl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmodl -.type fmodl,@function -fmodl: -	fldt 16(%esp) -	fldt 4(%esp) -1:	fprem -	fnstsw %ax -	sahf -	jp 1b -	fstp %st(1) -	ret diff --git a/src/math/i386/llrint.c b/src/math/i386/llrint.c new file mode 100644 index 00000000..aa400817 --- /dev/null +++ b/src/math/i386/llrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrint(double x) +{ +	long long r; +	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git a/src/math/i386/llrint.s b/src/math/i386/llrint.s deleted file mode 100644 index 8e89cd91..00000000 --- a/src/math/i386/llrint.s 
+++ /dev/null @@ -1,8 +0,0 @@ -.global llrint -.type llrint,@function -llrint: -	fldl 4(%esp) -	fistpll 4(%esp) -	mov 4(%esp),%eax -	mov 8(%esp),%edx -	ret diff --git a/src/math/i386/llrintf.c b/src/math/i386/llrintf.c new file mode 100644 index 00000000..c41a317b --- /dev/null +++ b/src/math/i386/llrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrintf(float x) +{ +	long long r; +	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git a/src/math/i386/llrintf.s b/src/math/i386/llrintf.s deleted file mode 100644 index aa850c6c..00000000 --- a/src/math/i386/llrintf.s +++ /dev/null @@ -1,9 +0,0 @@ -.global llrintf -.type llrintf,@function -llrintf: -	sub $8,%esp -	flds 12(%esp) -	fistpll (%esp) -	pop %eax -	pop %edx -	ret diff --git a/src/math/i386/llrintl.c b/src/math/i386/llrintl.c new file mode 100644 index 00000000..c439ef28 --- /dev/null +++ b/src/math/i386/llrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrintl(long double x) +{ +	long long r; +	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git a/src/math/i386/llrintl.s b/src/math/i386/llrintl.s deleted file mode 100644 index 1cfb56f1..00000000 --- a/src/math/i386/llrintl.s +++ /dev/null @@ -1,8 +0,0 @@ -.global llrintl -.type llrintl,@function -llrintl: -	fldt 4(%esp) -	fistpll 4(%esp) -	mov 4(%esp),%eax -	mov 8(%esp),%edx -	ret diff --git a/src/math/i386/log.s b/src/math/i386/log.s index fcccf030..08c59924 100644 --- a/src/math/i386/log.s +++ b/src/math/i386/log.s @@ -4,4 +4,6 @@ log:  	fldln2  	fldl 4(%esp)  	fyl2x +	fstpl 4(%esp) +	fldl 4(%esp)  	ret diff --git a/src/math/i386/log10.s b/src/math/i386/log10.s index 28eb5b2f..120e91ec 100644 --- a/src/math/i386/log10.s +++ b/src/math/i386/log10.s @@ -4,4 +4,6 @@ log10:  	fldlg2  	fldl 4(%esp)  	fyl2x +	fstpl 4(%esp) +	fldl 4(%esp)  	ret diff --git a/src/math/i386/log10f.s b/src/math/i386/log10f.s index c0c0c67e..b055493a 100644 --- a/src/math/i386/log10f.s +++ b/src/math/i386/log10f.s @@ -4,4 
+4,6 @@ log10f:  	fldlg2  	flds 4(%esp)  	fyl2x +	fstps 4(%esp) +	flds 4(%esp)  	ret diff --git a/src/math/i386/log1p.s b/src/math/i386/log1p.s index 6b6929c7..f3c95f83 100644 --- a/src/math/i386/log1p.s +++ b/src/math/i386/log1p.s @@ -10,15 +10,16 @@ log1p:  	cmp $0x00100000,%eax  	jb 2f  	fyl2xp1 +	fstpl 4(%esp) +	fldl 4(%esp)  	ret  1:	fld1  	faddp  	fyl2x +	fstpl 4(%esp) +	fldl 4(%esp)  	ret  		# subnormal x, return x with underflow -2:	fnstsw %ax -	and $16,%ax -	jnz 1f -	fsts 4(%esp) +2:	fsts 4(%esp)  	fstp %st(1) -1:	ret +	ret diff --git a/src/math/i386/log1pf.s b/src/math/i386/log1pf.s index c0bcd30f..9f13d95f 100644 --- a/src/math/i386/log1pf.s +++ b/src/math/i386/log1pf.s @@ -10,16 +10,17 @@ log1pf:  	cmp $0x00800000,%eax  	jb 2f  	fyl2xp1 +	fstps 4(%esp) +	flds 4(%esp)  	ret  1:	fld1  	faddp  	fyl2x +	fstps 4(%esp) +	flds 4(%esp)  	ret  		# subnormal x, return x with underflow -2:	fnstsw %ax -	and $16,%ax -	jnz 1f -	fxch +2:	fxch  	fmul %st(1)  	fstps 4(%esp) -1:	ret +	ret diff --git a/src/math/i386/log2.s b/src/math/i386/log2.s index 15088037..7eff0b61 100644 --- a/src/math/i386/log2.s +++ b/src/math/i386/log2.s @@ -4,4 +4,6 @@ log2:  	fld1  	fldl 4(%esp)  	fyl2x +	fstpl 4(%esp) +	fldl 4(%esp)  	ret diff --git a/src/math/i386/log2f.s b/src/math/i386/log2f.s index 00cdce75..b32fa2f7 100644 --- a/src/math/i386/log2f.s +++ b/src/math/i386/log2f.s @@ -4,4 +4,6 @@ log2f:  	fld1  	flds 4(%esp)  	fyl2x +	fstps 4(%esp) +	flds 4(%esp)  	ret diff --git a/src/math/i386/logf.s b/src/math/i386/logf.s index da7ff3ae..4d0346a4 100644 --- a/src/math/i386/logf.s +++ b/src/math/i386/logf.s @@ -4,4 +4,6 @@ logf:  	fldln2  	flds 4(%esp)  	fyl2x +	fstps 4(%esp) +	flds 4(%esp)  	ret diff --git a/src/math/i386/lrint.c b/src/math/i386/lrint.c new file mode 100644 index 00000000..89563ab2 --- /dev/null +++ b/src/math/i386/lrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrint(double x) +{ +	long r; +	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git 
a/src/math/i386/lrint.s b/src/math/i386/lrint.s deleted file mode 100644 index 02b83d9f..00000000 --- a/src/math/i386/lrint.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrint -.type lrint,@function -lrint: -	fldl 4(%esp) -	fistpl 4(%esp) -	mov 4(%esp),%eax -	ret diff --git a/src/math/i386/lrintf.c b/src/math/i386/lrintf.c new file mode 100644 index 00000000..0bbf29de --- /dev/null +++ b/src/math/i386/lrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintf(float x) +{ +	long r; +	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git a/src/math/i386/lrintf.s b/src/math/i386/lrintf.s deleted file mode 100644 index 907aac29..00000000 --- a/src/math/i386/lrintf.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrintf -.type lrintf,@function -lrintf: -	flds 4(%esp) -	fistpl 4(%esp) -	mov 4(%esp),%eax -	ret diff --git a/src/math/i386/lrintl.c b/src/math/i386/lrintl.c new file mode 100644 index 00000000..eb8c0902 --- /dev/null +++ b/src/math/i386/lrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintl(long double x) +{ +	long r; +	__asm__ ("fistpl %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git a/src/math/i386/lrintl.s b/src/math/i386/lrintl.s deleted file mode 100644 index 3ae05aac..00000000 --- a/src/math/i386/lrintl.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrintl -.type lrintl,@function -lrintl: -	fldt 4(%esp) -	fistpl 4(%esp) -	mov 4(%esp),%eax -	ret diff --git a/src/math/i386/remainder.c b/src/math/i386/remainder.c new file mode 100644 index 00000000..c083df90 --- /dev/null +++ b/src/math/i386/remainder.c @@ -0,0 +1,12 @@ +#include <math.h> + +double remainder(double x, double y) +{ +	unsigned short fpsr; +	// fprem1 does not introduce excess precision into x +	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} + +weak_alias(remainder, drem); diff --git a/src/math/i386/remainder.s b/src/math/i386/remainder.s deleted file mode 100644 index ab1da95d..00000000 --- a/src/math/i386/remainder.s 
+++ /dev/null @@ -1,14 +0,0 @@ -.global remainder -.type remainder,@function -remainder: -.weak drem -.type drem,@function -drem: -	fldl 12(%esp) -	fldl 4(%esp) -1:	fprem1 -	fnstsw %ax -	sahf -	jp 1b -	fstp %st(1) -	ret diff --git a/src/math/i386/remainderf.c b/src/math/i386/remainderf.c new file mode 100644 index 00000000..280207d2 --- /dev/null +++ b/src/math/i386/remainderf.c @@ -0,0 +1,12 @@ +#include <math.h> + +float remainderf(float x, float y) +{ +	unsigned short fpsr; +	// fprem1 does not introduce excess precision into x +	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} + +weak_alias(remainderf, dremf); diff --git a/src/math/i386/remainderf.s b/src/math/i386/remainderf.s deleted file mode 100644 index 6a7378a3..00000000 --- a/src/math/i386/remainderf.s +++ /dev/null @@ -1,14 +0,0 @@ -.global remainderf -.type remainderf,@function -remainderf: -.weak dremf -.type dremf,@function -dremf: -	flds 8(%esp) -	flds 4(%esp) -1:	fprem1 -	fnstsw %ax -	sahf -	jp 1b -	fstp %st(1) -	ret diff --git a/src/math/i386/remainderl.c b/src/math/i386/remainderl.c new file mode 100644 index 00000000..8cf75071 --- /dev/null +++ b/src/math/i386/remainderl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double remainderl(long double x, long double y) +{ +	unsigned short fpsr; +	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} diff --git a/src/math/i386/remainderl.s b/src/math/i386/remainderl.s deleted file mode 100644 index b41518ed..00000000 --- a/src/math/i386/remainderl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global remainderl -.type remainderl,@function -remainderl: -	fldt 16(%esp) -	fldt 4(%esp) -1:	fprem1 -	fnstsw %ax -	sahf -	jp 1b -	fstp %st(1) -	ret diff --git a/src/math/i386/rint.c b/src/math/i386/rint.c new file mode 100644 index 00000000..a5276a60 --- /dev/null +++ b/src/math/i386/rint.c @@ -0,0 +1,7 @@ +#include <math.h> + +double rint(double x) +{ +	__asm__ 
("frndint" : "+t"(x)); +	return x; +} diff --git a/src/math/i386/rint.s b/src/math/i386/rint.s deleted file mode 100644 index bb99a11c..00000000 --- a/src/math/i386/rint.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rint -.type rint,@function -rint: -	fldl 4(%esp) -	frndint -	ret diff --git a/src/math/i386/rintf.c b/src/math/i386/rintf.c new file mode 100644 index 00000000..bb4121a4 --- /dev/null +++ b/src/math/i386/rintf.c @@ -0,0 +1,7 @@ +#include <math.h> + +float rintf(float x) +{ +	__asm__ ("frndint" : "+t"(x)); +	return x; +} diff --git a/src/math/i386/rintf.s b/src/math/i386/rintf.s deleted file mode 100644 index bce4c5a6..00000000 --- a/src/math/i386/rintf.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rintf -.type rintf,@function -rintf: -	flds 4(%esp) -	frndint -	ret diff --git a/src/math/i386/rintl.c b/src/math/i386/rintl.c new file mode 100644 index 00000000..e1a92077 --- /dev/null +++ b/src/math/i386/rintl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double rintl(long double x) +{ +	__asm__ ("frndint" : "+t"(x)); +	return x; +} diff --git a/src/math/i386/rintl.s b/src/math/i386/rintl.s deleted file mode 100644 index cd2bf9a9..00000000 --- a/src/math/i386/rintl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rintl -.type rintl,@function -rintl: -	fldt 4(%esp) -	frndint -	ret diff --git a/src/math/i386/sqrt.c b/src/math/i386/sqrt.c new file mode 100644 index 00000000..934fbcca --- /dev/null +++ b/src/math/i386/sqrt.c @@ -0,0 +1,15 @@ +#include "libm.h" + +double sqrt(double x) +{ +	union ldshape ux; +	unsigned fpsr; +	__asm__ ("fsqrt; fnstsw %%ax": "=t"(ux.f), "=a"(fpsr) : "0"(x)); +	if ((ux.i.m & 0x7ff) != 0x400) +		return (double)ux.f; +	/* Rounding to double would have encountered an exact halfway case. +	   Adjust mantissa downwards if fsqrt rounded up, else upwards. 
+	   (result of fsqrt could not have been exact) */ +	ux.i.m ^= (fpsr & 0x200) + 0x300; +	return (double)ux.f; +} diff --git a/src/math/i386/sqrt.s b/src/math/i386/sqrt.s deleted file mode 100644 index 57837e25..00000000 --- a/src/math/i386/sqrt.s +++ /dev/null @@ -1,21 +0,0 @@ -.global sqrt -.type sqrt,@function -sqrt:	fldl 4(%esp) -	fsqrt -	fnstsw %ax -	sub $12,%esp -	fld %st(0) -	fstpt (%esp) -	mov (%esp),%ecx -	and $0x7ff,%ecx -	cmp $0x400,%ecx -	jnz 1f -	and $0x200,%eax -	sub $0x100,%eax -	sub %eax,(%esp) -	fstp %st(0) -	fldt (%esp) -1:	add $12,%esp -	fstpl 4(%esp) -	fldl 4(%esp) -	ret diff --git a/src/math/i386/sqrtf.c b/src/math/i386/sqrtf.c new file mode 100644 index 00000000..41c65c2b --- /dev/null +++ b/src/math/i386/sqrtf.c @@ -0,0 +1,12 @@ +#include <math.h> + +float sqrtf(float x) +{ +	long double t; +	/* The long double result has sufficient precision so that +	 * second rounding to float still keeps the returned value +	 * correctly rounded, see Pierre Roux, "Innocuous Double +	 * Rounding of Basic Arithmetic Operations". 
*/ +	__asm__ ("fsqrt" : "=t"(t) : "0"(x)); +	return (float)t; +} diff --git a/src/math/i386/sqrtf.s b/src/math/i386/sqrtf.s deleted file mode 100644 index 9e944f45..00000000 --- a/src/math/i386/sqrtf.s +++ /dev/null @@ -1,7 +0,0 @@ -.global sqrtf -.type sqrtf,@function -sqrtf:	flds 4(%esp) -	fsqrt -	fstps 4(%esp) -	flds 4(%esp) -	ret diff --git a/src/math/i386/sqrtl.c b/src/math/i386/sqrtl.c new file mode 100644 index 00000000..864cfcc4 --- /dev/null +++ b/src/math/i386/sqrtl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double sqrtl(long double x) +{ +	__asm__ ("fsqrt" : "+t"(x)); +	return x; +} diff --git a/src/math/i386/sqrtl.s b/src/math/i386/sqrtl.s deleted file mode 100644 index e0d42616..00000000 --- a/src/math/i386/sqrtl.s +++ /dev/null @@ -1,5 +0,0 @@ -.global sqrtl -.type sqrtl,@function -sqrtl:	fldt 4(%esp) -	fsqrt -	ret diff --git a/src/math/log.c b/src/math/log.c index e61e113d..cc52585a 100644 --- a/src/math/log.c +++ b/src/math/log.c @@ -1,118 +1,112 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_log.c */  /* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * Double-precision log(x) function.   * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ -/* log(x) - * Return the logarithm of x - * - * Method : - *   1. Argument Reduction: find k and f such that - *                      x = 2^k * (1+f), - *         where  sqrt(2)/2 < 1+f < sqrt(2) . - * - *   2. Approximation of log(1+f). 
- *      Let s = f/(2+f) ; based on log(1+f) = log(1+s) - log(1-s) - *               = 2s + 2/3 s**3 + 2/5 s**5 + ....., - *               = 2s + s*R - *      We use a special Remez algorithm on [0,0.1716] to generate - *      a polynomial of degree 14 to approximate R The maximum error - *      of this polynomial approximation is bounded by 2**-58.45. In - *      other words, - *                      2      4      6      8      10      12      14 - *          R(z) ~ Lg1*s +Lg2*s +Lg3*s +Lg4*s +Lg5*s  +Lg6*s  +Lg7*s - *      (the values of Lg1 to Lg7 are listed in the program) - *      and - *          |      2          14          |     -58.45 - *          | Lg1*s +...+Lg7*s    -  R(z) | <= 2 - *          |                             | - *      Note that 2s = f - s*f = f - hfsq + s*hfsq, where hfsq = f*f/2. - *      In order to guarantee error in log below 1ulp, we compute log - *      by - *              log(1+f) = f - s*(f - R)        (if f is not too large) - *              log(1+f) = f - (hfsq - s*(hfsq+R)).     (better accuracy) - * - *      3. Finally,  log(x) = k*ln2 + log(1+f). - *                          = k*ln2_hi+(f-(hfsq-(s*(hfsq+R)+k*ln2_lo))) - *         Here ln2 is split into two floating point number: - *                      ln2_hi + ln2_lo, - *         where n*ln2_hi is always exact for |n| < 2000. - * - * Special cases: - *      log(x) is NaN with signal if x < 0 (including -INF) ; - *      log(+INF) is +INF; log(0) is -INF with signal; - *      log(NaN) is that NaN with no signal. - * - * Accuracy: - *      according to an error analysis, the error is always less than - *      1 ulp (unit in the last place). - * - * Constants: - * The hexadecimal values are the intended ones for the following - * constants. The decimal values may be used, provided that the - * compiler will convert from decimal to binary accurately enough - * to produce the hexadecimal values shown. + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT   */  #include <math.h>  #include <stdint.h> +#include "libm.h" +#include "log_data.h" + +#define T __log_data.tab +#define T2 __log_data.tab2 +#define B __log_data.poly1 +#define A __log_data.poly +#define Ln2hi __log_data.ln2hi +#define Ln2lo __log_data.ln2lo +#define N (1 << LOG_TABLE_BITS) +#define OFF 0x3fe6000000000000 -static const double -ln2_hi = 6.93147180369123816490e-01,  /* 3fe62e42 fee00000 */ -ln2_lo = 1.90821492927058770002e-10,  /* 3dea39ef 35793c76 */ -Lg1 = 6.666666666666735130e-01,  /* 3FE55555 55555593 */ -Lg2 = 3.999999999940941908e-01,  /* 3FD99999 9997FA04 */ -Lg3 = 2.857142874366239149e-01,  /* 3FD24924 94229359 */ -Lg4 = 2.222219843214978396e-01,  /* 3FCC71C5 1D8E78AF */ -Lg5 = 1.818357216161805012e-01,  /* 3FC74664 96CB03DE */ -Lg6 = 1.531383769920937332e-01,  /* 3FC39A09 D078C69F */ -Lg7 = 1.479819860511658591e-01;  /* 3FC2F112 DF3E5244 */ +/* Top 16 bits of a double.  */ +static inline uint32_t top16(double x) +{ +	return asuint64(x) >> 48; +}  double log(double x)  { -	union {double f; uint64_t i;} u = {x}; -	double_t hfsq,f,s,z,R,w,t1,t2,dk; -	uint32_t hx; -	int k; +	double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; +	uint64_t ix, iz, tmp; +	uint32_t top; +	int k, i; + +	ix = asuint64(x); +	top = top16(x); +#define LO asuint64(1.0 - 0x1p-4) +#define HI asuint64(1.0 + 0x1.09p-4) +	if (predict_false(ix - LO < HI - LO)) { +		/* Handle close to 1.0 inputs separately.  */ +		/* Fix sign of zero with downward rounding when x==1.  */ +		if (WANT_ROUNDING && predict_false(ix == asuint64(1.0))) +			return 0; +		r = x - 1.0; +		r2 = r * r; +		r3 = r * r2; +		y = r3 * +		    (B[1] + r * B[2] + r2 * B[3] + +		     r3 * (B[4] + r * B[5] + r2 * B[6] + +			   r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); +		/* Worst-case error is around 0.507 ULP.  */ +		w = r * 0x1p27; +		double_t rhi = r + w - w; +		double_t rlo = r - rhi; +		w = rhi * rhi * B[0]; /* B[0] == -0.5.  
*/ +		hi = r + w; +		lo = r - hi + w; +		lo += B[0] * rlo * (rhi + r); +		y += lo; +		y += hi; +		return eval_as_double(y); +	} +	if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) { +		/* x < 0x1p-1022 or inf or nan.  */ +		if (ix * 2 == 0) +			return __math_divzero(1); +		if (ix == asuint64(INFINITY)) /* log(inf) == inf.  */ +			return x; +		if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) +			return __math_invalid(x); +		/* x is subnormal, normalize it.  */ +		ix = asuint64(x * 0x1p52); +		ix -= 52ULL << 52; +	} + +	/* x = 2^k z; where z is in range [OFF,2*OFF) and exact. +	   The range is split into N subintervals. +	   The ith subinterval contains z and c is near its center.  */ +	tmp = ix - OFF; +	i = (tmp >> (52 - LOG_TABLE_BITS)) % N; +	k = (int64_t)tmp >> 52; /* arithmetic shift */ +	iz = ix - (tmp & 0xfffULL << 52); +	invc = T[i].invc; +	logc = T[i].logc; +	z = asdouble(iz); -	hx = u.i>>32; -	k = 0; -	if (hx < 0x00100000 || hx>>31) { -		if (u.i<<1 == 0) -			return -1/(x*x);  /* log(+-0)=-inf */ -		if (hx>>31) -			return (x-x)/0.0; /* log(-#) = NaN */ -		/* subnormal number, scale x up */ -		k -= 54; -		x *= 0x1p54; -		u.f = x; -		hx = u.i>>32; -	} else if (hx >= 0x7ff00000) { -		return x; -	} else if (hx == 0x3ff00000 && u.i<<32 == 0) -		return 0; +	/* log(x) = log1p(z/c-1) + log(c) + k*Ln2.  */ +	/* r ~= z/c - 1, |r| < 1/(2*N).  */ +#if __FP_FAST_FMA +	/* rounding error: 0x1p-55/N.  */ +	r = __builtin_fma(z, invc, -1.0); +#else +	/* rounding error: 0x1p-55/N + 0x1p-66.  */ +	r = (z - T2[i].chi - T2[i].clo) * invc; +#endif +	kd = (double_t)k; -	/* reduce x into [sqrt(2)/2, sqrt(2)] */ -	hx += 0x3ff00000 - 0x3fe6a09e; -	k += (int)(hx>>20) - 0x3ff; -	hx = (hx&0x000fffff) + 0x3fe6a09e; -	u.i = (uint64_t)hx<<32 | (u.i&0xffffffff); -	x = u.f; +	/* hi + lo = r + log(c) + k*Ln2.  
*/ +	w = kd * Ln2hi + logc; +	hi = w + r; +	lo = w - hi + r + kd * Ln2lo; -	f = x - 1.0; -	hfsq = 0.5*f*f; -	s = f/(2.0+f); -	z = s*s; -	w = z*z; -	t1 = w*(Lg2+w*(Lg4+w*Lg6)); -	t2 = z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7))); -	R = t2 + t1; -	dk = k; -	return s*(hfsq+R) + dk*ln2_lo - hfsq + f + dk*ln2_hi; +	/* log(x) = lo + (log1p(r) - r) + hi.  */ +	r2 = r * r; /* rounding error: 0x1p-54/N^2.  */ +	/* Worst case error if |y| > 0x1p-5: +	   0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma) +	   Worst case error if |y| > 0x1p-4: +	   0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma).  */ +	y = lo + r2 * A[0] + +	    r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; +	return eval_as_double(y);  } diff --git a/src/math/log2.c b/src/math/log2.c index 0aafad4b..1276ed4e 100644 --- a/src/math/log2.c +++ b/src/math/log2.c @@ -1,122 +1,122 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_log2.c */  /* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * Double-precision log2(x) function.   * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ -/* - * Return the base 2 logarithm of x.  See log.c for most comments. - * - * Reduce x to 2^k (1+f) and calculate r = log(1+f) - f + f*f/2 - * as in log.c, then combine and scale in extra precision: - *    log2(x) = (f - f*f/2 + r)/log(2) + k + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT   */  #include <math.h>  #include <stdint.h> +#include "libm.h" +#include "log2_data.h" -static const double -ivln2hi = 1.44269504072144627571e+00, /* 0x3ff71547, 0x65200000 */ -ivln2lo = 1.67517131648865118353e-10, /* 0x3de705fc, 0x2eefa200 */ -Lg1 = 6.666666666666735130e-01,  /* 3FE55555 55555593 */ -Lg2 = 3.999999999940941908e-01,  /* 3FD99999 9997FA04 */ -Lg3 = 2.857142874366239149e-01,  /* 3FD24924 94229359 */ -Lg4 = 2.222219843214978396e-01,  /* 3FCC71C5 1D8E78AF */ -Lg5 = 1.818357216161805012e-01,  /* 3FC74664 96CB03DE */ -Lg6 = 1.531383769920937332e-01,  /* 3FC39A09 D078C69F */ -Lg7 = 1.479819860511658591e-01;  /* 3FC2F112 DF3E5244 */ +#define T __log2_data.tab +#define T2 __log2_data.tab2 +#define B __log2_data.poly1 +#define A __log2_data.poly +#define InvLn2hi __log2_data.invln2hi +#define InvLn2lo __log2_data.invln2lo +#define N (1 << LOG2_TABLE_BITS) +#define OFF 0x3fe6000000000000 -double log2(double x) +/* Top 16 bits of a double.  */ +static inline uint32_t top16(double x)  { -	union {double f; uint64_t i;} u = {x}; -	double_t hfsq,f,s,z,R,w,t1,t2,y,hi,lo,val_hi,val_lo; -	uint32_t hx; -	int k; - -	hx = u.i>>32; -	k = 0; -	if (hx < 0x00100000 || hx>>31) { -		if (u.i<<1 == 0) -			return -1/(x*x);  /* log(+-0)=-inf */ -		if (hx>>31) -			return (x-x)/0.0; /* log(-#) = NaN */ -		/* subnormal number, scale x up */ -		k -= 54; -		x *= 0x1p54; -		u.f = x; -		hx = u.i>>32; -	} else if (hx >= 0x7ff00000) { -		return x; -	} else if (hx == 0x3ff00000 && u.i<<32 == 0) -		return 0; - -	/* reduce x into [sqrt(2)/2, sqrt(2)] */ -	hx += 0x3ff00000 - 0x3fe6a09e; -	k += (int)(hx>>20) - 0x3ff; -	hx = (hx&0x000fffff) + 0x3fe6a09e; -	u.i = (uint64_t)hx<<32 | (u.i&0xffffffff); -	x = u.f; +	return asuint64(x) >> 48; +} -	f = x - 1.0; -	hfsq = 0.5*f*f; -	s = f/(2.0+f); -	z = s*s; -	w = z*z; -	t1 = w*(Lg2+w*(Lg4+w*Lg6)); -	t2 = z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7))); -	R = t2 + t1; +double log2(double x) +{ +	double_t z, r, r2, r4, y, invc, logc, kd, 
hi, lo, t1, t2, t3, p; +	uint64_t ix, iz, tmp; +	uint32_t top; +	int k, i; -	/* -	 * f-hfsq must (for args near 1) be evaluated in extra precision -	 * to avoid a large cancellation when x is near sqrt(2) or 1/sqrt(2). -	 * This is fairly efficient since f-hfsq only depends on f, so can -	 * be evaluated in parallel with R.  Not combining hfsq with R also -	 * keeps R small (though not as small as a true `lo' term would be), -	 * so that extra precision is not needed for terms involving R. -	 * -	 * Compiler bugs involving extra precision used to break Dekker's -	 * theorem for spitting f-hfsq as hi+lo, unless double_t was used -	 * or the multi-precision calculations were avoided when double_t -	 * has extra precision.  These problems are now automatically -	 * avoided as a side effect of the optimization of combining the -	 * Dekker splitting step with the clear-low-bits step. -	 * -	 * y must (for args near sqrt(2) and 1/sqrt(2)) be added in extra -	 * precision to avoid a very large cancellation when x is very near -	 * these values.  Unlike the above cancellations, this problem is -	 * specific to base 2.  It is strange that adding +-1 is so much -	 * harder than adding +-ln2 or +-log10_2. -	 * -	 * This uses Dekker's theorem to normalize y+val_hi, so the -	 * compiler bugs are back in some configurations, sigh.  And I -	 * don't want to used double_t to avoid them, since that gives a -	 * pessimization and the support for avoiding the pessimization -	 * is not yet available. -	 * -	 * The multi-precision calculations for the multiplications are -	 * routine. -	 */ +	ix = asuint64(x); +	top = top16(x); +#define LO asuint64(1.0 - 0x1.5b51p-5) +#define HI asuint64(1.0 + 0x1.6ab2p-5) +	if (predict_false(ix - LO < HI - LO)) { +		/* Handle close to 1.0 inputs separately.  */ +		/* Fix sign of zero with downward rounding when x==1.  
*/ +		if (WANT_ROUNDING && predict_false(ix == asuint64(1.0))) +			return 0; +		r = x - 1.0; +#if __FP_FAST_FMA +		hi = r * InvLn2hi; +		lo = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -hi); +#else +		double_t rhi, rlo; +		rhi = asdouble(asuint64(r) & -1ULL << 32); +		rlo = r - rhi; +		hi = rhi * InvLn2hi; +		lo = rlo * InvLn2hi + r * InvLn2lo; +#endif +		r2 = r * r; /* rounding error: 0x1p-62.  */ +		r4 = r2 * r2; +		/* Worst-case error is less than 0.54 ULP (0.55 ULP without fma).  */ +		p = r2 * (B[0] + r * B[1]); +		y = hi + p; +		lo += hi - y + p; +		lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) + +			    r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9]))); +		y += lo; +		return eval_as_double(y); +	} +	if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) { +		/* x < 0x1p-1022 or inf or nan.  */ +		if (ix * 2 == 0) +			return __math_divzero(1); +		if (ix == asuint64(INFINITY)) /* log(inf) == inf.  */ +			return x; +		if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) +			return __math_invalid(x); +		/* x is subnormal, normalize it.  */ +		ix = asuint64(x * 0x1p52); +		ix -= 52ULL << 52; +	} -	/* hi+lo = f - hfsq + s*(hfsq+R) ~ log(1+f) */ -	hi = f - hfsq; -	u.f = hi; -	u.i &= (uint64_t)-1<<32; -	hi = u.f; -	lo = f - hi - hfsq + s*(hfsq+R); +	/* x = 2^k z; where z is in range [OFF,2*OFF) and exact. +	   The range is split into N subintervals. +	   The ith subinterval contains z and c is near its center.  */ +	tmp = ix - OFF; +	i = (tmp >> (52 - LOG2_TABLE_BITS)) % N; +	k = (int64_t)tmp >> 52; /* arithmetic shift */ +	iz = ix - (tmp & 0xfffULL << 52); +	invc = T[i].invc; +	logc = T[i].logc; +	z = asdouble(iz); +	kd = (double_t)k; -	val_hi = hi*ivln2hi; -	val_lo = (lo+hi)*ivln2lo + lo*ivln2hi; +	/* log2(x) = log2(z/c) + log2(c) + k.  */ +	/* r ~= z/c - 1, |r| < 1/(2*N).  */ +#if __FP_FAST_FMA +	/* rounding error: 0x1p-55/N.  
*/ +	r = __builtin_fma(z, invc, -1.0); +	t1 = r * InvLn2hi; +	t2 = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -t1); +#else +	double_t rhi, rlo; +	/* rounding error: 0x1p-55/N + 0x1p-65.  */ +	r = (z - T2[i].chi - T2[i].clo) * invc; +	rhi = asdouble(asuint64(r) & -1ULL << 32); +	rlo = r - rhi; +	t1 = rhi * InvLn2hi; +	t2 = rlo * InvLn2hi + r * InvLn2lo; +#endif -	/* spadd(val_hi, val_lo, y), except for not using double_t: */ -	y = k; -	w = y + val_hi; -	val_lo += (y - w) + val_hi; -	val_hi = w; +	/* hi + lo = r/ln2 + log2(c) + k.  */ +	t3 = kd + logc; +	hi = t3 + t1; +	lo = t3 - hi + t1 + t2; -	return val_lo + val_hi; +	/* log2(r+1) = r/ln2 + r^2*poly(r).  */ +	/* Evaluation is optimized assuming superscalar pipelined execution.  */ +	r2 = r * r; /* rounding error: 0x1p-54/N^2.  */ +	r4 = r2 * r2; +	/* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma). +	   ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma).  */ +	p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]); +	y = lo + r2 * p + hi; +	return eval_as_double(y);  } diff --git a/src/math/log2_data.c b/src/math/log2_data.c new file mode 100644 index 00000000..3dd1ca51 --- /dev/null +++ b/src/math/log2_data.c @@ -0,0 +1,201 @@ +/* + * Data for log2. + * + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "log2_data.h" + +#define N (1 << LOG2_TABLE_BITS) + +const struct log2_data __log2_data = { +// First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0 +.invln2hi = 0x1.7154765200000p+0, +.invln2lo = 0x1.705fc2eefa200p-33, +.poly1 = { +// relative error: 0x1.2fad8188p-63 +// in -0x1.5b51p-5 0x1.6ab2p-5 +-0x1.71547652b82fep-1, +0x1.ec709dc3a03f7p-2, +-0x1.71547652b7c3fp-2, +0x1.2776c50f05be4p-2, +-0x1.ec709dd768fe5p-3, +0x1.a61761ec4e736p-3, +-0x1.7153fbc64a79bp-3, +0x1.484d154f01b4ap-3, +-0x1.289e4a72c383cp-3, +0x1.0b32f285aee66p-3, +}, +.poly = { +// relative error: 0x1.a72c2bf8p-58 +// abs error: 0x1.67a552c8p-66 +// in -0x1.f45p-8 0x1.f45p-8 +-0x1.71547652b8339p-1, +0x1.ec709dc3a04bep-2, +-0x1.7154764702ffbp-2, +0x1.2776c50034c48p-2, +-0x1.ec7b328ea92bcp-3, +0x1.a6225e117f92ep-3, +}, +/* Algorithm: + +	x = 2^k z +	log2(x) = k + log2(c) + log2(z/c) +	log2(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + +	tab[i].invc = 1/c +	tab[i].logc = (double)log2(c) +	tab2[i].chi = (double)c +	tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + +	1) the rounding error in 0x1.8p10 + logc is 0, +	2) the rounding error in z - chi - clo is < 0x1p-64 and +	3) the rounding error in (double)log2(c) is minimized (< 0x1p-68). + +Note: 1) ensures that k + logc can be computed without rounding error, 2) +ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a +single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log2(x)| < 0x1p-4, this is not enough so that is special cased.  
*/ +.tab = { +{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1}, +{0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1}, +{0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1}, +{0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2}, +{0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2}, +{0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2}, +{0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2}, +{0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2}, +{0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2}, +{0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2}, +{0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2}, +{0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2}, +{0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2}, +{0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2}, +{0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2}, +{0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2}, +{0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2}, +{0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2}, +{0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2}, +{0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2}, +{0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3}, +{0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3}, +{0x1.288b02c7ccb50p+0, -0x1.b26034c14a000p-3}, +{0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3}, +{0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3}, +{0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3}, +{0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3}, +{0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3}, +{0x1.19453847f2200p+0, -0x1.162595afdc000p-3}, +{0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4}, +{0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4}, +{0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4}, +{0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4}, +{0x1.0db2077d03a8fp+0, -0x1.33f78b2014000p-4}, +{0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4}, +{0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5}, +{0x1.07325cac53b83p+0, -0x1.47a954f770000p-5}, +{0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6}, +{0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6}, +{0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8}, +{0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7}, 
+{0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5}, +{0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5}, +{0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4}, +{0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4}, +{0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4}, +{0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3}, +{0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3}, +{0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3}, +{0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3}, +{0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3}, +{0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3}, +{0x1.ac57026295039p-1, 0x1.0790ab4678000p-2}, +{0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2}, +{0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2}, +{0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2}, +{0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2}, +{0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2}, +{0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2}, +{0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2}, +{0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2}, +{0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2}, +{0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2}, +{0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2}, +}, +#if !__FP_FAST_FMA +.tab2 = { +{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55}, +{0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57}, +{0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55}, +{0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55}, +{0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55}, +{0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56}, +{0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56}, +{0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57}, +{0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55}, +{0x1.860002dfafcc3p-1, 0x1.df7f4a2f29a1fp-57}, +{0x1.89ffff78c6b5p-1, -0x1.e0453094995fdp-55}, +{0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55}, +{0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56}, +{0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56}, +{0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56}, +{0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55}, +{0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57}, +{0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55}, 
+{0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55}, +{0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58}, +{0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55}, +{0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58}, +{0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56}, +{0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56}, +{0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57}, +{0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56}, +{0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56}, +{0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55}, +{0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58}, +{0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56}, +{0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55}, +{0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56}, +{0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55}, +{0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56}, +{0x1.ea00027edc00cp-1, -0x1.c848309459811p-55}, +{0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55}, +{0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55}, +{0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59}, +{0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58}, +{0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55}, +{0x1.0200004292367p+0, 0x1.b7ff365324681p-54}, +{0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55}, +{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58}, +{0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54}, +{0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55}, +{0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54}, +{0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54}, +{0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54}, +{0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55}, +{0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55}, +{0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56}, +{0x1.2dfffe0293e39p+0, 0x1.b7c39dab2a6f9p-54}, +{0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56}, +{0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54}, +{0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56}, +{0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54}, +{0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56}, +{0x1.460000d387cb1p+0, 0x1.20837856599a6p-55}, +{0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55}, 
+{0x1.4e000043543f3p+0, -0x1.81125ed175329p-56}, +{0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54}, +{0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55}, +{0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55}, +{0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54}, +}, +#endif +}; diff --git a/src/math/log2_data.h b/src/math/log2_data.h new file mode 100644 index 00000000..276a786d --- /dev/null +++ b/src/math/log2_data.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG2_DATA_H +#define _LOG2_DATA_H + +#include <features.h> + +#define LOG2_TABLE_BITS 6 +#define LOG2_POLY_ORDER 7 +#define LOG2_POLY1_ORDER 11 +extern hidden const struct log2_data { +	double invln2hi; +	double invln2lo; +	double poly[LOG2_POLY_ORDER - 1]; +	double poly1[LOG2_POLY1_ORDER - 1]; +	struct { +		double invc, logc; +	} tab[1 << LOG2_TABLE_BITS]; +#if !__FP_FAST_FMA +	struct { +		double chi, clo; +	} tab2[1 << LOG2_TABLE_BITS]; +#endif +} __log2_data; + +#endif diff --git a/src/math/log2f.c b/src/math/log2f.c index b3e305fe..c368f88f 100644 --- a/src/math/log2f.c +++ b/src/math/log2f.c @@ -1,74 +1,72 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_log2f.c */  /* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * Single-precision log2 function.   * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ -/* - * See comments in log2.c. + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT   */  #include <math.h>  #include <stdint.h> +#include "libm.h" +#include "log2f_data.h" + +/* +LOG2F_TABLE_BITS = 4 +LOG2F_POLY_ORDER = 4 + +ULP error: 0.752 (nearest rounding.) +Relative error: 1.9 * 2^-26 (before rounding.) 
+*/ -static const float -ivln2hi =  1.4428710938e+00, /* 0x3fb8b000 */ -ivln2lo = -1.7605285393e-04, /* 0xb9389ad4 */ -/* |(log(1+s)-log(1-s))/s - Lg(s)| < 2**-34.24 (~[-4.95e-11, 4.97e-11]). */ -Lg1 = 0xaaaaaa.0p-24, /* 0.66666662693 */ -Lg2 = 0xccce13.0p-25, /* 0.40000972152 */ -Lg3 = 0x91e9ee.0p-25, /* 0.28498786688 */ -Lg4 = 0xf89e26.0p-26; /* 0.24279078841 */ +#define N (1 << LOG2F_TABLE_BITS) +#define T __log2f_data.tab +#define A __log2f_data.poly +#define OFF 0x3f330000  float log2f(float x)  { -	union {float f; uint32_t i;} u = {x}; -	float_t hfsq,f,s,z,R,w,t1,t2,hi,lo; -	uint32_t ix; -	int k; +	double_t z, r, r2, p, y, y0, invc, logc; +	uint32_t ix, iz, top, tmp; +	int k, i; -	ix = u.i; -	k = 0; -	if (ix < 0x00800000 || ix>>31) {  /* x < 2**-126  */ -		if (ix<<1 == 0) -			return -1/(x*x);  /* log(+-0)=-inf */ -		if (ix>>31) -			return (x-x)/0.0f; /* log(-#) = NaN */ -		/* subnormal number, scale up x */ -		k -= 25; -		x *= 0x1p25f; -		u.f = x; -		ix = u.i; -	} else if (ix >= 0x7f800000) { -		return x; -	} else if (ix == 0x3f800000) +	ix = asuint(x); +	/* Fix sign of zero with downward rounding when x==1.  */ +	if (WANT_ROUNDING && predict_false(ix == 0x3f800000))  		return 0; +	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { +		/* x < 0x1p-126 or inf or nan.  */ +		if (ix * 2 == 0) +			return __math_divzerof(1); +		if (ix == 0x7f800000) /* log2(inf) == inf.  */ +			return x; +		if ((ix & 0x80000000) || ix * 2 >= 0xff000000) +			return __math_invalidf(x); +		/* x is subnormal, normalize it.  */ +		ix = asuint(x * 0x1p23f); +		ix -= 23 << 23; +	} -	/* reduce x into [sqrt(2)/2, sqrt(2)] */ -	ix += 0x3f800000 - 0x3f3504f3; -	k += (int)(ix>>23) - 0x7f; -	ix = (ix&0x007fffff) + 0x3f3504f3; -	u.i = ix; -	x = u.f; +	/* x = 2^k z; where z is in range [OFF,2*OFF] and exact. +	   The range is split into N subintervals. +	   The ith subinterval contains z and c is near its center.  
*/ +	tmp = ix - OFF; +	i = (tmp >> (23 - LOG2F_TABLE_BITS)) % N; +	top = tmp & 0xff800000; +	iz = ix - top; +	k = (int32_t)tmp >> 23; /* arithmetic shift */ +	invc = T[i].invc; +	logc = T[i].logc; +	z = (double_t)asfloat(iz); -	f = x - 1.0f; -	s = f/(2.0f + f); -	z = s*s; -	w = z*z; -	t1= w*(Lg2+w*Lg4); -	t2= z*(Lg1+w*Lg3); -	R = t2 + t1; -	hfsq = 0.5f*f*f; +	/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ +	r = z * invc - 1; +	y0 = logc + (double_t)k; -	hi = f - hfsq; -	u.f = hi; -	u.i &= 0xfffff000; -	hi = u.f; -	lo = f - hi - hfsq + s*(hfsq+R); -	return (lo+hi)*ivln2lo + lo*ivln2hi + hi*ivln2hi + k; +	/* Pipelined polynomial evaluation to approximate log1p(r)/ln2.  */ +	r2 = r * r; +	y = A[1] * r + A[2]; +	y = A[0] * r2 + y; +	p = A[3] * r + y0; +	y = y * r2 + p; +	return eval_as_float(y);  } diff --git a/src/math/log2f_data.c b/src/math/log2f_data.c new file mode 100644 index 00000000..24e450f1 --- /dev/null +++ b/src/math/log2f_data.c @@ -0,0 +1,33 @@ +/* + * Data definition for log2f. + * + * Copyright (c) 2017-2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "log2f_data.h" + +const struct log2f_data __log2f_data = { +  .tab = { +  { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 }, +  { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 }, +  { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 }, +  { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 }, +  { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 }, +  { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 }, +  { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 }, +  { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 }, +  { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 }, +  { 0x1p+0, 0x0p+0 }, +  { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 }, +  { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 }, +  { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 }, +  { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 }, +  { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 }, +  { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 }, +  }, +  .poly = { +  -0x1.712b6f70a7e4dp-2, 0x1.ecabf496832ep-2, -0x1.715479ffae3dep-1, +  0x1.715475f35c8b8p0, +  } +}; diff --git a/src/math/log2f_data.h b/src/math/log2f_data.h new file mode 100644 index 00000000..4fa48956 --- /dev/null +++ b/src/math/log2f_data.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG2F_DATA_H +#define _LOG2F_DATA_H + +#include <features.h> + +#define LOG2F_TABLE_BITS 4 +#define LOG2F_POLY_ORDER 4 +extern hidden const struct log2f_data { +	struct { +		double invc, logc; +	} tab[1 << LOG2F_TABLE_BITS]; +	double poly[LOG2F_POLY_ORDER]; +} __log2f_data; + +#endif diff --git a/src/math/log_data.c b/src/math/log_data.c new file mode 100644 index 00000000..1a6ec712 --- /dev/null +++ b/src/math/log_data.c @@ -0,0 +1,328 @@ +/* + * Data for log. + * + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "log_data.h" + +#define N (1 << LOG_TABLE_BITS) + +const struct log_data __log_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.poly1 = { +// relative error: 0x1.c04d76cp-63 +// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) +-0x1p-1, +0x1.5555555555577p-2, +-0x1.ffffffffffdcbp-3, +0x1.999999995dd0cp-3, +-0x1.55555556745a7p-3, +0x1.24924a344de3p-3, +-0x1.fffffa4423d65p-4, +0x1.c7184282ad6cap-4, +-0x1.999eb43b068ffp-4, +0x1.78182f7afd085p-4, +-0x1.5521375d145cdp-4, +}, +.poly = { +// relative error: 0x1.926199e8p-56 +// abs error: 0x1.882ff33p-65 +// in -0x1.fp-9 0x1.fp-9 +-0x1.0000000000001p-1, +0x1.555555551305bp-2, +-0x1.fffffffeb459p-3, +0x1.999b324f10111p-3, +-0x1.55575e506c89fp-3, +}, +/* Algorithm: + +	x = 2^k z +	log(x) = k ln2 + log(c) + log(z/c) +	log(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + +	tab[i].invc = 1/c +	tab[i].logc = (double)log(c) +	tab2[i].chi = (double)c +	tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + +	1) the rounding error in 0x1.8p9 + logc is 0, +	2) the rounding error in z - chi - clo is < 0x1p-66 and +	3) the rounding error in (double)log(c) is minimized (< 0x1p-66). + +Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, +2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to +a single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log(x)| < 0x1p-4, this is not enough so that is special cased.  
*/ +.tab = { +{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, +{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, +{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, +{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, +{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, +{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, +{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, +{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, +{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, +{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, +{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, +{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, +{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, +{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, +{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, +{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, +{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, +{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, +{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, +{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, +{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, +{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, +{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, +{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, +{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, +{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, +{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, +{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, +{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, +{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, +{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, +{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, +{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, +{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, +{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, +{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, +{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, +{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, +{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, +{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, +{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, 
+{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, +{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, +{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, +{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, +{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, +{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, +{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, +{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, +{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, +{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, +{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, +{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, +{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, +{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, +{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, +{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, +{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, +{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, +{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, +{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, +{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, +{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, +{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, +{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, +{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, +{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, +{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, +{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, +{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, +{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, +{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, +{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, +{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, +{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, +{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, +{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, +{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, +{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, +{0x1.008040614b195p+0, -0x1.0040979240000p-9}, +{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, +{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, +{0x1.f6310b081992ep-1, 
0x1.3cea3c4c28000p-6}, +{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, +{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, +{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, +{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, +{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, +{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, +{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, +{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, +{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, +{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, +{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, +{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, +{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, +{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, +{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, +{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, +{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, +{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, +{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, +{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, +{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, +{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, +{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, +{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, +{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, +{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, +{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, +{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, +{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, +{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, +{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, +{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, +{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, +{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, +{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, +{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, +{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, +{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, +{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, +{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, +{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, +{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, 
+{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, +{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, +{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, +}, +#if !__FP_FAST_FMA +.tab2 = { +{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, +{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, +{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, +{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, +{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, +{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, +{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, +{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, +{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, +{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, +{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, +{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, +{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, +{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, +{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, +{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, +{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, +{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, +{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, +{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, +{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, +{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, +{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, +{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, +{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, +{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, +{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, +{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, +{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, +{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, +{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, +{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, +{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, +{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, +{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, +{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, +{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, +{0x1.ab0002a4f3e4bp-1, 
-0x1.9fd043cff3f5fp-57}, +{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, +{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, +{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, +{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, +{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, +{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, +{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, +{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, +{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, +{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, +{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, +{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, +{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, +{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, +{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, +{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, +{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, +{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, +{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, +{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, +{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, +{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, +{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, +{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, +{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, +{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, +{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, +{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, +{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, +{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, +{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, +{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, +{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, +{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, +{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, +{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, +{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, +{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, +{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, +{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, +{0x1.fd0001c77e49ep-1, 
0x1.181ee4bceacb1p-56}, +{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, +{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, +{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, +{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, +{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, +{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, +{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, +{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, +{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, +{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, +{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, +{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, +{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, +{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, +{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, +{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, +{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, +{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, +{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, +{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, +{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, +{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, +{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, +{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, +{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, +{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, +{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, +{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, +{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, +{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, +{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, +{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, +{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, +{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, +{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, +{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, +{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, +{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, +{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, +{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, +{0x1.4f0000ff03ee2p+0, 
0x1.77f890b85531cp-54}, +{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, +{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, +{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, +{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, +{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, +{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, +{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, +{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, +}, +#endif +}; diff --git a/src/math/log_data.h b/src/math/log_data.h new file mode 100644 index 00000000..1be22ab2 --- /dev/null +++ b/src/math/log_data.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG_DATA_H +#define _LOG_DATA_H + +#include <features.h> + +#define LOG_TABLE_BITS 7 +#define LOG_POLY_ORDER 6 +#define LOG_POLY1_ORDER 12 +extern hidden const struct log_data { +	double ln2hi; +	double ln2lo; +	double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1.  */ +	double poly1[LOG_POLY1_ORDER - 1]; +	struct { +		double invc, logc; +	} tab[1 << LOG_TABLE_BITS]; +#if !__FP_FAST_FMA +	struct { +		double chi, clo; +	} tab2[1 << LOG_TABLE_BITS]; +#endif +} __log_data; + +#endif diff --git a/src/math/logf.c b/src/math/logf.c index 52230a1b..e4c2237c 100644 --- a/src/math/logf.c +++ b/src/math/logf.c @@ -1,69 +1,71 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_logf.c */  /* - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. - */ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * Single-precision log function.   * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== + * Copyright (c) 2017-2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT   */  #include <math.h>  #include <stdint.h> +#include "libm.h" +#include "logf_data.h" + +/* +LOGF_TABLE_BITS = 4 +LOGF_POLY_ORDER = 4 + +ULP error: 0.818 (nearest rounding.) +Relative error: 1.957 * 2^-26 (before rounding.) +*/ -static const float -ln2_hi = 6.9313812256e-01, /* 0x3f317180 */ -ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */ -/* |(log(1+s)-log(1-s))/s - Lg(s)| < 2**-34.24 (~[-4.95e-11, 4.97e-11]). */ -Lg1 = 0xaaaaaa.0p-24, /* 0.66666662693 */ -Lg2 = 0xccce13.0p-25, /* 0.40000972152 */ -Lg3 = 0x91e9ee.0p-25, /* 0.28498786688 */ -Lg4 = 0xf89e26.0p-26; /* 0.24279078841 */ +#define T __logf_data.tab +#define A __logf_data.poly +#define Ln2 __logf_data.ln2 +#define N (1 << LOGF_TABLE_BITS) +#define OFF 0x3f330000  float logf(float x)  { -	union {float f; uint32_t i;} u = {x}; -	float_t hfsq,f,s,z,R,w,t1,t2,dk; -	uint32_t ix; -	int k; +	double_t z, r, r2, y, y0, invc, logc; +	uint32_t ix, iz, tmp; +	int k, i; -	ix = u.i; -	k = 0; -	if (ix < 0x00800000 || ix>>31) {  /* x < 2**-126  */ -		if (ix<<1 == 0) -			return -1/(x*x);  /* log(+-0)=-inf */ -		if (ix>>31) -			return (x-x)/0.0f; /* log(-#) = NaN */ -		/* subnormal number, scale up x */ -		k -= 25; -		x *= 0x1p25f; -		u.f = x; -		ix = u.i; -	} else if (ix >= 0x7f800000) { -		return x; -	} else if (ix == 0x3f800000) +	ix = asuint(x); +	/* Fix sign of zero with downward rounding when x==1.  */ +	if (WANT_ROUNDING && predict_false(ix == 0x3f800000))  		return 0; +	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { +		/* x < 0x1p-126 or inf or nan.  */ +		if (ix * 2 == 0) +			return __math_divzerof(1); +		if (ix == 0x7f800000) /* log(inf) == inf.  */ +			return x; +		if ((ix & 0x80000000) || ix * 2 >= 0xff000000) +			return __math_invalidf(x); +		/* x is subnormal, normalize it.  */ +		ix = asuint(x * 0x1p23f); +		ix -= 23 << 23; +	} + +	/* x = 2^k z; where z is in range [OFF,2*OFF] and exact. +	   The range is split into N subintervals. 
+	   The ith subinterval contains z and c is near its center.  */ +	tmp = ix - OFF; +	i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; +	k = (int32_t)tmp >> 23; /* arithmetic shift */ +	iz = ix - (tmp & 0xff800000); +	invc = T[i].invc; +	logc = T[i].logc; +	z = (double_t)asfloat(iz); -	/* reduce x into [sqrt(2)/2, sqrt(2)] */ -	ix += 0x3f800000 - 0x3f3504f3; -	k += (int)(ix>>23) - 0x7f; -	ix = (ix&0x007fffff) + 0x3f3504f3; -	u.i = ix; -	x = u.f; +	/* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ +	r = z * invc - 1; +	y0 = logc + (double_t)k * Ln2; -	f = x - 1.0f; -	s = f/(2.0f + f); -	z = s*s; -	w = z*z; -	t1= w*(Lg2+w*Lg4); -	t2= z*(Lg1+w*Lg3); -	R = t2 + t1; -	hfsq = 0.5f*f*f; -	dk = k; -	return s*(hfsq+R) + dk*ln2_lo - hfsq + f + dk*ln2_hi; +	/* Pipelined polynomial evaluation to approximate log1p(r).  */ +	r2 = r * r; +	y = A[1] * r + A[2]; +	y = A[0] * r2 + y; +	y = y * r2 + (y0 + r); +	return eval_as_float(y);  } diff --git a/src/math/logf_data.c b/src/math/logf_data.c new file mode 100644 index 00000000..857221f7 --- /dev/null +++ b/src/math/logf_data.c @@ -0,0 +1,33 @@ +/* + * Data definition for logf. + * + * Copyright (c) 2017-2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include "logf_data.h" + +const struct logf_data __logf_data = { +  .tab = { +  { 0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2 }, +  { 0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2 }, +  { 0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2 }, +  { 0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3 }, +  { 0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3 }, +  { 0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3 }, +  { 0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4 }, +  { 0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4 }, +  { 0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5 }, +  { 0x1p+0, 0x0p+0 }, +  { 0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5 }, +  { 0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4 }, +  { 0x1.b2036576afce6p-1, 0x1.526e57720db08p-3 }, +  { 0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3 }, +  { 0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2 }, +  { 0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2 }, +  }, +  .ln2 = 0x1.62e42fefa39efp-1, +  .poly = { +  -0x1.00ea348b88334p-2, 0x1.5575b0be00b6ap-2, -0x1.ffffef20a4123p-2, +  } +}; diff --git a/src/math/logf_data.h b/src/math/logf_data.h new file mode 100644 index 00000000..00cff6f8 --- /dev/null +++ b/src/math/logf_data.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOGF_DATA_H +#define _LOGF_DATA_H + +#include <features.h> + +#define LOGF_TABLE_BITS 4 +#define LOGF_POLY_ORDER 4 +extern hidden const struct logf_data { +	struct { +		double invc, logc; +	} tab[1 << LOGF_TABLE_BITS]; +	double ln2; +	double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1.  */ +} __logf_data; + +#endif diff --git a/src/math/lrint.c b/src/math/lrint.c index bdca8b7c..ddee7a0d 100644 --- a/src/math/lrint.c +++ b/src/math/lrint.c @@ -1,5 +1,6 @@  #include <limits.h>  #include <fenv.h> +#include <math.h>  #include "libm.h"  /* @@ -26,7 +27,18 @@ as a double.  
*/  #if LONG_MAX < 1U<<53 && defined(FE_INEXACT) -long lrint(double x) +#include <float.h> +#include <stdint.h> +#if FLT_EVAL_METHOD==0 || FLT_EVAL_METHOD==1 +#define EPS DBL_EPSILON +#elif FLT_EVAL_METHOD==2 +#define EPS LDBL_EPSILON +#endif +#ifdef __GNUC__ +/* avoid stack frame in lrint */ +__attribute__((noinline)) +#endif +static long lrint_slow(double x)  {  	#pragma STDC FENV_ACCESS ON  	int e; @@ -38,6 +50,20 @@ long lrint(double x)  	/* conversion */  	return x;  } + +long lrint(double x) +{ +	uint32_t abstop = asuint64(x)>>32 & 0x7fffffff; +	uint64_t sign = asuint64(x) & (1ULL << 63); + +	if (abstop < 0x41dfffff) { +		/* |x| < 0x7ffffc00, no overflow */ +		double_t toint = asdouble(asuint64(1/EPS) | sign); +		double_t y = x + toint - toint; +		return (long)y; +	} +	return lrint_slow(x); +}  #else  long lrint(double x)  { diff --git a/src/math/m68k/sqrtl.c b/src/math/m68k/sqrtl.c new file mode 100644 index 00000000..b1c303c7 --- /dev/null +++ b/src/math/m68k/sqrtl.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __HAVE_68881__ + +long double sqrtl(long double x) +{ +	__asm__ ("fsqrt.x %1,%0" : "=f"(x) : "fm"(x)); +	return x; +} + +#else + +#include "../sqrtl.c" + +#endif diff --git a/src/math/mips/fabs.c b/src/math/mips/fabs.c new file mode 100644 index 00000000..0a5aa3b1 --- /dev/null +++ b/src/math/mips/fabs.c @@ -0,0 +1,16 @@ +#if !defined(__mips_soft_float) && defined(__mips_abs2008) + +#include <math.h> + +double fabs(double x) +{ +	double r; +	__asm__("abs.d %0,%1" : "=f"(r) : "f"(x)); +	return r; +} + +#else + +#include "../fabs.c" + +#endif diff --git a/src/math/mips/fabsf.c b/src/math/mips/fabsf.c new file mode 100644 index 00000000..35307be6 --- /dev/null +++ b/src/math/mips/fabsf.c @@ -0,0 +1,16 @@ +#if !defined(__mips_soft_float) && defined(__mips_abs2008) + +#include <math.h> + +float fabsf(float x) +{ +	float r; +	__asm__("abs.s %0,%1" : "=f"(r) : "f"(x)); +	return r; +} + +#else + +#include "../fabsf.c" + +#endif diff --git 
a/src/math/mips/sqrt.c b/src/math/mips/sqrt.c new file mode 100644 index 00000000..595c9dbc --- /dev/null +++ b/src/math/mips/sqrt.c @@ -0,0 +1,16 @@ +#if !defined(__mips_soft_float) && __mips >= 3 + +#include <math.h> + +double sqrt(double x) +{ +	double r; +	__asm__("sqrt.d %0,%1" : "=f"(r) : "f"(x)); +	return r; +} + +#else + +#include "../sqrt.c" + +#endif diff --git a/src/math/mips/sqrtf.c b/src/math/mips/sqrtf.c new file mode 100644 index 00000000..84090d2d --- /dev/null +++ b/src/math/mips/sqrtf.c @@ -0,0 +1,16 @@ +#if !defined(__mips_soft_float) && __mips >= 2 + +#include <math.h> + +float sqrtf(float x) +{ +	float r; +	__asm__("sqrt.s %0,%1" : "=f"(r) : "f"(x)); +	return r; +} + +#else + +#include "../sqrtf.c" + +#endif diff --git a/src/math/pow.c b/src/math/pow.c index 3ddc1b6f..694c2ef6 100644 --- a/src/math/pow.c +++ b/src/math/pow.c @@ -1,328 +1,343 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_pow.c */  /* - * ==================================================== - * Copyright (C) 2004 by Sun Microsystems, Inc. All rights reserved. + * Double-precision x^y function.   * - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ -/* pow(x,y) return x**y - * - *                    n - * Method:  Let x =  2   * (1+f) - *      1. Compute and return log2(x) in two pieces: - *              log2(x) = w1 + w2, - *         where w1 has 53-24 = 29 bit trailing zeros. - *      2. Perform y*log2(x) = n+y' by simulating muti-precision - *         arithmetic, where |y'|<=0.5. - *      3. Return x**y = 2**n*exp(y'*log2) - * - * Special cases: - *      1.  (anything) ** 0  is 1 - *      2.  1 ** (anything)  is 1 - *      3.  (anything except 1) ** NAN is NAN - *      4.  NAN ** (anything except 0) is NAN - *      5.  +-(|x| > 1) **  +INF is +INF - *      6.  +-(|x| > 1) **  -INF is +0 - *      7.  
+-(|x| < 1) **  +INF is +0 - *      8.  +-(|x| < 1) **  -INF is +INF - *      9.  -1          ** +-INF is 1 - *      10. +0 ** (+anything except 0, NAN)               is +0 - *      11. -0 ** (+anything except 0, NAN, odd integer)  is +0 - *      12. +0 ** (-anything except 0, NAN)               is +INF, raise divbyzero - *      13. -0 ** (-anything except 0, NAN, odd integer)  is +INF, raise divbyzero - *      14. -0 ** (+odd integer) is -0 - *      15. -0 ** (-odd integer) is -INF, raise divbyzero - *      16. +INF ** (+anything except 0,NAN) is +INF - *      17. +INF ** (-anything except 0,NAN) is +0 - *      18. -INF ** (+odd integer) is -INF - *      19. -INF ** (anything) = -0 ** (-anything), (anything except odd integer) - *      20. (anything) ** 1 is (anything) - *      21. (anything) ** -1 is 1/(anything) - *      22. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer) - *      23. (-anything except 0 and inf) ** (non-integer) is NAN - * - * Accuracy: - *      pow(x,y) returns x**y nearly rounded. In particular - *                      pow(integer,integer) - *      always returns the correct integer provided it is - *      representable. - * - * Constants : - * The hexadecimal values are the intended ones for the following - * constants. The decimal values may be used, provided that the - * compiler will convert from decimal to binary accurately enough - * to produce the hexadecimal values shown. + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT   */ +#include <math.h> +#include <stdint.h>  #include "libm.h" +#include "exp_data.h" +#include "pow_data.h" -static const double -bp[]   = {1.0, 1.5,}, -dp_h[] = { 0.0, 5.84962487220764160156e-01,}, /* 0x3FE2B803, 0x40000000 */ -dp_l[] = { 0.0, 1.35003920212974897128e-08,}, /* 0x3E4CFDEB, 0x43CFD006 */ -two53  =  9007199254740992.0, /* 0x43400000, 0x00000000 */ -huge   =  1.0e300, -tiny   =  1.0e-300, -/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */ -L1 =  5.99999999999994648725e-01, /* 0x3FE33333, 0x33333303 */ -L2 =  4.28571428578550184252e-01, /* 0x3FDB6DB6, 0xDB6FABFF */ -L3 =  3.33333329818377432918e-01, /* 0x3FD55555, 0x518F264D */ -L4 =  2.72728123808534006489e-01, /* 0x3FD17460, 0xA91D4101 */ -L5 =  2.30660745775561754067e-01, /* 0x3FCD864A, 0x93C9DB65 */ -L6 =  2.06975017800338417784e-01, /* 0x3FCA7E28, 0x4A454EEF */ -P1 =  1.66666666666666019037e-01, /* 0x3FC55555, 0x5555553E */ -P2 = -2.77777777770155933842e-03, /* 0xBF66C16C, 0x16BEBD93 */ -P3 =  6.61375632143793436117e-05, /* 0x3F11566A, 0xAF25DE2C */ -P4 = -1.65339022054652515390e-06, /* 0xBEBBBD41, 0xC5D26BF1 */ -P5 =  4.13813679705723846039e-08, /* 0x3E663769, 0x72BEA4D0 */ -lg2     =  6.93147180559945286227e-01, /* 0x3FE62E42, 0xFEFA39EF */ -lg2_h   =  6.93147182464599609375e-01, /* 0x3FE62E43, 0x00000000 */ -lg2_l   = -1.90465429995776804525e-09, /* 0xBE205C61, 0x0CA86C39 */ -ovt     =  8.0085662595372944372e-017, /* -(1024-log2(ovfl+.5ulp)) */ -cp      =  9.61796693925975554329e-01, /* 0x3FEEC709, 0xDC3A03FD =2/(3ln2) */ -cp_h    =  9.61796700954437255859e-01, /* 0x3FEEC709, 0xE0000000 =(float)cp */ -cp_l    = -7.02846165095275826516e-09, /* 0xBE3E2FE0, 0x145B01F5 =tail of cp_h*/ -ivln2   =  1.44269504088896338700e+00, /* 0x3FF71547, 0x652B82FE =1/ln2 */ -ivln2_h =  1.44269502162933349609e+00, /* 0x3FF71547, 0x60000000 =24b 1/ln2*/ -ivln2_l =  1.92596299112661746887e-08; /* 0x3E54AE0B, 0xF85DDF44 =1/ln2 tail*/ +/* +Worst-case error: 0.54 ULP (~= ulperr_exp + 
1024*Ln2*relerr_log*2^53) +relerr_log: 1.3 * 2^-68 (Relative error of log, 1.5 * 2^-68 without fma) +ulperr_exp: 0.509 ULP (ULP error of exp, 0.511 ULP without fma) +*/ -double pow(double x, double y) +#define T __pow_log_data.tab +#define A __pow_log_data.poly +#define Ln2hi __pow_log_data.ln2hi +#define Ln2lo __pow_log_data.ln2lo +#define N (1 << POW_LOG_TABLE_BITS) +#define OFF 0x3fe6955500000000 + +/* Top 12 bits of a double (sign and exponent bits).  */ +static inline uint32_t top12(double x)  { -	double z,ax,z_h,z_l,p_h,p_l; -	double y1,t1,t2,r,s,t,u,v,w; -	int32_t i,j,k,yisint,n; -	int32_t hx,hy,ix,iy; -	uint32_t lx,ly; +	return asuint64(x) >> 52; +} -	EXTRACT_WORDS(hx, lx, x); -	EXTRACT_WORDS(hy, ly, y); -	ix = hx & 0x7fffffff; -	iy = hy & 0x7fffffff; +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about +   additional 15 bits precision.  IX is the bit representation of x, but +   normalized in the subnormal range using the sign bit for the exponent.  */ +static inline double_t log_inline(uint64_t ix, double_t *tail) +{ +	/* double_t for better performance on targets with FLT_EVAL_METHOD==2.  */ +	double_t z, r, y, invc, logc, logctail, kd, hi, t1, t2, lo, lo1, lo2, p; +	uint64_t iz, tmp; +	int k, i; -	/* x**0 = 1, even if x is NaN */ -	if ((iy|ly) == 0) -		return 1.0; -	/* 1**y = 1, even if y is NaN */ -	if (hx == 0x3ff00000 && lx == 0) -		return 1.0; -	/* NaN if either arg is NaN */ -	if (ix > 0x7ff00000 || (ix == 0x7ff00000 && lx != 0) || -	    iy > 0x7ff00000 || (iy == 0x7ff00000 && ly != 0)) -		return x + y; +	/* x = 2^k z; where z is in range [OFF,2*OFF) and exact. +	   The range is split into N subintervals. +	   The ith subinterval contains z and c is near its center.  */ +	tmp = ix - OFF; +	i = (tmp >> (52 - POW_LOG_TABLE_BITS)) % N; +	k = (int64_t)tmp >> 52; /* arithmetic shift */ +	iz = ix - (tmp & 0xfffULL << 52); +	z = asdouble(iz); +	kd = (double_t)k; -	/* determine if y is an odd int when x < 0 -	 * yisint = 0       ... 
y is not an integer -	 * yisint = 1       ... y is an odd int -	 * yisint = 2       ... y is an even int -	 */ -	yisint = 0; -	if (hx < 0) { -		if (iy >= 0x43400000) -			yisint = 2; /* even integer y */ -		else if (iy >= 0x3ff00000) { -			k = (iy>>20) - 0x3ff;  /* exponent */ -			if (k > 20) { -				uint32_t j = ly>>(52-k); -				if ((j<<(52-k)) == ly) -					yisint = 2 - (j&1); -			} else if (ly == 0) { -				uint32_t j = iy>>(20-k); -				if ((j<<(20-k)) == iy) -					yisint = 2 - (j&1); -			} -		} -	} +	/* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */ +	invc = T[i].invc; +	logc = T[i].logc; +	logctail = T[i].logctail; -	/* special value of y */ -	if (ly == 0) { -		if (iy == 0x7ff00000) {  /* y is +-inf */ -			if (((ix-0x3ff00000)|lx) == 0)  /* (-1)**+-inf is 1 */ -				return 1.0; -			else if (ix >= 0x3ff00000) /* (|x|>1)**+-inf = inf,0 */ -				return hy >= 0 ? y : 0.0; -			else                       /* (|x|<1)**+-inf = 0,inf */ -				return hy >= 0 ? 0.0 : -y; -		} -		if (iy == 0x3ff00000) {    /* y is +-1 */ -			if (hy >= 0) -				return x; -			y = 1/x; -#if FLT_EVAL_METHOD!=0 -			{ -				union {double f; uint64_t i;} u = {y}; -				uint64_t i = u.i & -1ULL/2; -				if (i>>52 == 0 && (i&(i-1))) -					FORCE_EVAL((float)y); -			} +	/* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and +     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible.  */ +#if __FP_FAST_FMA +	r = __builtin_fma(z, invc, -1.0); +#else +	/* Split z such that rhi, rlo and rhi*rhi are exact and |rlo| <= |r|.  */ +	double_t zhi = asdouble((iz + (1ULL << 31)) & (-1ULL << 32)); +	double_t zlo = z - zhi; +	double_t rhi = zhi * invc - 1.0; +	double_t rlo = zlo * invc; +	r = rhi + rlo;  #endif -			return y; -		} -		if (hy == 0x40000000)    /* y is 2 */ -			return x*x; -		if (hy == 0x3fe00000) {  /* y is 0.5 */ -			if (hx >= 0)     /* x >= +0 */ -				return sqrt(x); -		} + +	/* k*Ln2 + log(c) + r.  
*/ +	t1 = kd * Ln2hi + logc; +	t2 = t1 + r; +	lo1 = kd * Ln2lo + logctail; +	lo2 = t1 - t2 + r; + +	/* Evaluation is optimized assuming superscalar pipelined execution.  */ +	double_t ar, ar2, ar3, lo3, lo4; +	ar = A[0] * r; /* A[0] = -0.5.  */ +	ar2 = r * ar; +	ar3 = r * ar2; +	/* k*Ln2 + log(c) + r + A[0]*r*r.  */ +#if __FP_FAST_FMA +	hi = t2 + ar2; +	lo3 = __builtin_fma(ar, r, -ar2); +	lo4 = t2 - hi + ar2; +#else +	double_t arhi = A[0] * rhi; +	double_t arhi2 = rhi * arhi; +	hi = t2 + arhi2; +	lo3 = rlo * (ar + arhi); +	lo4 = t2 - hi + arhi2; +#endif +	/* p = log1p(r) - r - A[0]*r*r.  */ +	p = (ar3 * (A[1] + r * A[2] + +		    ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * A[6])))); +	lo = lo1 + lo2 + lo3 + lo4 + p; +	y = hi + lo; +	*tail = hi - y + lo; +	return y; +} + +#undef N +#undef T +#define N (1 << EXP_TABLE_BITS) +#define InvLn2N __exp_data.invln2N +#define NegLn2hiN __exp_data.negln2hiN +#define NegLn2loN __exp_data.negln2loN +#define Shift __exp_data.shift +#define T __exp_data.tab +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER] +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER] +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER] +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER] +#define C6 __exp_data.poly[9 - EXP_POLY_ORDER] + +/* Handle cases that may overflow or underflow when computing the result that +   is scale*(1+TMP) without intermediate rounding.  The bit representation of +   scale is in SBITS, however it has a computed exponent that may have +   overflown into the sign bit so that needs to be adjusted before using it as +   a double.  (int32_t)KI is the k used in the argument reduction and exponent +   adjustment of scale, positive k here means the result may overflow and +   negative k means the result may underflow.  */ +static inline double specialcase(double_t tmp, uint64_t sbits, uint64_t ki) +{ +	double_t scale, y; + +	if ((ki & 0x80000000) == 0) { +		/* k > 0, the exponent of scale might have overflowed by <= 460.  
*/ +		sbits -= 1009ull << 52; +		scale = asdouble(sbits); +		y = 0x1p1009 * (scale + scale * tmp); +		return eval_as_double(y); +	} +	/* k < 0, need special care in the subnormal range.  */ +	sbits += 1022ull << 52; +	/* Note: sbits is signed scale.  */ +	scale = asdouble(sbits); +	y = scale + scale * tmp; +	if (fabs(y) < 1.0) { +		/* Round y to the right precision before scaling it into the subnormal +		   range to avoid double rounding that can cause 0.5+E/2 ulp error where +		   E is the worst-case ulp error outside the subnormal range.  So this +		   is only useful if the goal is better than 1 ulp worst-case error.  */ +		double_t hi, lo, one = 1.0; +		if (y < 0.0) +			one = -1.0; +		lo = scale - y + scale * tmp; +		hi = one + y; +		lo = one - hi + y + lo; +		y = eval_as_double(hi + lo) - one; +		/* Fix the sign of 0.  */ +		if (y == 0.0) +			y = asdouble(sbits & 0x8000000000000000); +		/* The underflow exception needs to be signaled explicitly.  */ +		fp_force_eval(fp_barrier(0x1p-1022) * 0x1p-1022);  	} +	y = 0x1p-1022 * y; +	return eval_as_double(y); +} -	ax = fabs(x); -	/* special value of x */ -	if (lx == 0) { -		if (ix == 0x7ff00000 || ix == 0 || ix == 0x3ff00000) { /* x is +-0,+-inf,+-1 */ -			z = ax; -			if (hy < 0)   /* z = (1/|x|) */ -				z = 1.0/z; -			if (hx < 0) { -				if (((ix-0x3ff00000)|yisint) == 0) { -					z = (z-z)/(z-z); /* (-1)**non-int is NaN */ -				} else if (yisint == 1) -					z = -z;          /* (x<0)**odd = -(|x|**odd) */ -			} -			return z; +#define SIGN_BIAS (0x800 << EXP_TABLE_BITS) + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. +   The sign_bias argument is SIGN_BIAS or 0 and sets the sign to -1 or 1.  */ +static inline double exp_inline(double_t x, double_t xtail, uint32_t sign_bias) +{ +	uint32_t abstop; +	uint64_t ki, idx, top, sbits; +	/* double_t for better performance on targets with FLT_EVAL_METHOD==2.  
*/ +	double_t kd, z, r, r2, scale, tail, tmp; + +	abstop = top12(x) & 0x7ff; +	if (predict_false(abstop - top12(0x1p-54) >= +			  top12(512.0) - top12(0x1p-54))) { +		if (abstop - top12(0x1p-54) >= 0x80000000) { +			/* Avoid spurious underflow for tiny x.  */ +			/* Note: 0 is common input.  */ +			double_t one = WANT_ROUNDING ? 1.0 + x : 1.0; +			return sign_bias ? -one : one; +		} +		if (abstop >= top12(1024.0)) { +			/* Note: inf and nan are already handled.  */ +			if (asuint64(x) >> 63) +				return __math_uflow(sign_bias); +			else +				return __math_oflow(sign_bias);  		} +		/* Large x is special cased below.  */ +		abstop = 0;  	} -	s = 1.0; /* sign of result */ -	if (hx < 0) { -		if (yisint == 0) /* (x<0)**(non-int) is NaN */ -			return (x-x)/(x-x); -		if (yisint == 1) /* (x<0)**(odd int) */ -			s = -1.0; -	} +	/* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */ +	/* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */ +	z = InvLn2N * x; +#if TOINT_INTRINSICS +	kd = roundtoint(z); +	ki = converttoint(z); +#elif EXP_USE_TOINT_NARROW +	/* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes.  */ +	kd = eval_as_double(z + Shift); +	ki = asuint64(kd) >> 16; +	kd = (double_t)(int32_t)ki; +#else +	/* z - kd is in [-1, 1] in non-nearest rounding modes.  */ +	kd = eval_as_double(z + Shift); +	ki = asuint64(kd); +	kd -= Shift; +#endif +	r = x + kd * NegLn2hiN + kd * NegLn2loN; +	/* The code assumes 2^-200 < |xtail| < 2^-8/N.  */ +	r += xtail; +	/* 2^(k/N) ~= scale * (1 + tail).  */ +	idx = 2 * (ki % N); +	top = (ki + sign_bias) << (52 - EXP_TABLE_BITS); +	tail = asdouble(T[idx]); +	/* This is only a valid scale when -1023*N < k < 1024*N.  */ +	sbits = T[idx + 1] + top; +	/* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1).  */ +	/* Evaluation is optimized assuming superscalar pipelined execution.  */ +	r2 = r * r; +	/* Without fma the worst case error is 0.25/N ulp larger.  
*/ +	/* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp.  */ +	tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); +	if (predict_false(abstop == 0)) +		return specialcase(tmp, sbits, ki); +	scale = asdouble(sbits); +	/* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there +	   is no spurious underflow here even without fma.  */ +	return eval_as_double(scale + scale * tmp); +} -	/* |y| is huge */ -	if (iy > 0x41e00000) { /* if |y| > 2**31 */ -		if (iy > 0x43f00000) {  /* if |y| > 2**64, must o/uflow */ -			if (ix <= 0x3fefffff) -				return hy < 0 ? huge*huge : tiny*tiny; -			if (ix >= 0x3ff00000) -				return hy > 0 ? huge*huge : tiny*tiny; +/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is +   the bit representation of a non-zero finite floating-point value.  */ +static inline int checkint(uint64_t iy) +{ +	int e = iy >> 52 & 0x7ff; +	if (e < 0x3ff) +		return 0; +	if (e > 0x3ff + 52) +		return 2; +	if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) +		return 0; +	if (iy & (1ULL << (0x3ff + 52 - e))) +		return 1; +	return 2; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan.  */ +static inline int zeroinfnan(uint64_t i) +{ +	return 2 * i - 1 >= 2 * asuint64(INFINITY) - 1; +} + +double pow(double x, double y) +{ +	uint32_t sign_bias = 0; +	uint64_t ix, iy; +	uint32_t topx, topy; + +	ix = asuint64(x); +	iy = asuint64(y); +	topx = top12(x); +	topy = top12(y); +	if (predict_false(topx - 0x001 >= 0x7ff - 0x001 || +			  (topy & 0x7ff) - 0x3be >= 0x43e - 0x3be)) { +		/* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 +		   and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1.  */ +		/* Special cases: (x < 0x1p-126 or inf or nan) or +		   (|y| < 0x1p-65 or |y| >= 0x1p63 or nan).  */ +		if (predict_false(zeroinfnan(iy))) { +			if (2 * iy == 0) +				return issignaling_inline(x) ? x + y : 1.0; +			if (ix == asuint64(1.0)) +				return issignaling_inline(y) ? 
x + y : 1.0; +			if (2 * ix > 2 * asuint64(INFINITY) || +			    2 * iy > 2 * asuint64(INFINITY)) +				return x + y; +			if (2 * ix == 2 * asuint64(1.0)) +				return 1.0; +			if ((2 * ix < 2 * asuint64(1.0)) == !(iy >> 63)) +				return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */ +			return y * y;  		} -		/* over/underflow if x is not close to one */ -		if (ix < 0x3fefffff) -			return hy < 0 ? s*huge*huge : s*tiny*tiny; -		if (ix > 0x3ff00000) -			return hy > 0 ? s*huge*huge : s*tiny*tiny; -		/* now |1-x| is tiny <= 2**-20, suffice to compute -		   log(x) by x-x^2/2+x^3/3-x^4/4 */ -		t = ax - 1.0;       /* t has 20 trailing zeros */ -		w = (t*t)*(0.5 - t*(0.3333333333333333333333-t*0.25)); -		u = ivln2_h*t;      /* ivln2_h has 21 sig. bits */ -		v = t*ivln2_l - w*ivln2; -		t1 = u + v; -		SET_LOW_WORD(t1, 0); -		t2 = v - (t1-u); -	} else { -		double ss,s2,s_h,s_l,t_h,t_l; -		n = 0; -		/* take care subnormal number */ -		if (ix < 0x00100000) { -			ax *= two53; -			n -= 53; -			GET_HIGH_WORD(ix,ax); +		if (predict_false(zeroinfnan(ix))) { +			double_t x2 = x * x; +			if (ix >> 63 && checkint(iy) == 1) +				x2 = -x2; +			/* Without the barrier some versions of clang hoist the 1/x2 and +			   thus division by zero exception can be signaled spuriously.  */ +			return iy >> 63 ? fp_barrier(1 / x2) : x2;  		} -		n += ((ix)>>20) - 0x3ff; -		j = ix & 0x000fffff; -		/* determine interval */ -		ix = j | 0x3ff00000;   /* normalize ix */ -		if (j <= 0x3988E)      /* |x|<sqrt(3/2) */ -			k = 0; -		else if (j < 0xBB67A)  /* |x|<sqrt(3)   */ -			k = 1; -		else { -			k = 0; -			n += 1; -			ix -= 0x00100000; +		/* Here x and y are non-zero finite.  */ +		if (ix >> 63) { +			/* Finite x < 0.  */ +			int yint = checkint(iy); +			if (yint == 0) +				return __math_invalid(x); +			if (yint == 1) +				sign_bias = SIGN_BIAS; +			ix &= 0x7fffffffffffffff; +			topx &= 0x7ff; +		} +		if ((topy & 0x7ff) - 0x3be >= 0x43e - 0x3be) { +			/* Note: sign_bias == 0 here because y is not odd.  
*/ +			if (ix == asuint64(1.0)) +				return 1.0; +			if ((topy & 0x7ff) < 0x3be) { +				/* |y| < 2^-65, x^y ~= 1 + y*log(x).  */ +				if (WANT_ROUNDING) +					return ix > asuint64(1.0) ? 1.0 + y : +								    1.0 - y; +				else +					return 1.0; +			} +			return (ix > asuint64(1.0)) == (topy < 0x800) ? +				       __math_oflow(0) : +				       __math_uflow(0); +		} +		if (topx == 0) { +			/* Normalize subnormal x so exponent becomes negative.  */ +			ix = asuint64(x * 0x1p52); +			ix &= 0x7fffffffffffffff; +			ix -= 52ULL << 52;  		} -		SET_HIGH_WORD(ax, ix); - -		/* compute ss = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */ -		u = ax - bp[k];        /* bp[0]=1.0, bp[1]=1.5 */ -		v = 1.0/(ax+bp[k]); -		ss = u*v; -		s_h = ss; -		SET_LOW_WORD(s_h, 0); -		/* t_h=ax+bp[k] High */ -		t_h = 0.0; -		SET_HIGH_WORD(t_h, ((ix>>1)|0x20000000) + 0x00080000 + (k<<18)); -		t_l = ax - (t_h-bp[k]); -		s_l = v*((u-s_h*t_h)-s_h*t_l); -		/* compute log(ax) */ -		s2 = ss*ss; -		r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6))))); -		r += s_l*(s_h+ss); -		s2 = s_h*s_h; -		t_h = 3.0 + s2 + r; -		SET_LOW_WORD(t_h, 0); -		t_l = r - ((t_h-3.0)-s2); -		/* u+v = ss*(1+...) */ -		u = s_h*t_h; -		v = s_l*t_h + t_l*ss; -		/* 2/(3log2)*(ss+...) 
*/ -		p_h = u + v; -		SET_LOW_WORD(p_h, 0); -		p_l = v - (p_h-u); -		z_h = cp_h*p_h;        /* cp_h+cp_l = 2/(3*log2) */ -		z_l = cp_l*p_h+p_l*cp + dp_l[k]; -		/* log2(ax) = (ss+..)*2/(3*log2) = n + dp_h + z_h + z_l */ -		t = (double)n; -		t1 = ((z_h + z_l) + dp_h[k]) + t; -		SET_LOW_WORD(t1, 0); -		t2 = z_l - (((t1 - t) - dp_h[k]) - z_h);  	} -	/* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */ -	y1 = y; -	SET_LOW_WORD(y1, 0); -	p_l = (y-y1)*t1 + y*t2; -	p_h = y1*t1; -	z = p_l + p_h; -	EXTRACT_WORDS(j, i, z); -	if (j >= 0x40900000) {                      /* z >= 1024 */ -		if (((j-0x40900000)|i) != 0)        /* if z > 1024 */ -			return s*huge*huge;         /* overflow */ -		if (p_l + ovt > z - p_h) -			return s*huge*huge;         /* overflow */ -	} else if ((j&0x7fffffff) >= 0x4090cc00) {  /* z <= -1075 */  // FIXME: instead of abs(j) use unsigned j -		if (((j-0xc090cc00)|i) != 0)        /* z < -1075 */ -			return s*tiny*tiny;         /* underflow */ -		if (p_l <= z - p_h) -			return s*tiny*tiny;         /* underflow */ -	} -	/* -	 * compute 2**(p_h+p_l) -	 */ -	i = j & 0x7fffffff; -	k = (i>>20) - 0x3ff; -	n = 0; -	if (i > 0x3fe00000) {  /* if |z| > 0.5, set n = [z+0.5] */ -		n = j + (0x00100000>>(k+1)); -		k = ((n&0x7fffffff)>>20) - 0x3ff;  /* new k for n */ -		t = 0.0; -		SET_HIGH_WORD(t, n & ~(0x000fffff>>k)); -		n = ((n&0x000fffff)|0x00100000)>>(20-k); -		if (j < 0) -			n = -n; -		p_h -= t; -	} -	t = p_l + p_h; -	SET_LOW_WORD(t, 0); -	u = t*lg2_h; -	v = (p_l-(t-p_h))*lg2 + t*lg2_l; -	z = u + v; -	w = v - (z-u); -	t = z*z; -	t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))); -	r = (z*t1)/(t1-2.0) - (w + z*w); -	z = 1.0 - (r-z); -	GET_HIGH_WORD(j, z); -	j += n<<20; -	if ((j>>20) <= 0)  /* subnormal output */ -		z = scalbn(z,n); -	else -		SET_HIGH_WORD(z, j); -	return s*z; +	double_t lo; +	double_t hi = log_inline(ix, &lo); +	double_t ehi, elo; +#if __FP_FAST_FMA +	ehi = y * hi; +	elo = y * lo + __builtin_fma(y, hi, -ehi); +#else +	double_t yhi = asdouble(iy & 
-1ULL << 27); +	double_t ylo = y - yhi; +	double_t lhi = asdouble(asuint64(hi) & -1ULL << 27); +	double_t llo = hi - lhi + lo; +	ehi = yhi * lhi; +	elo = ylo * lhi + y * llo; /* |elo| < |ehi| * 2^-25.  */ +#endif +	return exp_inline(ehi, elo, sign_bias);  } diff --git a/src/math/pow_data.c b/src/math/pow_data.c new file mode 100644 index 00000000..81e760de --- /dev/null +++ b/src/math/pow_data.c @@ -0,0 +1,180 @@ +/* + * Data for the log part of pow. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "pow_data.h" + +#define N (1 << POW_LOG_TABLE_BITS) + +const struct pow_log_data __pow_log_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.poly = { +// relative error: 0x1.11922ap-70 +// in -0x1.6bp-8 0x1.6bp-8 +// Coefficients are scaled to match the scaling during evaluation. +-0x1p-1, +0x1.555555555556p-2 * -2, +-0x1.0000000000006p-2 * -2, +0x1.999999959554ep-3 * 4, +-0x1.555555529a47ap-3 * 4, +0x1.2495b9b4845e9p-3 * -8, +-0x1.0002b8b263fc3p-3 * -8, +}, +/* Algorithm: + +	x = 2^k z +	log(x) = k ln2 + log(c) + log(z/c) +	log(z/c) = poly(z/c - 1) + +where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals +and z falls into the ith one, then table entries are computed as + +	tab[i].invc = 1/c +	tab[i].logc = round(0x1p43*log(c))/0x1p43 +	tab[i].logctail = (double)(log(c) - logc) + +where c is chosen near the center of the subinterval such that 1/c has only a +few precision bits so z/c - 1 is exactly representible as double: + +	1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2 + +Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < 0x1p-97, +the last few bits of logc are rounded away so k*ln2hi + logc has no rounding +error and the interval for z is selected such that near x == 1, where log(x) +is tiny, large cancellation error is avoided in logc + poly(z/c - 1).  
*/ +.tab = { +#define A(a, b, c) {a, 0, b, c}, +A(0x1.6a00000000000p+0, -0x1.62c82f2b9c800p-2, 0x1.ab42428375680p-48) +A(0x1.6800000000000p+0, -0x1.5d1bdbf580800p-2, -0x1.ca508d8e0f720p-46) +A(0x1.6600000000000p+0, -0x1.5767717455800p-2, -0x1.362a4d5b6506dp-45) +A(0x1.6400000000000p+0, -0x1.51aad872df800p-2, -0x1.684e49eb067d5p-49) +A(0x1.6200000000000p+0, -0x1.4be5f95777800p-2, -0x1.41b6993293ee0p-47) +A(0x1.6000000000000p+0, -0x1.4618bc21c6000p-2, 0x1.3d82f484c84ccp-46) +A(0x1.5e00000000000p+0, -0x1.404308686a800p-2, 0x1.c42f3ed820b3ap-50) +A(0x1.5c00000000000p+0, -0x1.3a64c55694800p-2, 0x1.0b1c686519460p-45) +A(0x1.5a00000000000p+0, -0x1.347dd9a988000p-2, 0x1.5594dd4c58092p-45) +A(0x1.5800000000000p+0, -0x1.2e8e2bae12000p-2, 0x1.67b1e99b72bd8p-45) +A(0x1.5600000000000p+0, -0x1.2895a13de8800p-2, 0x1.5ca14b6cfb03fp-46) +A(0x1.5600000000000p+0, -0x1.2895a13de8800p-2, 0x1.5ca14b6cfb03fp-46) +A(0x1.5400000000000p+0, -0x1.22941fbcf7800p-2, -0x1.65a242853da76p-46) +A(0x1.5200000000000p+0, -0x1.1c898c1699800p-2, -0x1.fafbc68e75404p-46) +A(0x1.5000000000000p+0, -0x1.1675cababa800p-2, 0x1.f1fc63382a8f0p-46) +A(0x1.4e00000000000p+0, -0x1.1058bf9ae4800p-2, -0x1.6a8c4fd055a66p-45) +A(0x1.4c00000000000p+0, -0x1.0a324e2739000p-2, -0x1.c6bee7ef4030ep-47) +A(0x1.4a00000000000p+0, -0x1.0402594b4d000p-2, -0x1.036b89ef42d7fp-48) +A(0x1.4a00000000000p+0, -0x1.0402594b4d000p-2, -0x1.036b89ef42d7fp-48) +A(0x1.4800000000000p+0, -0x1.fb9186d5e4000p-3, 0x1.d572aab993c87p-47) +A(0x1.4600000000000p+0, -0x1.ef0adcbdc6000p-3, 0x1.b26b79c86af24p-45) +A(0x1.4400000000000p+0, -0x1.e27076e2af000p-3, -0x1.72f4f543fff10p-46) +A(0x1.4200000000000p+0, -0x1.d5c216b4fc000p-3, 0x1.1ba91bbca681bp-45) +A(0x1.4000000000000p+0, -0x1.c8ff7c79aa000p-3, 0x1.7794f689f8434p-45) +A(0x1.4000000000000p+0, -0x1.c8ff7c79aa000p-3, 0x1.7794f689f8434p-45) +A(0x1.3e00000000000p+0, -0x1.bc286742d9000p-3, 0x1.94eb0318bb78fp-46) +A(0x1.3c00000000000p+0, -0x1.af3c94e80c000p-3, 0x1.a4e633fcd9066p-52) +A(0x1.3a00000000000p+0, 
-0x1.a23bc1fe2b000p-3, -0x1.58c64dc46c1eap-45) +A(0x1.3a00000000000p+0, -0x1.a23bc1fe2b000p-3, -0x1.58c64dc46c1eap-45) +A(0x1.3800000000000p+0, -0x1.9525a9cf45000p-3, -0x1.ad1d904c1d4e3p-45) +A(0x1.3600000000000p+0, -0x1.87fa06520d000p-3, 0x1.bbdbf7fdbfa09p-45) +A(0x1.3400000000000p+0, -0x1.7ab890210e000p-3, 0x1.bdb9072534a58p-45) +A(0x1.3400000000000p+0, -0x1.7ab890210e000p-3, 0x1.bdb9072534a58p-45) +A(0x1.3200000000000p+0, -0x1.6d60fe719d000p-3, -0x1.0e46aa3b2e266p-46) +A(0x1.3000000000000p+0, -0x1.5ff3070a79000p-3, -0x1.e9e439f105039p-46) +A(0x1.3000000000000p+0, -0x1.5ff3070a79000p-3, -0x1.e9e439f105039p-46) +A(0x1.2e00000000000p+0, -0x1.526e5e3a1b000p-3, -0x1.0de8b90075b8fp-45) +A(0x1.2c00000000000p+0, -0x1.44d2b6ccb8000p-3, 0x1.70cc16135783cp-46) +A(0x1.2c00000000000p+0, -0x1.44d2b6ccb8000p-3, 0x1.70cc16135783cp-46) +A(0x1.2a00000000000p+0, -0x1.371fc201e9000p-3, 0x1.178864d27543ap-48) +A(0x1.2800000000000p+0, -0x1.29552f81ff000p-3, -0x1.48d301771c408p-45) +A(0x1.2600000000000p+0, -0x1.1b72ad52f6000p-3, -0x1.e80a41811a396p-45) +A(0x1.2600000000000p+0, -0x1.1b72ad52f6000p-3, -0x1.e80a41811a396p-45) +A(0x1.2400000000000p+0, -0x1.0d77e7cd09000p-3, 0x1.a699688e85bf4p-47) +A(0x1.2400000000000p+0, -0x1.0d77e7cd09000p-3, 0x1.a699688e85bf4p-47) +A(0x1.2200000000000p+0, -0x1.fec9131dbe000p-4, -0x1.575545ca333f2p-45) +A(0x1.2000000000000p+0, -0x1.e27076e2b0000p-4, 0x1.a342c2af0003cp-45) +A(0x1.2000000000000p+0, -0x1.e27076e2b0000p-4, 0x1.a342c2af0003cp-45) +A(0x1.1e00000000000p+0, -0x1.c5e548f5bc000p-4, -0x1.d0c57585fbe06p-46) +A(0x1.1c00000000000p+0, -0x1.a926d3a4ae000p-4, 0x1.53935e85baac8p-45) +A(0x1.1c00000000000p+0, -0x1.a926d3a4ae000p-4, 0x1.53935e85baac8p-45) +A(0x1.1a00000000000p+0, -0x1.8c345d631a000p-4, 0x1.37c294d2f5668p-46) +A(0x1.1a00000000000p+0, -0x1.8c345d631a000p-4, 0x1.37c294d2f5668p-46) +A(0x1.1800000000000p+0, -0x1.6f0d28ae56000p-4, -0x1.69737c93373dap-45) +A(0x1.1600000000000p+0, -0x1.51b073f062000p-4, 0x1.f025b61c65e57p-46) 
+A(0x1.1600000000000p+0, -0x1.51b073f062000p-4, 0x1.f025b61c65e57p-46) +A(0x1.1400000000000p+0, -0x1.341d7961be000p-4, 0x1.c5edaccf913dfp-45) +A(0x1.1400000000000p+0, -0x1.341d7961be000p-4, 0x1.c5edaccf913dfp-45) +A(0x1.1200000000000p+0, -0x1.16536eea38000p-4, 0x1.47c5e768fa309p-46) +A(0x1.1000000000000p+0, -0x1.f0a30c0118000p-5, 0x1.d599e83368e91p-45) +A(0x1.1000000000000p+0, -0x1.f0a30c0118000p-5, 0x1.d599e83368e91p-45) +A(0x1.0e00000000000p+0, -0x1.b42dd71198000p-5, 0x1.c827ae5d6704cp-46) +A(0x1.0e00000000000p+0, -0x1.b42dd71198000p-5, 0x1.c827ae5d6704cp-46) +A(0x1.0c00000000000p+0, -0x1.77458f632c000p-5, -0x1.cfc4634f2a1eep-45) +A(0x1.0c00000000000p+0, -0x1.77458f632c000p-5, -0x1.cfc4634f2a1eep-45) +A(0x1.0a00000000000p+0, -0x1.39e87b9fec000p-5, 0x1.502b7f526feaap-48) +A(0x1.0a00000000000p+0, -0x1.39e87b9fec000p-5, 0x1.502b7f526feaap-48) +A(0x1.0800000000000p+0, -0x1.f829b0e780000p-6, -0x1.980267c7e09e4p-45) +A(0x1.0800000000000p+0, -0x1.f829b0e780000p-6, -0x1.980267c7e09e4p-45) +A(0x1.0600000000000p+0, -0x1.7b91b07d58000p-6, -0x1.88d5493faa639p-45) +A(0x1.0400000000000p+0, -0x1.fc0a8b0fc0000p-7, -0x1.f1e7cf6d3a69cp-50) +A(0x1.0400000000000p+0, -0x1.fc0a8b0fc0000p-7, -0x1.f1e7cf6d3a69cp-50) +A(0x1.0200000000000p+0, -0x1.fe02a6b100000p-8, -0x1.9e23f0dda40e4p-46) +A(0x1.0200000000000p+0, -0x1.fe02a6b100000p-8, -0x1.9e23f0dda40e4p-46) +A(0x1.0000000000000p+0, 0x0.0000000000000p+0, 0x0.0000000000000p+0) +A(0x1.0000000000000p+0, 0x0.0000000000000p+0, 0x0.0000000000000p+0) +A(0x1.fc00000000000p-1, 0x1.0101575890000p-7, -0x1.0c76b999d2be8p-46) +A(0x1.f800000000000p-1, 0x1.0205658938000p-6, -0x1.3dc5b06e2f7d2p-45) +A(0x1.f400000000000p-1, 0x1.8492528c90000p-6, -0x1.aa0ba325a0c34p-45) +A(0x1.f000000000000p-1, 0x1.0415d89e74000p-5, 0x1.111c05cf1d753p-47) +A(0x1.ec00000000000p-1, 0x1.466aed42e0000p-5, -0x1.c167375bdfd28p-45) +A(0x1.e800000000000p-1, 0x1.894aa149fc000p-5, -0x1.97995d05a267dp-46) +A(0x1.e400000000000p-1, 0x1.ccb73cdddc000p-5, -0x1.a68f247d82807p-46) 
+A(0x1.e200000000000p-1, 0x1.eea31c006c000p-5, -0x1.e113e4fc93b7bp-47) +A(0x1.de00000000000p-1, 0x1.1973bd1466000p-4, -0x1.5325d560d9e9bp-45) +A(0x1.da00000000000p-1, 0x1.3bdf5a7d1e000p-4, 0x1.cc85ea5db4ed7p-45) +A(0x1.d600000000000p-1, 0x1.5e95a4d97a000p-4, -0x1.c69063c5d1d1ep-45) +A(0x1.d400000000000p-1, 0x1.700d30aeac000p-4, 0x1.c1e8da99ded32p-49) +A(0x1.d000000000000p-1, 0x1.9335e5d594000p-4, 0x1.3115c3abd47dap-45) +A(0x1.cc00000000000p-1, 0x1.b6ac88dad6000p-4, -0x1.390802bf768e5p-46) +A(0x1.ca00000000000p-1, 0x1.c885801bc4000p-4, 0x1.646d1c65aacd3p-45) +A(0x1.c600000000000p-1, 0x1.ec739830a2000p-4, -0x1.dc068afe645e0p-45) +A(0x1.c400000000000p-1, 0x1.fe89139dbe000p-4, -0x1.534d64fa10afdp-45) +A(0x1.c000000000000p-1, 0x1.1178e8227e000p-3, 0x1.1ef78ce2d07f2p-45) +A(0x1.be00000000000p-1, 0x1.1aa2b7e23f000p-3, 0x1.ca78e44389934p-45) +A(0x1.ba00000000000p-1, 0x1.2d1610c868000p-3, 0x1.39d6ccb81b4a1p-47) +A(0x1.b800000000000p-1, 0x1.365fcb0159000p-3, 0x1.62fa8234b7289p-51) +A(0x1.b400000000000p-1, 0x1.4913d8333b000p-3, 0x1.5837954fdb678p-45) +A(0x1.b200000000000p-1, 0x1.527e5e4a1b000p-3, 0x1.633e8e5697dc7p-45) +A(0x1.ae00000000000p-1, 0x1.6574ebe8c1000p-3, 0x1.9cf8b2c3c2e78p-46) +A(0x1.ac00000000000p-1, 0x1.6f0128b757000p-3, -0x1.5118de59c21e1p-45) +A(0x1.aa00000000000p-1, 0x1.7898d85445000p-3, -0x1.c661070914305p-46) +A(0x1.a600000000000p-1, 0x1.8beafeb390000p-3, -0x1.73d54aae92cd1p-47) +A(0x1.a400000000000p-1, 0x1.95a5adcf70000p-3, 0x1.7f22858a0ff6fp-47) +A(0x1.a000000000000p-1, 0x1.a93ed3c8ae000p-3, -0x1.8724350562169p-45) +A(0x1.9e00000000000p-1, 0x1.b31d8575bd000p-3, -0x1.c358d4eace1aap-47) +A(0x1.9c00000000000p-1, 0x1.bd087383be000p-3, -0x1.d4bc4595412b6p-45) +A(0x1.9a00000000000p-1, 0x1.c6ffbc6f01000p-3, -0x1.1ec72c5962bd2p-48) +A(0x1.9600000000000p-1, 0x1.db13db0d49000p-3, -0x1.aff2af715b035p-45) +A(0x1.9400000000000p-1, 0x1.e530effe71000p-3, 0x1.212276041f430p-51) +A(0x1.9200000000000p-1, 0x1.ef5ade4dd0000p-3, -0x1.a211565bb8e11p-51) +A(0x1.9000000000000p-1, 
0x1.f991c6cb3b000p-3, 0x1.bcbecca0cdf30p-46) +A(0x1.8c00000000000p-1, 0x1.07138604d5800p-2, 0x1.89cdb16ed4e91p-48) +A(0x1.8a00000000000p-1, 0x1.0c42d67616000p-2, 0x1.7188b163ceae9p-45) +A(0x1.8800000000000p-1, 0x1.1178e8227e800p-2, -0x1.c210e63a5f01cp-45) +A(0x1.8600000000000p-1, 0x1.16b5ccbacf800p-2, 0x1.b9acdf7a51681p-45) +A(0x1.8400000000000p-1, 0x1.1bf99635a6800p-2, 0x1.ca6ed5147bdb7p-45) +A(0x1.8200000000000p-1, 0x1.214456d0eb800p-2, 0x1.a87deba46baeap-47) +A(0x1.7e00000000000p-1, 0x1.2bef07cdc9000p-2, 0x1.a9cfa4a5004f4p-45) +A(0x1.7c00000000000p-1, 0x1.314f1e1d36000p-2, -0x1.8e27ad3213cb8p-45) +A(0x1.7a00000000000p-1, 0x1.36b6776be1000p-2, 0x1.16ecdb0f177c8p-46) +A(0x1.7800000000000p-1, 0x1.3c25277333000p-2, 0x1.83b54b606bd5cp-46) +A(0x1.7600000000000p-1, 0x1.419b423d5e800p-2, 0x1.8e436ec90e09dp-47) +A(0x1.7400000000000p-1, 0x1.4718dc271c800p-2, -0x1.f27ce0967d675p-45) +A(0x1.7200000000000p-1, 0x1.4c9e09e173000p-2, -0x1.e20891b0ad8a4p-45) +A(0x1.7000000000000p-1, 0x1.522ae0738a000p-2, 0x1.ebe708164c759p-45) +A(0x1.6e00000000000p-1, 0x1.57bf753c8d000p-2, 0x1.fadedee5d40efp-46) +A(0x1.6c00000000000p-1, 0x1.5d5bddf596000p-2, -0x1.a0b2a08a465dcp-47) +}, +}; diff --git a/src/math/pow_data.h b/src/math/pow_data.h new file mode 100644 index 00000000..5d609ae8 --- /dev/null +++ b/src/math/pow_data.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _POW_DATA_H +#define _POW_DATA_H + +#include <features.h> + +#define POW_LOG_TABLE_BITS 7 +#define POW_LOG_POLY_ORDER 8 +extern hidden const struct pow_log_data { +	double ln2hi; +	double ln2lo; +	double poly[POW_LOG_POLY_ORDER - 1]; /* First coefficient is 1.  */ +	/* Note: the pad field is unused, but allows slightly faster indexing.  
*/ +	struct { +		double invc, pad, logc, logctail; +	} tab[1 << POW_LOG_TABLE_BITS]; +} __pow_log_data; + +#endif diff --git a/src/math/powerpc/fabs.c b/src/math/powerpc/fabs.c index f6ec4433..9453a3aa 100644 --- a/src/math/powerpc/fabs.c +++ b/src/math/powerpc/fabs.c @@ -1,6 +1,6 @@  #include <math.h> -#ifdef _SOFT_FLOAT +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM)  #include "../fabs.c" diff --git a/src/math/powerpc/fabsf.c b/src/math/powerpc/fabsf.c index d88b5911..2e9da588 100644 --- a/src/math/powerpc/fabsf.c +++ b/src/math/powerpc/fabsf.c @@ -1,6 +1,6 @@  #include <math.h> -#ifdef _SOFT_FLOAT +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)  #include "../fabsf.c" diff --git a/src/math/powerpc/fma.c b/src/math/powerpc/fma.c index fd268f5f..0eb2ba1e 100644 --- a/src/math/powerpc/fma.c +++ b/src/math/powerpc/fma.c @@ -1,6 +1,6 @@  #include <math.h> -#ifdef _SOFT_FLOAT +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__) || defined(BROKEN_PPC_D_ASM)  #include "../fma.c" diff --git a/src/math/powerpc/fmaf.c b/src/math/powerpc/fmaf.c index a99a2a3b..dc1a749d 100644 --- a/src/math/powerpc/fmaf.c +++ b/src/math/powerpc/fmaf.c @@ -1,6 +1,6 @@  #include <math.h> -#ifdef _SOFT_FLOAT +#if defined(_SOFT_FLOAT) || defined(__NO_FPRS__)  #include "../fmaf.c" diff --git a/src/math/powf.c b/src/math/powf.c index 427c8965..de8fab54 100644 --- a/src/math/powf.c +++ b/src/math/powf.c @@ -1,259 +1,185 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_powf.c */  /* - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. - */ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. 
- * ==================================================== + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT   */ +#include <math.h> +#include <stdint.h>  #include "libm.h" +#include "exp2f_data.h" +#include "powf_data.h" -static const float -bp[]   = {1.0, 1.5,}, -dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */ -dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */ -two24  =  16777216.0,  /* 0x4b800000 */ -huge   =  1.0e30, -tiny   =  1.0e-30, -/* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */ -L1 =  6.0000002384e-01, /* 0x3f19999a */ -L2 =  4.2857143283e-01, /* 0x3edb6db7 */ -L3 =  3.3333334327e-01, /* 0x3eaaaaab */ -L4 =  2.7272811532e-01, /* 0x3e8ba305 */ -L5 =  2.3066075146e-01, /* 0x3e6c3255 */ -L6 =  2.0697501302e-01, /* 0x3e53f142 */ -P1 =  1.6666667163e-01, /* 0x3e2aaaab */ -P2 = -2.7777778450e-03, /* 0xbb360b61 */ -P3 =  6.6137559770e-05, /* 0x388ab355 */ -P4 = -1.6533901999e-06, /* 0xb5ddea0e */ -P5 =  4.1381369442e-08, /* 0x3331bb4c */ -lg2     =  6.9314718246e-01, /* 0x3f317218 */ -lg2_h   =  6.93145752e-01,   /* 0x3f317200 */ -lg2_l   =  1.42860654e-06,   /* 0x35bfbe8c */ -ovt     =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */ -cp      =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */ -cp_h    =  9.6191406250e-01, /* 0x3f764000 =12b cp */ -cp_l    = -1.1736857402e-04, /* 0xb8f623c6 =tail of cp_h */ -ivln2   =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */ -ivln2_h =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/ -ivln2_l =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/ +/* +POWF_LOG2_POLY_ORDER = 5 +EXP2F_TABLE_BITS = 5 -float powf(float x, float y) +ULP error: 0.82 (~ 0.5 + relerr*2^24) +relerr: 1.27 * 2^-26 (Relative error ~= 128*Ln2*relerr_log2 + relerr_exp2) +relerr_log2: 1.83 * 2^-33 (Relative error of logx.) +relerr_exp2: 1.69 * 2^-34 (Relative error of exp2(ylogx).) 
+*/ + +#define N (1 << POWF_LOG2_TABLE_BITS) +#define T __powf_log2_data.tab +#define A __powf_log2_data.poly +#define OFF 0x3f330000 + +/* Subnormal input is normalized so ix has negative biased exponent. +   Output is multiplied by N (POWF_SCALE) if TOINT_INTRINSICS is set.  */ +static inline double_t log2_inline(uint32_t ix)  { -	float z,ax,z_h,z_l,p_h,p_l; -	float y1,t1,t2,r,s,sn,t,u,v,w; -	int32_t i,j,k,yisint,n; -	int32_t hx,hy,ix,iy,is; +	double_t z, r, r2, r4, p, q, y, y0, invc, logc; +	uint32_t iz, top, tmp; +	int k, i; -	GET_FLOAT_WORD(hx, x); -	GET_FLOAT_WORD(hy, y); -	ix = hx & 0x7fffffff; -	iy = hy & 0x7fffffff; +	/* x = 2^k z; where z is in range [OFF,2*OFF] and exact. +	   The range is split into N subintervals. +	   The ith subinterval contains z and c is near its center.  */ +	tmp = ix - OFF; +	i = (tmp >> (23 - POWF_LOG2_TABLE_BITS)) % N; +	top = tmp & 0xff800000; +	iz = ix - top; +	k = (int32_t)top >> (23 - POWF_SCALE_BITS); /* arithmetic shift */ +	invc = T[i].invc; +	logc = T[i].logc; +	z = (double_t)asfloat(iz); -	/* x**0 = 1, even if x is NaN */ -	if (iy == 0) -		return 1.0f; -	/* 1**y = 1, even if y is NaN */ -	if (hx == 0x3f800000) -		return 1.0f; -	/* NaN if either arg is NaN */ -	if (ix > 0x7f800000 || iy > 0x7f800000) -		return x + y; +	/* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */ +	r = z * invc - 1; +	y0 = logc + (double_t)k; -	/* determine if y is an odd int when x < 0 -	 * yisint = 0       ... y is not an integer -	 * yisint = 1       ... y is an odd int -	 * yisint = 2       ... y is an even int -	 */ -	yisint  = 0; -	if (hx < 0) { -		if (iy >= 0x4b800000) -			yisint = 2; /* even integer y */ -		else if (iy >= 0x3f800000) { -			k = (iy>>23) - 0x7f;         /* exponent */ -			j = iy>>(23-k); -			if ((j<<(23-k)) == iy) -				yisint = 2 - (j & 1); -		} -	} +	/* Pipelined polynomial evaluation to approximate log1p(r)/ln2.  
*/ +	r2 = r * r; +	y = A[0] * r + A[1]; +	p = A[2] * r + A[3]; +	r4 = r2 * r2; +	q = A[4] * r + y0; +	q = p * r2 + q; +	y = y * r4 + q; +	return y; +} -	/* special value of y */ -	if (iy == 0x7f800000) {  /* y is +-inf */ -		if (ix == 0x3f800000)      /* (-1)**+-inf is 1 */ -			return 1.0f; -		else if (ix > 0x3f800000)  /* (|x|>1)**+-inf = inf,0 */ -			return hy >= 0 ? y : 0.0f; -		else                       /* (|x|<1)**+-inf = 0,inf */ -			return hy >= 0 ? 0.0f: -y; -	} -	if (iy == 0x3f800000)    /* y is +-1 */ -		return hy >= 0 ? x : 1.0f/x; -	if (hy == 0x40000000)    /* y is 2 */ -		return x*x; -	if (hy == 0x3f000000) {  /* y is  0.5 */ -		if (hx >= 0)     /* x >= +0 */ -			return sqrtf(x); -	} +#undef N +#undef T +#define N (1 << EXP2F_TABLE_BITS) +#define T __exp2f_data.tab +#define SIGN_BIAS (1 << (EXP2F_TABLE_BITS + 11)) -	ax = fabsf(x); -	/* special value of x */ -	if (ix == 0x7f800000 || ix == 0 || ix == 0x3f800000) { /* x is +-0,+-inf,+-1 */ -		z = ax; -		if (hy < 0)  /* z = (1/|x|) */ -			z = 1.0f/z; -		if (hx < 0) { -			if (((ix-0x3f800000)|yisint) == 0) { -				z = (z-z)/(z-z); /* (-1)**non-int is NaN */ -			} else if (yisint == 1) -				z = -z;          /* (x<0)**odd = -(|x|**odd) */ -		} -		return z; -	} +/* The output of log2 and thus the input of exp2 is either scaled by N +   (in case of fast toint intrinsics) or not.  The unscaled xd must be +   in [-1021,1023], sign_bias sets the sign of the result.  
*/ +static inline float exp2_inline(double_t xd, uint32_t sign_bias) +{ +	uint64_t ki, ski, t; +	double_t kd, z, r, r2, y, s; -	sn = 1.0f; /* sign of result */ -	if (hx < 0) { -		if (yisint == 0) /* (x<0)**(non-int) is NaN */ -			return (x-x)/(x-x); -		if (yisint == 1) /* (x<0)**(odd int) */ -			sn = -1.0f; -	} +#if TOINT_INTRINSICS +#define C __exp2f_data.poly_scaled +	/* N*x = k + r with r in [-1/2, 1/2] */ +	kd = roundtoint(xd); /* k */ +	ki = converttoint(xd); +#else +#define C __exp2f_data.poly +#define SHIFT __exp2f_data.shift_scaled +	/* x = k/N + r with r in [-1/(2N), 1/(2N)] */ +	kd = eval_as_double(xd + SHIFT); +	ki = asuint64(kd); +	kd -= SHIFT; /* k/N */ +#endif +	r = xd - kd; -	/* |y| is huge */ -	if (iy > 0x4d000000) { /* if |y| > 2**27 */ -		/* over/underflow if x is not close to one */ -		if (ix < 0x3f7ffff8) -			return hy < 0 ? sn*huge*huge : sn*tiny*tiny; -		if (ix > 0x3f800007) -			return hy > 0 ? sn*huge*huge : sn*tiny*tiny; -		/* now |1-x| is tiny <= 2**-20, suffice to compute -		   log(x) by x-x^2/2+x^3/3-x^4/4 */ -		t = ax - 1;     /* t has 20 trailing zeros */ -		w = (t*t)*(0.5f - t*(0.333333333333f - t*0.25f)); -		u = ivln2_h*t;  /* ivln2_h has 16 sig. 
bits */ -		v = t*ivln2_l - w*ivln2; -		t1 = u + v; -		GET_FLOAT_WORD(is, t1); -		SET_FLOAT_WORD(t1, is & 0xfffff000); -		t2 = v - (t1-u); -	} else { -		float s2,s_h,s_l,t_h,t_l; -		n = 0; -		/* take care subnormal number */ -		if (ix < 0x00800000) { -			ax *= two24; -			n -= 24; -			GET_FLOAT_WORD(ix, ax); -		} -		n += ((ix)>>23) - 0x7f; -		j = ix & 0x007fffff; -		/* determine interval */ -		ix = j | 0x3f800000;     /* normalize ix */ -		if (j <= 0x1cc471)       /* |x|<sqrt(3/2) */ -			k = 0; -		else if (j < 0x5db3d7)   /* |x|<sqrt(3)   */ -			k = 1; -		else { -			k = 0; -			n += 1; -			ix -= 0x00800000; -		} -		SET_FLOAT_WORD(ax, ix); +	/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ +	t = T[ki % N]; +	ski = ki + sign_bias; +	t += ski << (52 - EXP2F_TABLE_BITS); +	s = asdouble(t); +	z = C[0] * r + C[1]; +	r2 = r * r; +	y = C[2] * r + 1; +	y = z * r2 + y; +	y = y * s; +	return eval_as_float(y); +} -		/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */ -		u = ax - bp[k];   /* bp[0]=1.0, bp[1]=1.5 */ -		v = 1.0f/(ax+bp[k]); -		s = u*v; -		s_h = s; -		GET_FLOAT_WORD(is, s_h); -		SET_FLOAT_WORD(s_h, is & 0xfffff000); -		/* t_h=ax+bp[k] High */ -		is = ((ix>>1) & 0xfffff000) | 0x20000000; -		SET_FLOAT_WORD(t_h, is + 0x00400000 + (k<<21)); -		t_l = ax - (t_h - bp[k]); -		s_l = v*((u - s_h*t_h) - s_h*t_l); -		/* compute log(ax) */ -		s2 = s*s; -		r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6))))); -		r += s_l*(s_h+s); -		s2 = s_h*s_h; -		t_h = 3.0f + s2 + r; -		GET_FLOAT_WORD(is, t_h); -		SET_FLOAT_WORD(t_h, is & 0xfffff000); -		t_l = r - ((t_h - 3.0f) - s2); -		/* u+v = s*(1+...) */ -		u = s_h*t_h; -		v = s_l*t_h + t_l*s; -		/* 2/(3log2)*(s+...) 
*/ -		p_h = u + v; -		GET_FLOAT_WORD(is, p_h); -		SET_FLOAT_WORD(p_h, is & 0xfffff000); -		p_l = v - (p_h - u); -		z_h = cp_h*p_h;  /* cp_h+cp_l = 2/(3*log2) */ -		z_l = cp_l*p_h + p_l*cp+dp_l[k]; -		/* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */ -		t = (float)n; -		t1 = (((z_h + z_l) + dp_h[k]) + t); -		GET_FLOAT_WORD(is, t1); -		SET_FLOAT_WORD(t1, is & 0xfffff000); -		t2 = z_l - (((t1 - t) - dp_h[k]) - z_h); -	} +/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is +   the bit representation of a non-zero finite floating-point value.  */ +static inline int checkint(uint32_t iy) +{ +	int e = iy >> 23 & 0xff; +	if (e < 0x7f) +		return 0; +	if (e > 0x7f + 23) +		return 2; +	if (iy & ((1 << (0x7f + 23 - e)) - 1)) +		return 0; +	if (iy & (1 << (0x7f + 23 - e))) +		return 1; +	return 2; +} + +static inline int zeroinfnan(uint32_t ix) +{ +	return 2 * ix - 1 >= 2u * 0x7f800000 - 1; +} -	/* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */ -	GET_FLOAT_WORD(is, y); -	SET_FLOAT_WORD(y1, is & 0xfffff000); -	p_l = (y-y1)*t1 + y*t2; -	p_h = y1*t1; -	z = p_l + p_h; -	GET_FLOAT_WORD(j, z); -	if (j > 0x43000000)          /* if z > 128 */ -		return sn*huge*huge;  /* overflow */ -	else if (j == 0x43000000) {  /* if z == 128 */ -		if (p_l + ovt > z - p_h) -			return sn*huge*huge;  /* overflow */ -	} else if ((j&0x7fffffff) > 0x43160000)  /* z < -150 */ // FIXME: check should be  (uint32_t)j > 0xc3160000 -		return sn*tiny*tiny;  /* underflow */ -	else if (j == 0xc3160000) {  /* z == -150 */ -		if (p_l <= z-p_h) -			return sn*tiny*tiny;  /* underflow */ +float powf(float x, float y) +{ +	uint32_t sign_bias = 0; +	uint32_t ix, iy; + +	ix = asuint(x); +	iy = asuint(y); +	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000 || +			  zeroinfnan(iy))) { +		/* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */ +		if (predict_false(zeroinfnan(iy))) { +			if (2 * iy == 0) +				return issignalingf_inline(x) ? 
x + y : 1.0f; +			if (ix == 0x3f800000) +				return issignalingf_inline(y) ? x + y : 1.0f; +			if (2 * ix > 2u * 0x7f800000 || +			    2 * iy > 2u * 0x7f800000) +				return x + y; +			if (2 * ix == 2 * 0x3f800000) +				return 1.0f; +			if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000)) +				return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */ +			return y * y; +		} +		if (predict_false(zeroinfnan(ix))) { +			float_t x2 = x * x; +			if (ix & 0x80000000 && checkint(iy) == 1) +				x2 = -x2; +			/* Without the barrier some versions of clang hoist the 1/x2 and +			   thus division by zero exception can be signaled spuriously.  */ +			return iy & 0x80000000 ? fp_barrierf(1 / x2) : x2; +		} +		/* x and y are non-zero finite.  */ +		if (ix & 0x80000000) { +			/* Finite x < 0.  */ +			int yint = checkint(iy); +			if (yint == 0) +				return __math_invalidf(x); +			if (yint == 1) +				sign_bias = SIGN_BIAS; +			ix &= 0x7fffffff; +		} +		if (ix < 0x00800000) { +			/* Normalize subnormal x so exponent becomes negative.  */ +			ix = asuint(x * 0x1p23f); +			ix &= 0x7fffffff; +			ix -= 23 << 23; +		}  	} -	/* -	 * compute 2**(p_h+p_l) -	 */ -	i = j & 0x7fffffff; -	k = (i>>23) - 0x7f; -	n = 0; -	if (i > 0x3f000000) {   /* if |z| > 0.5, set n = [z+0.5] */ -		n = j + (0x00800000>>(k+1)); -		k = ((n&0x7fffffff)>>23) - 0x7f;  /* new k for n */ -		SET_FLOAT_WORD(t, n & ~(0x007fffff>>k)); -		n = ((n&0x007fffff)|0x00800000)>>(23-k); -		if (j < 0) -			n = -n; -		p_h -= t; +	double_t logx = log2_inline(ix); +	double_t ylogx = y * logx; /* cannot overflow, y is single prec.  */ +	if (predict_false((asuint64(ylogx) >> 47 & 0xffff) >= +			  asuint64(126.0 * POWF_SCALE) >> 47)) { +		/* |y*log(x)| >= 126.  
*/ +		if (ylogx > 0x1.fffffffd1d571p+6 * POWF_SCALE) +			return __math_oflowf(sign_bias); +		if (ylogx <= -150.0 * POWF_SCALE) +			return __math_uflowf(sign_bias);  	} -	t = p_l + p_h; -	GET_FLOAT_WORD(is, t); -	SET_FLOAT_WORD(t, is & 0xffff8000); -	u = t*lg2_h; -	v = (p_l-(t-p_h))*lg2 + t*lg2_l; -	z = u + v; -	w = v - (z - u); -	t = z*z; -	t1 = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5)))); -	r = (z*t1)/(t1-2.0f) - (w+z*w); -	z = 1.0f - (r - z); -	GET_FLOAT_WORD(j, z); -	j += n<<23; -	if ((j>>23) <= 0)  /* subnormal output */ -		z = scalbnf(z, n); -	else -		SET_FLOAT_WORD(z, j); -	return sn*z; +	return exp2_inline(ylogx, sign_bias);  } diff --git a/src/math/powf_data.c b/src/math/powf_data.c new file mode 100644 index 00000000..13e1d9a0 --- /dev/null +++ b/src/math/powf_data.c @@ -0,0 +1,34 @@ +/* + * Data definition for powf. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "powf_data.h" + +const struct powf_log2_data __powf_log2_data = { +  .tab = { +  { 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * POWF_SCALE }, +  { 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * POWF_SCALE }, +  { 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * POWF_SCALE }, +  { 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * POWF_SCALE }, +  { 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * POWF_SCALE }, +  { 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * POWF_SCALE }, +  { 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * POWF_SCALE }, +  { 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * POWF_SCALE }, +  { 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * POWF_SCALE }, +  { 0x1p+0, 0x0p+0 * POWF_SCALE }, +  { 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * POWF_SCALE }, +  { 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * POWF_SCALE }, +  { 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * POWF_SCALE }, +  { 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * POWF_SCALE }, +  { 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * POWF_SCALE }, +  { 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * 
POWF_SCALE }, +  }, +  .poly = { +  0x1.27616c9496e0bp-2 * POWF_SCALE, -0x1.71969a075c67ap-2 * POWF_SCALE, +  0x1.ec70a6ca7baddp-2 * POWF_SCALE, -0x1.7154748bef6c8p-1 * POWF_SCALE, +  0x1.71547652ab82bp0 * POWF_SCALE, +  } +}; diff --git a/src/math/powf_data.h b/src/math/powf_data.h new file mode 100644 index 00000000..5b136e28 --- /dev/null +++ b/src/math/powf_data.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _POWF_DATA_H +#define _POWF_DATA_H + +#include "libm.h" +#include "exp2f_data.h" + +#define POWF_LOG2_TABLE_BITS 4 +#define POWF_LOG2_POLY_ORDER 5 +#if TOINT_INTRINSICS +#define POWF_SCALE_BITS EXP2F_TABLE_BITS +#else +#define POWF_SCALE_BITS 0 +#endif +#define POWF_SCALE ((double)(1 << POWF_SCALE_BITS)) +extern hidden const struct powf_log2_data { +	struct { +		double invc, logc; +	} tab[1 << POWF_LOG2_TABLE_BITS]; +	double poly[POWF_LOG2_POLY_ORDER]; +} __powf_log2_data; + +#endif diff --git a/src/math/powl.c b/src/math/powl.c index 5b6da07b..9eb22162 100644 --- a/src/math/powl.c +++ b/src/math/powl.c @@ -57,14 +57,6 @@   *    IEEE     0,8700       60000      6.5e-18      1.0e-18   * 0.99 < x < 1.01, 0 < y < 8700, uniformly distributed.   
* - * - * ERROR MESSAGES: - * - *   message         condition      value returned - * pow overflow     x**y > MAXNUM      INFINITY - * pow underflow   x**y < 1/MAXNUM       0.0 - * pow domain      x<0 and y noninteger  0.0 - *   */  #include "libm.h" @@ -212,25 +204,33 @@ long double powl(long double x, long double y)  	}  	if (x == 1.0)  		return 1.0; /* 1**y = 1, even if y is nan */ -	if (x == -1.0 && !isfinite(y)) -		return 1.0; /* -1**inf = 1 */  	if (y == 0.0)  		return 1.0; /* x**0 = 1, even if x is nan */  	if (y == 1.0)  		return x; -	if (y >= LDBL_MAX) { -		if (x > 1.0 || x < -1.0) -			return INFINITY; -		if (x != 0.0) -			return 0.0; -	} -	if (y <= -LDBL_MAX) { -		if (x > 1.0 || x < -1.0) +	/* if y*log2(x) < log2(LDBL_TRUE_MIN)-1 then x^y uflows to 0 +	   if y*log2(x) > -log2(LDBL_TRUE_MIN)+1 > LDBL_MAX_EXP then x^y oflows +	   if |x|!=1 then |log2(x)| > |log(x)| > LDBL_EPSILON/2 so +	   x^y oflows/uflows if |y|*LDBL_EPSILON/2 > -log2(LDBL_TRUE_MIN)+1 */ +	if (fabsl(y) > 2*(-LDBL_MIN_EXP+LDBL_MANT_DIG+1)/LDBL_EPSILON) { +		/* y is not an odd int */ +		if (x == -1.0) +			return 1.0; +		if (y == INFINITY) { +			if (x > 1.0 || x < -1.0) +				return INFINITY;  			return 0.0; -		if (x != 0.0 || y == -INFINITY) +		} +		if (y == -INFINITY) { +			if (x > 1.0 || x < -1.0) +				return 0.0;  			return INFINITY; +		} +		if ((x > 1.0 || x < -1.0) == (y > 0)) +			return huge * huge; +		return twom10000 * twom10000;  	} -	if (x >= LDBL_MAX) { +	if (x == INFINITY) {  		if (y > 0.0)  			return INFINITY;  		return 0.0; @@ -253,7 +253,7 @@ long double powl(long double x, long double y)  			yoddint = 1;  	} -	if (x <= -LDBL_MAX) { +	if (x == -INFINITY) {  		if (y > 0.0) {  			if (yoddint)  				return -INFINITY; diff --git a/src/math/riscv32/copysign.c b/src/math/riscv32/copysign.c new file mode 100644 index 00000000..c7854178 --- /dev/null +++ b/src/math/riscv32/copysign.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double copysign(double x, double y) +{ 
+	__asm__ ("fsgnj.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../copysign.c" + +#endif diff --git a/src/math/riscv32/copysignf.c b/src/math/riscv32/copysignf.c new file mode 100644 index 00000000..a125611a --- /dev/null +++ b/src/math/riscv32/copysignf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float copysignf(float x, float y) +{ +	__asm__ ("fsgnj.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../copysignf.c" + +#endif diff --git a/src/math/riscv32/fabs.c b/src/math/riscv32/fabs.c new file mode 100644 index 00000000..5290b6f0 --- /dev/null +++ b/src/math/riscv32/fabs.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fabs(double x) +{ +	__asm__ ("fabs.d %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../fabs.c" + +#endif diff --git a/src/math/riscv32/fabsf.c b/src/math/riscv32/fabsf.c new file mode 100644 index 00000000..f5032e35 --- /dev/null +++ b/src/math/riscv32/fabsf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fabsf(float x) +{ +	__asm__ ("fabs.s %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../fabsf.c" + +#endif diff --git a/src/math/riscv32/fma.c b/src/math/riscv32/fma.c new file mode 100644 index 00000000..99b05713 --- /dev/null +++ b/src/math/riscv32/fma.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fma(double x, double y, double z) +{ +	__asm__ ("fmadd.d %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z)); +	return x; +} + +#else + +#include "../fma.c" + +#endif diff --git a/src/math/riscv32/fmaf.c b/src/math/riscv32/fmaf.c new file mode 100644 index 00000000..f9dc47ed --- /dev/null +++ b/src/math/riscv32/fmaf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fmaf(float x, float y, float z) +{ +	__asm__ ("fmadd.s %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z)); +	return x; +} + +#else + +#include "../fmaf.c" + +#endif 
diff --git a/src/math/riscv32/fmax.c b/src/math/riscv32/fmax.c new file mode 100644 index 00000000..023709cd --- /dev/null +++ b/src/math/riscv32/fmax.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fmax(double x, double y) +{ +	__asm__ ("fmax.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fmax.c" + +#endif diff --git a/src/math/riscv32/fmaxf.c b/src/math/riscv32/fmaxf.c new file mode 100644 index 00000000..863d2bd1 --- /dev/null +++ b/src/math/riscv32/fmaxf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fmaxf(float x, float y) +{ +	__asm__ ("fmax.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fmaxf.c" + +#endif diff --git a/src/math/riscv32/fmin.c b/src/math/riscv32/fmin.c new file mode 100644 index 00000000..a4e3b067 --- /dev/null +++ b/src/math/riscv32/fmin.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fmin(double x, double y) +{ +	__asm__ ("fmin.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fmin.c" + +#endif diff --git a/src/math/riscv32/fminf.c b/src/math/riscv32/fminf.c new file mode 100644 index 00000000..32156e80 --- /dev/null +++ b/src/math/riscv32/fminf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fminf(float x, float y) +{ +	__asm__ ("fmin.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fminf.c" + +#endif diff --git a/src/math/riscv32/sqrt.c b/src/math/riscv32/sqrt.c new file mode 100644 index 00000000..867a504c --- /dev/null +++ b/src/math/riscv32/sqrt.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double sqrt(double x) +{ +	__asm__ ("fsqrt.d %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../sqrt.c" + +#endif diff --git a/src/math/riscv32/sqrtf.c b/src/math/riscv32/sqrtf.c new file mode 100644 index 00000000..610c2cf8 --- /dev/null +++ b/src/math/riscv32/sqrtf.c @@ 
-0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float sqrtf(float x) +{ +	__asm__ ("fsqrt.s %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../sqrtf.c" + +#endif diff --git a/src/math/riscv64/copysign.c b/src/math/riscv64/copysign.c new file mode 100644 index 00000000..c7854178 --- /dev/null +++ b/src/math/riscv64/copysign.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double copysign(double x, double y) +{ +	__asm__ ("fsgnj.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../copysign.c" + +#endif diff --git a/src/math/riscv64/copysignf.c b/src/math/riscv64/copysignf.c new file mode 100644 index 00000000..a125611a --- /dev/null +++ b/src/math/riscv64/copysignf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float copysignf(float x, float y) +{ +	__asm__ ("fsgnj.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../copysignf.c" + +#endif diff --git a/src/math/riscv64/fabs.c b/src/math/riscv64/fabs.c new file mode 100644 index 00000000..5290b6f0 --- /dev/null +++ b/src/math/riscv64/fabs.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fabs(double x) +{ +	__asm__ ("fabs.d %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../fabs.c" + +#endif diff --git a/src/math/riscv64/fabsf.c b/src/math/riscv64/fabsf.c new file mode 100644 index 00000000..f5032e35 --- /dev/null +++ b/src/math/riscv64/fabsf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fabsf(float x) +{ +	__asm__ ("fabs.s %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../fabsf.c" + +#endif diff --git a/src/math/riscv64/fma.c b/src/math/riscv64/fma.c new file mode 100644 index 00000000..99b05713 --- /dev/null +++ b/src/math/riscv64/fma.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fma(double x, double y, double z) +{ +	__asm__ ("fmadd.d %0, %1, %2, %3" : "=f"(x) : "f"(x), 
"f"(y), "f"(z)); +	return x; +} + +#else + +#include "../fma.c" + +#endif diff --git a/src/math/riscv64/fmaf.c b/src/math/riscv64/fmaf.c new file mode 100644 index 00000000..f9dc47ed --- /dev/null +++ b/src/math/riscv64/fmaf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fmaf(float x, float y, float z) +{ +	__asm__ ("fmadd.s %0, %1, %2, %3" : "=f"(x) : "f"(x), "f"(y), "f"(z)); +	return x; +} + +#else + +#include "../fmaf.c" + +#endif diff --git a/src/math/riscv64/fmax.c b/src/math/riscv64/fmax.c new file mode 100644 index 00000000..023709cd --- /dev/null +++ b/src/math/riscv64/fmax.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fmax(double x, double y) +{ +	__asm__ ("fmax.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fmax.c" + +#endif diff --git a/src/math/riscv64/fmaxf.c b/src/math/riscv64/fmaxf.c new file mode 100644 index 00000000..863d2bd1 --- /dev/null +++ b/src/math/riscv64/fmaxf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fmaxf(float x, float y) +{ +	__asm__ ("fmax.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fmaxf.c" + +#endif diff --git a/src/math/riscv64/fmin.c b/src/math/riscv64/fmin.c new file mode 100644 index 00000000..a4e3b067 --- /dev/null +++ b/src/math/riscv64/fmin.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double fmin(double x, double y) +{ +	__asm__ ("fmin.d %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fmin.c" + +#endif diff --git a/src/math/riscv64/fminf.c b/src/math/riscv64/fminf.c new file mode 100644 index 00000000..32156e80 --- /dev/null +++ b/src/math/riscv64/fminf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float fminf(float x, float y) +{ +	__asm__ ("fmin.s %0, %1, %2" : "=f"(x) : "f"(x), "f"(y)); +	return x; +} + +#else + +#include "../fminf.c" + +#endif diff --git a/src/math/riscv64/sqrt.c 
b/src/math/riscv64/sqrt.c new file mode 100644 index 00000000..867a504c --- /dev/null +++ b/src/math/riscv64/sqrt.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 64 + +double sqrt(double x) +{ +	__asm__ ("fsqrt.d %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../sqrt.c" + +#endif diff --git a/src/math/riscv64/sqrtf.c b/src/math/riscv64/sqrtf.c new file mode 100644 index 00000000..610c2cf8 --- /dev/null +++ b/src/math/riscv64/sqrtf.c @@ -0,0 +1,15 @@ +#include <math.h> + +#if __riscv_flen >= 32 + +float sqrtf(float x) +{ +	__asm__ ("fsqrt.s %0, %1" : "=f"(x) : "f"(x)); +	return x; +} + +#else + +#include "../sqrtf.c" + +#endif diff --git a/src/math/sinh.c b/src/math/sinh.c index 00022c4e..a01951ae 100644 --- a/src/math/sinh.c +++ b/src/math/sinh.c @@ -34,6 +34,6 @@ double sinh(double x)  	/* |x| > log(DBL_MAX) or nan */  	/* note: the result is stored to handle overflow */ -	t = 2*h*__expo2(absx); +	t = __expo2(absx, 2*h);  	return t;  } diff --git a/src/math/sinhf.c b/src/math/sinhf.c index 6ad19ea2..b9caa793 100644 --- a/src/math/sinhf.c +++ b/src/math/sinhf.c @@ -26,6 +26,6 @@ float sinhf(float x)  	}  	/* |x| > logf(FLT_MAX) or nan */ -	t = 2*h*__expo2f(absx); +	t = __expo2f(absx, 2*h);  	return t;  } diff --git a/src/math/sqrt.c b/src/math/sqrt.c index b2775673..5ba26559 100644 --- a/src/math/sqrt.c +++ b/src/math/sqrt.c @@ -1,185 +1,158 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrt.c */ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ -/* sqrt(x) - * Return correctly rounded sqrt. 
- *           ------------------------------------------ - *           |  Use the hardware sqrt if you have one | - *           ------------------------------------------ - * Method: - *   Bit by bit method using integer arithmetic. (Slow, but portable) - *   1. Normalization - *      Scale x to y in [1,4) with even powers of 2: - *      find an integer k such that  1 <= (y=x*2^(2k)) < 4, then - *              sqrt(x) = 2^k * sqrt(y) - *   2. Bit by bit computation - *      Let q  = sqrt(y) truncated to i bit after binary point (q = 1), - *           i                                                   0 - *                                     i+1         2 - *          s  = 2*q , and      y  =  2   * ( y - q  ).         (1) - *           i      i            i                 i - * - *      To compute q    from q , one checks whether - *                  i+1       i - * - *                            -(i+1) 2 - *                      (q + 2      ) <= y.                     (2) - *                        i - *                                                            -(i+1) - *      If (2) is false, then q   = q ; otherwise q   = q  + 2      . 
- *                             i+1   i             i+1   i - * - *      With some algebric manipulation, it is not difficult to see - *      that (2) is equivalent to - *                             -(i+1) - *                      s  +  2       <= y                      (3) - *                       i                i - * - *      The advantage of (3) is that s  and y  can be computed by - *                                    i      i - *      the following recurrence formula: - *          if (3) is false - * - *          s     =  s  ,       y    = y   ;                    (4) - *           i+1      i          i+1    i - * - *          otherwise, - *                         -i                     -(i+1) - *          s     =  s  + 2  ,  y    = y  -  s  - 2             (5) - *           i+1      i          i+1    i     i - * - *      One may easily use induction to prove (4) and (5). - *      Note. Since the left hand side of (3) contain only i+2 bits, - *            it does not necessary to do a full (53-bit) comparison - *            in (3). - *   3. Final rounding - *      After generating the 53 bits result, we compute one more bit. - *      Together with the remainder, we can decide whether the - *      result is exact, bigger than 1/2ulp, or less than 1/2ulp - *      (it will never equal to 1/2ulp). - *      The rounding mode can be detected by checking whether - *      huge + tiny is equal to huge, and whether huge - tiny is - *      equal to huge for some floating point number "huge" and "tiny". - * - * Special cases: - *      sqrt(+-0) = +-0         ... exact - *      sqrt(inf) = inf - *      sqrt(-ve) = NaN         ... with invalid signal - *      sqrt(NaN) = NaN         ... with invalid signal for signaling NaN - */ - +#include <stdint.h> +#include <math.h>  #include "libm.h" +#include "sqrt_data.h" -static const double tiny = 1.0e-300; +#define FENV_SUPPORT 1 -double sqrt(double x) +/* returns a*b*2^-32 - e, with error 0 <= e < 1.  
*/ +static inline uint32_t mul32(uint32_t a, uint32_t b)  { -	double z; -	int32_t sign = (int)0x80000000; -	int32_t ix0,s0,q,m,t,i; -	uint32_t r,t1,s1,ix1,q1; +	return (uint64_t)a*b >> 32; +} -	EXTRACT_WORDS(ix0, ix1, x); +/* returns a*b*2^-64 - e, with error 0 <= e < 3.  */ +static inline uint64_t mul64(uint64_t a, uint64_t b) +{ +	uint64_t ahi = a>>32; +	uint64_t alo = a&0xffffffff; +	uint64_t bhi = b>>32; +	uint64_t blo = b&0xffffffff; +	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); +} -	/* take care of Inf and NaN */ -	if ((ix0&0x7ff00000) == 0x7ff00000) { -		return x*x + x;  /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */ -	} -	/* take care of zero */ -	if (ix0 <= 0) { -		if (((ix0&~sign)|ix1) == 0) -			return x;  /* sqrt(+-0) = +-0 */ -		if (ix0 < 0) -			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */ -	} -	/* normalize x */ -	m = ix0>>20; -	if (m == 0) {  /* subnormal x */ -		while (ix0 == 0) { -			m -= 21; -			ix0 |= (ix1>>11); -			ix1 <<= 21; -		} -		for (i=0; (ix0&0x00100000) == 0; i++) -			ix0<<=1; -		m -= i - 1; -		ix0 |= ix1>>(32-i); -		ix1 <<= i; -	} -	m -= 1023;    /* unbias exponent */ -	ix0 = (ix0&0x000fffff)|0x00100000; -	if (m & 1) {  /* odd m, double x to make it even */ -		ix0 += ix0 + ((ix1&sign)>>31); -		ix1 += ix1; -	} -	m >>= 1;      /* m = [m/2] */ - -	/* generate sqrt(x) bit by bit */ -	ix0 += ix0 + ((ix1&sign)>>31); -	ix1 += ix1; -	q = q1 = s0 = s1 = 0;  /* [q,q1] = sqrt(x) */ -	r = 0x00200000;        /* r = moving bit from right to left */ - -	while (r != 0) { -		t = s0 + r; -		if (t <= ix0) { -			s0   = t + r; -			ix0 -= t; -			q   += r; -		} -		ix0 += ix0 + ((ix1&sign)>>31); -		ix1 += ix1; -		r >>= 1; -	} +double sqrt(double x) +{ +	uint64_t ix, top, m; -	r = sign; -	while (r != 0) { -		t1 = s1 + r; -		t  = s0; -		if (t < ix0 || (t == ix0 && t1 <= ix1)) { -			s1 = t1 + r; -			if ((t1&sign) == sign && (s1&sign) == 0) -				s0++; -			ix0 -= t; -			if (ix1 < t1) -				ix0--; -			ix1 -= t1; -			q1 += r; -		} -		ix0 += ix0 + 
((ix1&sign)>>31); -		ix1 += ix1; -		r >>= 1; +	/* special case handling.  */ +	ix = asuint64(x); +	top = ix >> 52; +	if (predict_false(top - 0x001 >= 0x7ff - 0x001)) { +		/* x < 0x1p-1022 or inf or nan.  */ +		if (ix * 2 == 0) +			return x; +		if (ix == 0x7ff0000000000000) +			return x; +		if (ix > 0x7ff0000000000000) +			return __math_invalid(x); +		/* x is subnormal, normalize it.  */ +		ix = asuint64(x * 0x1p52); +		top = ix >> 52; +		top -= 52;  	} -	/* use floating add to find out rounding direction */ -	if ((ix0|ix1) != 0) { -		z = 1.0 - tiny; /* raise inexact flag */ -		if (z >= 1.0) { -			z = 1.0 + tiny; -			if (q1 == (uint32_t)0xffffffff) { -				q1 = 0; -				q++; -			} else if (z > 1.0) { -				if (q1 == (uint32_t)0xfffffffe) -					q++; -				q1 += 2; -			} else -				q1 += q1 & 1; -		} +	/* argument reduction: +	   x = 4^e m; with integer e, and m in [1, 4) +	   m: fixed point representation [2.62] +	   2^e is the exponent part of the result.  */ +	int even = top & 1; +	m = (ix << 11) | 0x8000000000000000; +	if (even) m >>= 1; +	top = (top + 0x3ff) >> 1; + +	/* approximate r ~ 1/sqrt(m) and s ~ sqrt(m) when m in [1,4) + +	   initial estimate: +	   7bit table lookup (1bit exponent and 6bit significand). + +	   iterative approximation: +	   using 2 goldschmidt iterations with 32bit int arithmetics +	   and a final iteration with 64bit int arithmetics. + +	   details: + +	   the relative error (e = r0 sqrt(m)-1) of a linear estimate +	   (r0 = a m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best, +	   a table lookup is faster and needs one less iteration +	   6 bit lookup table (128b) gives |e| < 0x1.f9p-8 +	   7 bit lookup table (256b) gives |e| < 0x1.fdp-9 +	   for single and double prec 6bit is enough but for quad +	   prec 7bit is needed (or modified iterations). to avoid +	   one more iteration >=13bit table would be needed (16k). 
+ +	   a newton-raphson iteration for r is +	     w = r*r +	     u = 3 - m*w +	     r = r*u/2 +	   can use a goldschmidt iteration for s at the end or +	     s = m*r + +	   first goldschmidt iteration is +	     s = m*r +	     u = 3 - s*r +	     r = r*u/2 +	     s = s*u/2 +	   next goldschmidt iteration is +	     u = 3 - s*r +	     r = r*u/2 +	     s = s*u/2 +	   and at the end r is not computed only s. + +	   they use the same amount of operations and converge at the +	   same quadratic rate, i.e. if +	     r1 sqrt(m) - 1 = e, then +	     r2 sqrt(m) - 1 = -3/2 e^2 - 1/2 e^3 +	   the advantage of goldschmidt is that the mul for s and r +	   are independent (computed in parallel), however it is not +	   "self synchronizing": it only uses the input m in the +	   first iteration so rounding errors accumulate. at the end +	   or when switching to larger precision arithmetics rounding +	   errors dominate so the first iteration should be used. + +	   the fixed point representations are +	     m: 2.30 r: 0.32, s: 2.30, d: 2.30, u: 2.30, three: 2.30 +	   and after switching to 64 bit +	     m: 2.62 r: 0.64, s: 2.62, d: 2.62, u: 2.62, three: 2.62  */ + +	static const uint64_t three = 0xc0000000; +	uint64_t r, s, d, u, i; + +	i = (ix >> 46) % 128; +	r = (uint32_t)__rsqrt_tab[i] << 16; +	/* |r sqrt(m) - 1| < 0x1.fdp-9 */ +	s = mul32(m>>32, r); +	/* |s/sqrt(m) - 1| < 0x1.fdp-9 */ +	d = mul32(s, r); +	u = three - d; +	r = mul32(r, u) << 1; +	/* |r sqrt(m) - 1| < 0x1.7bp-16 */ +	s = mul32(s, u) << 1; +	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */ +	d = mul32(s, r); +	u = three - d; +	r = mul32(r, u) << 1; +	/* |r sqrt(m) - 1| < 0x1.3704p-29 (measured worst-case) */ +	r = r << 32; +	s = mul64(m, r); +	d = mul64(s, r); +	u = (three<<32) - d; +	s = mul64(s, u);  /* repr: 3.61 */ +	/* -0x1p-57 < s - sqrt(m) < 0x1.8001p-61 */ +	s = (s - 2) >> 9; /* repr: 12.52 */ +	/* -0x1.09p-52 < s - sqrt(m) < -0x1.fffcp-63 */ + +	/* s < sqrt(m) < s + 0x1.09p-52, +	   compute nearest rounded result: +	   the 
nearest result to 52 bits is either s or s+0x1p-52, +	   we can decide by comparing (2^52 s + 0.5)^2 to 2^104 m.  */ +	uint64_t d0, d1, d2; +	double y, t; +	d0 = (m << 42) - s*s; +	d1 = s - d0; +	d2 = d1 + s + 1; +	s += d1 >> 63; +	s &= 0x000fffffffffffff; +	s |= top << 52; +	y = asdouble(s); +	if (FENV_SUPPORT) { +		/* handle rounding modes and inexact exception: +		   only (s+1)^2 == 2^42 m case is exact otherwise +		   add a tiny value to cause the fenv effects.  */ +		uint64_t tiny = predict_false(d2==0) ? 0 : 0x0010000000000000; +		tiny |= (d1^d2) & 0x8000000000000000; +		t = asdouble(tiny); +		y = eval_as_double(y + t);  	} -	ix0 = (q>>1) + 0x3fe00000; -	ix1 = q1>>1; -	if (q&1) -		ix1 |= sign; -	ix0 += m << 20; -	INSERT_WORDS(z, ix0, ix1); -	return z; +	return y;  } diff --git a/src/math/sqrt_data.c b/src/math/sqrt_data.c new file mode 100644 index 00000000..61bc22f4 --- /dev/null +++ b/src/math/sqrt_data.c @@ -0,0 +1,19 @@ +#include "sqrt_data.h" +const uint16_t __rsqrt_tab[128] = { +0xb451,0xb2f0,0xb196,0xb044,0xaef9,0xadb6,0xac79,0xab43, +0xaa14,0xa8eb,0xa7c8,0xa6aa,0xa592,0xa480,0xa373,0xa26b, +0xa168,0xa06a,0x9f70,0x9e7b,0x9d8a,0x9c9d,0x9bb5,0x9ad1, +0x99f0,0x9913,0x983a,0x9765,0x9693,0x95c4,0x94f8,0x9430, +0x936b,0x92a9,0x91ea,0x912e,0x9075,0x8fbe,0x8f0a,0x8e59, +0x8daa,0x8cfe,0x8c54,0x8bac,0x8b07,0x8a64,0x89c4,0x8925, +0x8889,0x87ee,0x8756,0x86c0,0x862b,0x8599,0x8508,0x8479, +0x83ec,0x8361,0x82d8,0x8250,0x81c9,0x8145,0x80c2,0x8040, +0xff02,0xfd0e,0xfb25,0xf947,0xf773,0xf5aa,0xf3ea,0xf234, +0xf087,0xeee3,0xed47,0xebb3,0xea27,0xe8a3,0xe727,0xe5b2, +0xe443,0xe2dc,0xe17a,0xe020,0xdecb,0xdd7d,0xdc34,0xdaf1, +0xd9b3,0xd87b,0xd748,0xd61a,0xd4f1,0xd3cd,0xd2ad,0xd192, +0xd07b,0xcf69,0xce5b,0xcd51,0xcc4a,0xcb48,0xca4a,0xc94f, +0xc858,0xc764,0xc674,0xc587,0xc49d,0xc3b7,0xc2d4,0xc1f4, +0xc116,0xc03c,0xbf65,0xbe90,0xbdbe,0xbcef,0xbc23,0xbb59, +0xba91,0xb9cc,0xb90a,0xb84a,0xb78c,0xb6d0,0xb617,0xb560, +}; diff --git a/src/math/sqrt_data.h b/src/math/sqrt_data.h new 
file mode 100644 index 00000000..260c7f9c --- /dev/null +++ b/src/math/sqrt_data.h @@ -0,0 +1,13 @@ +#ifndef _SQRT_DATA_H +#define _SQRT_DATA_H + +#include <features.h> +#include <stdint.h> + +/* if x in [1,2): i = (int)(64*x); +   if x in [2,4): i = (int)(32*x-64); +   __rsqrt_tab[i]*2^-16 is estimating 1/sqrt(x) with small relative error: +   |__rsqrt_tab[i]*0x1p-16*sqrt(x) - 1| < 0x1.fdp-9 < 2^-8 */ +extern hidden const uint16_t __rsqrt_tab[128]; + +#endif diff --git a/src/math/sqrtf.c b/src/math/sqrtf.c index 28cb4ad3..740d81cb 100644 --- a/src/math/sqrtf.c +++ b/src/math/sqrtf.c @@ -1,84 +1,83 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/e_sqrtf.c */ -/* - * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. - */ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - +#include <stdint.h> +#include <math.h>  #include "libm.h" +#include "sqrt_data.h" -static const float tiny = 1.0e-30; +#define FENV_SUPPORT 1 -float sqrtf(float x) +static inline uint32_t mul32(uint32_t a, uint32_t b)  { -	float z; -	int32_t sign = (int)0x80000000; -	int32_t ix,s,q,m,t,i; -	uint32_t r; +	return (uint64_t)a*b >> 32; +} -	GET_FLOAT_WORD(ix, x); +/* see sqrt.c for more detailed comments.  
*/ -	/* take care of Inf and NaN */ -	if ((ix&0x7f800000) == 0x7f800000) -		return x*x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf, sqrt(-inf)=sNaN */ +float sqrtf(float x) +{ +	uint32_t ix, m, m1, m0, even, ey; -	/* take care of zero */ -	if (ix <= 0) { -		if ((ix&~sign) == 0) -			return x;  /* sqrt(+-0) = +-0 */ -		if (ix < 0) -			return (x-x)/(x-x);  /* sqrt(-ve) = sNaN */ -	} -	/* normalize x */ -	m = ix>>23; -	if (m == 0) {  /* subnormal x */ -		for (i = 0; (ix&0x00800000) == 0; i++) -			ix<<=1; -		m -= i - 1; +	ix = asuint(x); +	if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { +		/* x < 0x1p-126 or inf or nan.  */ +		if (ix * 2 == 0) +			return x; +		if (ix == 0x7f800000) +			return x; +		if (ix > 0x7f800000) +			return __math_invalidf(x); +		/* x is subnormal, normalize it.  */ +		ix = asuint(x * 0x1p23f); +		ix -= 23 << 23;  	} -	m -= 127;  /* unbias exponent */ -	ix = (ix&0x007fffff)|0x00800000; -	if (m&1)  /* odd m, double x to make it even */ -		ix += ix; -	m >>= 1;  /* m = [m/2] */ -	/* generate sqrt(x) bit by bit */ -	ix += ix; -	q = s = 0;       /* q = sqrt(x) */ -	r = 0x01000000;  /* r = moving bit from right to left */ +	/* x = 4^e m; with int e and m in [1, 4).  */ +	even = ix & 0x00800000; +	m1 = (ix << 8) | 0x80000000; +	m0 = (ix << 7) & 0x7fffffff; +	m = even ? m0 : m1; -	while (r != 0) { -		t = s + r; -		if (t <= ix) { -			s = t+r; -			ix -= t; -			q += r; -		} -		ix += ix; -		r >>= 1; -	} +	/* 2^e is the exponent part of the return value.  */ +	ey = ix >> 1; +	ey += 0x3f800000 >> 1; +	ey &= 0x7f800000; + +	/* compute r ~ 1/sqrt(m), s ~ sqrt(m) with 2 goldschmidt iterations.  
*/ +	static const uint32_t three = 0xc0000000; +	uint32_t r, s, d, u, i; +	i = (ix >> 17) % 128; +	r = (uint32_t)__rsqrt_tab[i] << 16; +	/* |r*sqrt(m) - 1| < 0x1p-8 */ +	s = mul32(m, r); +	/* |s/sqrt(m) - 1| < 0x1p-8 */ +	d = mul32(s, r); +	u = three - d; +	r = mul32(r, u) << 1; +	/* |r*sqrt(m) - 1| < 0x1.7bp-16 */ +	s = mul32(s, u) << 1; +	/* |s/sqrt(m) - 1| < 0x1.7bp-16 */ +	d = mul32(s, r); +	u = three - d; +	s = mul32(s, u); +	/* -0x1.03p-28 < s/sqrt(m) - 1 < 0x1.fp-31 */ +	s = (s - 1)>>6; +	/* s < sqrt(m) < s + 0x1.08p-23 */ -	/* use floating add to find out rounding direction */ -	if (ix != 0) { -		z = 1.0f - tiny; /* raise inexact flag */ -		if (z >= 1.0f) { -			z = 1.0f + tiny; -			if (z > 1.0f) -				q += 2; -			else -				q += q & 1; -		} +	/* compute nearest rounded result.  */ +	uint32_t d0, d1, d2; +	float y, t; +	d0 = (m << 16) - s*s; +	d1 = s - d0; +	d2 = d1 + s + 1; +	s += d1 >> 31; +	s &= 0x007fffff; +	s |= ey; +	y = asfloat(s); +	if (FENV_SUPPORT) { +		/* handle rounding and inexact exception. */ +		uint32_t tiny = predict_false(d2==0) ? 0 : 0x01000000; +		tiny |= (d1^d2) & 0x80000000; +		t = asfloat(tiny); +		y = eval_as_float(y + t);  	} -	ix = (q>>1) + 0x3f000000; -	ix += m << 23; -	SET_FLOAT_WORD(z, ix); -	return z; +	return y;  } diff --git a/src/math/sqrtl.c b/src/math/sqrtl.c index 83a8f80c..a231b3f2 100644 --- a/src/math/sqrtl.c +++ b/src/math/sqrtl.c @@ -1,7 +1,259 @@ +#include <stdint.h>  #include <math.h> +#include <float.h> +#include "libm.h" +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024  long double sqrtl(long double x)  { -	/* FIXME: implement in C, this is for LDBL_MANT_DIG == 64 only */  	return sqrt(x);  } +#elif (LDBL_MANT_DIG == 113 || LDBL_MANT_DIG == 64) && LDBL_MAX_EXP == 16384 +#include "sqrt_data.h" + +#define FENV_SUPPORT 1 + +typedef struct { +	uint64_t hi; +	uint64_t lo; +} u128; + +/* top: 16 bit sign+exponent, x: significand.  
*/ +static inline long double mkldbl(uint64_t top, u128 x) +{ +	union ldshape u; +#if LDBL_MANT_DIG == 113 +	u.i2.hi = x.hi; +	u.i2.lo = x.lo; +	u.i2.hi &= 0x0000ffffffffffff; +	u.i2.hi |= top << 48; +#elif LDBL_MANT_DIG == 64 +	u.i.se = top; +	u.i.m = x.lo; +	/* force the top bit on non-zero (and non-subnormal) results.  */ +	if (top & 0x7fff) +		u.i.m |= 0x8000000000000000; +#endif +	return u.f; +} + +/* return: top 16 bit is sign+exp and following bits are the significand.  */ +static inline u128 asu128(long double x) +{ +	union ldshape u = {.f=x}; +	u128 r; +#if LDBL_MANT_DIG == 113 +	r.hi = u.i2.hi; +	r.lo = u.i2.lo; +#elif LDBL_MANT_DIG == 64 +	r.lo = u.i.m<<49; +	/* ignore the top bit: pseudo numbers are not handled. */ +	r.hi = u.i.m>>15; +	r.hi &= 0x0000ffffffffffff; +	r.hi |= (uint64_t)u.i.se << 48; +#endif +	return r; +} + +/* returns a*b*2^-32 - e, with error 0 <= e < 1.  */ +static inline uint32_t mul32(uint32_t a, uint32_t b) +{ +	return (uint64_t)a*b >> 32; +} + +/* returns a*b*2^-64 - e, with error 0 <= e < 3.  
*/ +static inline uint64_t mul64(uint64_t a, uint64_t b) +{ +	uint64_t ahi = a>>32; +	uint64_t alo = a&0xffffffff; +	uint64_t bhi = b>>32; +	uint64_t blo = b&0xffffffff; +	return ahi*bhi + (ahi*blo >> 32) + (alo*bhi >> 32); +} + +static inline u128 add64(u128 a, uint64_t b) +{ +	u128 r; +	r.lo = a.lo + b; +	r.hi = a.hi; +	if (r.lo < a.lo) +		r.hi++; +	return r; +} + +static inline u128 add128(u128 a, u128 b) +{ +	u128 r; +	r.lo = a.lo + b.lo; +	r.hi = a.hi + b.hi; +	if (r.lo < a.lo) +		r.hi++; +	return r; +} + +static inline u128 sub64(u128 a, uint64_t b) +{ +	u128 r; +	r.lo = a.lo - b; +	r.hi = a.hi; +	if (a.lo < b) +		r.hi--; +	return r; +} + +static inline u128 sub128(u128 a, u128 b) +{ +	u128 r; +	r.lo = a.lo - b.lo; +	r.hi = a.hi - b.hi; +	if (a.lo < b.lo) +		r.hi--; +	return r; +} + +/* a<<n, 0 <= n <= 127 */ +static inline u128 lsh(u128 a, int n) +{ +	if (n == 0) +		return a; +	if (n >= 64) { +		a.hi = a.lo<<(n-64); +		a.lo = 0; +	} else { +		a.hi = (a.hi<<n) | (a.lo>>(64-n)); +		a.lo = a.lo<<n; +	} +	return a; +} + +/* a>>n, 0 <= n <= 127 */ +static inline u128 rsh(u128 a, int n) +{ +	if (n == 0) +		return a; +	if (n >= 64) { +		a.lo = a.hi>>(n-64); +		a.hi = 0; +	} else { +		a.lo = (a.lo>>n) | (a.hi<<(64-n)); +		a.hi = a.hi>>n; +	} +	return a; +} + +/* returns a*b exactly.  */ +static inline u128 mul64_128(uint64_t a, uint64_t b) +{ +	u128 r; +	uint64_t ahi = a>>32; +	uint64_t alo = a&0xffffffff; +	uint64_t bhi = b>>32; +	uint64_t blo = b&0xffffffff; +	uint64_t lo1 = ((ahi*blo)&0xffffffff) + ((alo*bhi)&0xffffffff) + (alo*blo>>32); +	uint64_t lo2 = (alo*blo)&0xffffffff; +	r.hi = ahi*bhi + (ahi*blo>>32) + (alo*bhi>>32) + (lo1>>32); +	r.lo = (lo1<<32) + lo2; +	return r; +} + +/* returns a*b*2^-128 - e, with error 0 <= e < 7.  */ +static inline u128 mul128(u128 a, u128 b) +{ +	u128 hi = mul64_128(a.hi, b.hi); +	uint64_t m1 = mul64(a.hi, b.lo); +	uint64_t m2 = mul64(a.lo, b.hi); +	return add64(add64(hi, m1), m2); +} + +/* returns a*b % 2^128.  
*/ +static inline u128 mul128_tail(u128 a, u128 b) +{ +	u128 lo = mul64_128(a.lo, b.lo); +	lo.hi += a.hi*b.lo + a.lo*b.hi; +	return lo; +} + + +/* see sqrt.c for detailed comments.  */ + +long double sqrtl(long double x) +{ +	u128 ix, ml; +	uint64_t top; + +	ix = asu128(x); +	top = ix.hi >> 48; +	if (predict_false(top - 0x0001 >= 0x7fff - 0x0001)) { +		/* x < 0x1p-16382 or inf or nan.  */ +		if (2*ix.hi == 0 && ix.lo == 0) +			return x; +		if (ix.hi == 0x7fff000000000000 && ix.lo == 0) +			return x; +		if (top >= 0x7fff) +			return __math_invalidl(x); +		/* x is subnormal, normalize it.  */ +		ix = asu128(x * 0x1p112); +		top = ix.hi >> 48; +		top -= 112; +	} + +	/* x = 4^e m; with int e and m in [1, 4) */ +	int even = top & 1; +	ml = lsh(ix, 15); +	ml.hi |= 0x8000000000000000; +	if (even) ml = rsh(ml, 1); +	top = (top + 0x3fff) >> 1; + +	/* r ~ 1/sqrt(m) */ +	const uint64_t three = 0xc0000000; +	uint64_t r, s, d, u, i; +	i = (ix.hi >> 42) % 128; +	r = (uint32_t)__rsqrt_tab[i] << 16; +	/* |r sqrt(m) - 1| < 0x1p-8 */ +	s = mul32(ml.hi>>32, r); +	d = mul32(s, r); +	u = three - d; +	r = mul32(u, r) << 1; +	/* |r sqrt(m) - 1| < 0x1.7bp-16, switch to 64bit */ +	r = r<<32; +	s = mul64(ml.hi, r); +	d = mul64(s, r); +	u = (three<<32) - d; +	r = mul64(u, r) << 1; +	/* |r sqrt(m) - 1| < 0x1.a5p-31 */ +	s = mul64(u, s) << 1; +	d = mul64(s, r); +	u = (three<<32) - d; +	r = mul64(u, r) << 1; +	/* |r sqrt(m) - 1| < 0x1.c001p-59, switch to 128bit */ + +	const u128 threel = {.hi=three<<32, .lo=0}; +	u128 rl, sl, dl, ul; +	rl.hi = r; +	rl.lo = 0; +	sl = mul128(ml, rl); +	dl = mul128(sl, rl); +	ul = sub128(threel, dl); +	sl = mul128(ul, sl); /* repr: 3.125 */ +	/* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */ +	sl = rsh(sub64(sl, 4), 125-(LDBL_MANT_DIG-1)); +	/* s < sqrt(m) < s + 1 ULP + tiny */ + +	long double y; +	u128 d2, d1, d0; +	d0 = sub128(lsh(ml, 2*(LDBL_MANT_DIG-1)-126), mul128_tail(sl,sl)); +	d1 = sub128(sl, d0); +	d2 = add128(add64(sl, 1), d1); +	sl = add64(sl, d1.hi >> 63); 
+	y = mkldbl(top, sl); +	if (FENV_SUPPORT) { +		/* handle rounding modes and inexact exception.  */ +		top = predict_false((d2.hi|d2.lo)==0) ? 0 : 1; +		top |= ((d1.hi^d2.hi)&0x8000000000000000) >> 48; +		y += mkldbl(top, (u128){0}); +	} +	return y; +} +#else +#error unsupported long double format +#endif diff --git a/src/math/x32/lrintl.s b/src/math/x32/lrintl.s index ee97d1cf..d4355c32 100644 --- a/src/math/x32/lrintl.s +++ b/src/math/x32/lrintl.s @@ -2,6 +2,6 @@  .type lrintl,@function  lrintl:  	fldt 8(%esp) -	fistpll 8(%esp) -	mov 8(%esp),%rax +	fistpl 8(%esp) +	movl 8(%esp),%eax  	ret diff --git a/src/math/x86_64/fabs.c b/src/math/x86_64/fabs.c new file mode 100644 index 00000000..16562477 --- /dev/null +++ b/src/math/x86_64/fabs.c @@ -0,0 +1,10 @@ +#include <math.h> + +double fabs(double x) +{ +	double t; +	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0 +	__asm__ ("psrlq   $1, %0" : "+x"(t));          // t >>= 1 +	__asm__ ("andps   %1, %0" : "+x"(x) : "x"(t)); // x &= t +	return x; +} diff --git a/src/math/x86_64/fabs.s b/src/math/x86_64/fabs.s deleted file mode 100644 index 5715005e..00000000 --- a/src/math/x86_64/fabs.s +++ /dev/null @@ -1,9 +0,0 @@ -.global fabs -.type fabs,@function -fabs: -	xor %eax,%eax -	dec %rax -	shr %rax -	movq %rax,%xmm1 -	andpd %xmm1,%xmm0 -	ret diff --git a/src/math/x86_64/fabsf.c b/src/math/x86_64/fabsf.c new file mode 100644 index 00000000..36ea7481 --- /dev/null +++ b/src/math/x86_64/fabsf.c @@ -0,0 +1,10 @@ +#include <math.h> + +float fabsf(float x) +{ +	float t; +	__asm__ ("pcmpeqd %0, %0" : "=x"(t));          // t = ~0 +	__asm__ ("psrld   $1, %0" : "+x"(t));          // t >>= 1 +	__asm__ ("andps   %1, %0" : "+x"(x) : "x"(t)); // x &= t +	return x; +} diff --git a/src/math/x86_64/fabsf.s b/src/math/x86_64/fabsf.s deleted file mode 100644 index 501a1f17..00000000 --- a/src/math/x86_64/fabsf.s +++ /dev/null @@ -1,7 +0,0 @@ -.global fabsf -.type fabsf,@function -fabsf: -	mov $0x7fffffff,%eax -	movq %rax,%xmm1 -	andps 
%xmm1,%xmm0 -	ret diff --git a/src/math/x86_64/fabsl.c b/src/math/x86_64/fabsl.c new file mode 100644 index 00000000..cc1c9ed9 --- /dev/null +++ b/src/math/x86_64/fabsl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double fabsl(long double x) +{ +	__asm__ ("fabs" : "+t"(x)); +	return x; +} diff --git a/src/math/x86_64/fabsl.s b/src/math/x86_64/fabsl.s deleted file mode 100644 index 4e7ab525..00000000 --- a/src/math/x86_64/fabsl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global fabsl -.type fabsl,@function -fabsl: -	fldt 8(%rsp) -	fabs -	ret diff --git a/src/math/x86_64/fmodl.c b/src/math/x86_64/fmodl.c new file mode 100644 index 00000000..3daeab06 --- /dev/null +++ b/src/math/x86_64/fmodl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double fmodl(long double x, long double y) +{ +	unsigned short fpsr; +	do __asm__ ("fprem; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} diff --git a/src/math/x86_64/fmodl.s b/src/math/x86_64/fmodl.s deleted file mode 100644 index ea07b402..00000000 --- a/src/math/x86_64/fmodl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global fmodl -.type fmodl,@function -fmodl: -	fldt 24(%rsp) -	fldt 8(%rsp) -1:	fprem -	fnstsw %ax -	testb $4,%ah -	jnz 1b -	fstp %st(1) -	ret diff --git a/src/math/x86_64/llrint.c b/src/math/x86_64/llrint.c new file mode 100644 index 00000000..dd38a722 --- /dev/null +++ b/src/math/x86_64/llrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrint(double x) +{ +	long long r; +	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x)); +	return r; +} diff --git a/src/math/x86_64/llrint.s b/src/math/x86_64/llrint.s deleted file mode 100644 index bf476498..00000000 --- a/src/math/x86_64/llrint.s +++ /dev/null @@ -1,5 +0,0 @@ -.global llrint -.type llrint,@function -llrint: -	cvtsd2si %xmm0,%rax -	ret diff --git a/src/math/x86_64/llrintf.c b/src/math/x86_64/llrintf.c new file mode 100644 index 00000000..fc8625e8 --- /dev/null +++ b/src/math/x86_64/llrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long 
llrintf(float x) +{ +	long long r; +	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x)); +	return r; +} diff --git a/src/math/x86_64/llrintf.s b/src/math/x86_64/llrintf.s deleted file mode 100644 index d7204ac0..00000000 --- a/src/math/x86_64/llrintf.s +++ /dev/null @@ -1,5 +0,0 @@ -.global llrintf -.type llrintf,@function -llrintf: -	cvtss2si %xmm0,%rax -	ret diff --git a/src/math/x86_64/llrintl.c b/src/math/x86_64/llrintl.c new file mode 100644 index 00000000..c439ef28 --- /dev/null +++ b/src/math/x86_64/llrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long long llrintl(long double x) +{ +	long long r; +	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git a/src/math/x86_64/llrintl.s b/src/math/x86_64/llrintl.s deleted file mode 100644 index 1ec0817d..00000000 --- a/src/math/x86_64/llrintl.s +++ /dev/null @@ -1,7 +0,0 @@ -.global llrintl -.type llrintl,@function -llrintl: -	fldt 8(%rsp) -	fistpll 8(%rsp) -	mov 8(%rsp),%rax -	ret diff --git a/src/math/x86_64/lrint.c b/src/math/x86_64/lrint.c new file mode 100644 index 00000000..a742fec6 --- /dev/null +++ b/src/math/x86_64/lrint.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrint(double x) +{ +	long r; +	__asm__ ("cvtsd2si %1, %0" : "=r"(r) : "x"(x)); +	return r; +} diff --git a/src/math/x86_64/lrint.s b/src/math/x86_64/lrint.s deleted file mode 100644 index 15fc2454..00000000 --- a/src/math/x86_64/lrint.s +++ /dev/null @@ -1,5 +0,0 @@ -.global lrint -.type lrint,@function -lrint: -	cvtsd2si %xmm0,%rax -	ret diff --git a/src/math/x86_64/lrintf.c b/src/math/x86_64/lrintf.c new file mode 100644 index 00000000..2ba5639d --- /dev/null +++ b/src/math/x86_64/lrintf.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintf(float x) +{ +	long r; +	__asm__ ("cvtss2si %1, %0" : "=r"(r) : "x"(x)); +	return r; +} diff --git a/src/math/x86_64/lrintf.s b/src/math/x86_64/lrintf.s deleted file mode 100644 index 488423d2..00000000 --- a/src/math/x86_64/lrintf.s +++ /dev/null @@ -1,5 +0,0 @@ -.global lrintf -.type 
lrintf,@function -lrintf: -	cvtss2si %xmm0,%rax -	ret diff --git a/src/math/x86_64/lrintl.c b/src/math/x86_64/lrintl.c new file mode 100644 index 00000000..068e2e4d --- /dev/null +++ b/src/math/x86_64/lrintl.c @@ -0,0 +1,8 @@ +#include <math.h> + +long lrintl(long double x) +{ +	long r; +	__asm__ ("fistpll %0" : "=m"(r) : "t"(x) : "st"); +	return r; +} diff --git a/src/math/x86_64/lrintl.s b/src/math/x86_64/lrintl.s deleted file mode 100644 index d587b12b..00000000 --- a/src/math/x86_64/lrintl.s +++ /dev/null @@ -1,7 +0,0 @@ -.global lrintl -.type lrintl,@function -lrintl: -	fldt 8(%rsp) -	fistpll 8(%rsp) -	mov 8(%rsp),%rax -	ret diff --git a/src/math/x86_64/remainderl.c b/src/math/x86_64/remainderl.c new file mode 100644 index 00000000..8cf75071 --- /dev/null +++ b/src/math/x86_64/remainderl.c @@ -0,0 +1,9 @@ +#include <math.h> + +long double remainderl(long double x, long double y) +{ +	unsigned short fpsr; +	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(x), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	return x; +} diff --git a/src/math/x86_64/remainderl.s b/src/math/x86_64/remainderl.s deleted file mode 100644 index cb3857b4..00000000 --- a/src/math/x86_64/remainderl.s +++ /dev/null @@ -1,11 +0,0 @@ -.global remainderl -.type remainderl,@function -remainderl: -	fldt 24(%rsp) -	fldt 8(%rsp) -1:	fprem1 -	fnstsw %ax -	testb $4,%ah -	jnz 1b -	fstp %st(1) -	ret diff --git a/src/math/x86_64/remquol.c b/src/math/x86_64/remquol.c new file mode 100644 index 00000000..60eef089 --- /dev/null +++ b/src/math/x86_64/remquol.c @@ -0,0 +1,32 @@ +#include <math.h> + +long double remquol(long double x, long double y, int *quo) +{ +	signed char *cx = (void *)&x, *cy = (void *)&y; +	/* By ensuring that addresses of x and y cannot be discarded, +	 * this empty asm guides GCC into representing extraction of +	 * their sign bits as memory loads rather than making x and y +	 * not-address-taken internally and using bitfield operations, +	 * which in the end wouldn't work out, as 
extraction from FPU +	 * registers needs to go through memory anyway. This way GCC +	 * should manage to use incoming stack slots without spills. */ +	__asm__ ("" :: "X"(cx), "X"(cy)); + +	long double t = x; +	unsigned fpsr; +	do __asm__ ("fprem1; fnstsw %%ax" : "+t"(t), "=a"(fpsr) : "u"(y)); +	while (fpsr & 0x400); +	/* C0, C1, C3 flags in x87 status word carry low bits of quotient: +	 * 15 14 13 12 11 10  9  8 +	 *  . C3  .  .  . C2 C1 C0 +	 *  . b1  .  .  .  0 b0 b2 */ +	unsigned char i = fpsr >> 8; +	i = i>>4 | i<<4; +	/* i[5:2] is now {b0 b2 ? b1}. Retrieve {0 b2 b1 b0} via +	 * in-register table lookup. */ +	unsigned qbits = 0x7575313164642020 >> (i & 60); +	qbits &= 7; + +	*quo = (cx[9]^cy[9]) < 0 ? -qbits : qbits; +	return t; +} diff --git a/src/math/x86_64/rintl.c b/src/math/x86_64/rintl.c new file mode 100644 index 00000000..e1a92077 --- /dev/null +++ b/src/math/x86_64/rintl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double rintl(long double x) +{ +	__asm__ ("frndint" : "+t"(x)); +	return x; +} diff --git a/src/math/x86_64/rintl.s b/src/math/x86_64/rintl.s deleted file mode 100644 index 64e663cd..00000000 --- a/src/math/x86_64/rintl.s +++ /dev/null @@ -1,6 +0,0 @@ -.global rintl -.type rintl,@function -rintl: -	fldt 8(%rsp) -	frndint -	ret diff --git a/src/math/x86_64/sqrt.c b/src/math/x86_64/sqrt.c new file mode 100644 index 00000000..657e09e3 --- /dev/null +++ b/src/math/x86_64/sqrt.c @@ -0,0 +1,7 @@ +#include <math.h> + +double sqrt(double x) +{ +	__asm__ ("sqrtsd %1, %0" : "=x"(x) : "x"(x)); +	return x; +} diff --git a/src/math/x86_64/sqrt.s b/src/math/x86_64/sqrt.s deleted file mode 100644 index d3c609f9..00000000 --- a/src/math/x86_64/sqrt.s +++ /dev/null @@ -1,4 +0,0 @@ -.global sqrt -.type sqrt,@function -sqrt:	sqrtsd %xmm0, %xmm0 -	ret diff --git a/src/math/x86_64/sqrtf.c b/src/math/x86_64/sqrtf.c new file mode 100644 index 00000000..720baec6 --- /dev/null +++ b/src/math/x86_64/sqrtf.c @@ -0,0 +1,7 @@ +#include <math.h> + +float sqrtf(float x) 
+{ +	__asm__ ("sqrtss %1, %0" : "=x"(x) : "x"(x)); +	return x; +} diff --git a/src/math/x86_64/sqrtf.s b/src/math/x86_64/sqrtf.s deleted file mode 100644 index eec48c60..00000000 --- a/src/math/x86_64/sqrtf.s +++ /dev/null @@ -1,4 +0,0 @@ -.global sqrtf -.type sqrtf,@function -sqrtf:  sqrtss %xmm0, %xmm0 -	ret diff --git a/src/math/x86_64/sqrtl.c b/src/math/x86_64/sqrtl.c new file mode 100644 index 00000000..864cfcc4 --- /dev/null +++ b/src/math/x86_64/sqrtl.c @@ -0,0 +1,7 @@ +#include <math.h> + +long double sqrtl(long double x) +{ +	__asm__ ("fsqrt" : "+t"(x)); +	return x; +} diff --git a/src/math/x86_64/sqrtl.s b/src/math/x86_64/sqrtl.s deleted file mode 100644 index 23cd687d..00000000 --- a/src/math/x86_64/sqrtl.s +++ /dev/null @@ -1,5 +0,0 @@ -.global sqrtl -.type sqrtl,@function -sqrtl:	fldt 8(%rsp) -	fsqrt -	ret | 
