From 07039ed8563b850624146c938ae201a1099d2f75 Mon Sep 17 00:00:00 2001 From: Szabolcs Nagy Date: Thu, 5 Sep 2013 10:58:48 +0000 Subject: math: fix exp2l asm on x86 (raise underflow correctly) there were two problems: * omitted underflow on subnormal results: exp2l(-16383.5) was calculated as sqrt(2)*2^-16384, the last bits of sqrt(2) are zero so the down scaling does not underflow eventhough the result is in subnormal range * spurious underflow for subnormal inputs: exp2l(0x1p-16400) was evaluated as f2xm1(x)+1 and f2xm1 raised underflow (because inexact subnormal result) the first issue is fixed by raising underflow manually if x is in (-32768,-16382] and not integer (x-0x1p63+0x1p63 != x) the second issue is fixed by treating x in (-0x1p64,0x1p64) specially for these fixes the special case handling was completely rewritten --- src/math/i386/exp.s | 70 ++++++++++++++++++++++++--------------------- src/math/x86_64/exp2l.s | 75 ++++++++++++++++++++++++++----------------------- 2 files changed, 78 insertions(+), 67 deletions(-) diff --git a/src/math/i386/exp.s b/src/math/i386/exp.s index e5f54588..abb90369 100644 --- a/src/math/i386/exp.s +++ b/src/math/i386/exp.s @@ -95,42 +95,32 @@ exp: .type exp2,@function exp2: fldl 4(%esp) -1: pushl $0x467ff000 - flds (%esp) # 16380 - xorl %eax,%eax - pushl $0x80000000 - push %eax - fld %st(1) - fabs - fucomp %st(1) - fnstsw - fstp %st(0) - sahf - ja 3f # |x| > 16380 - jp 2f # x is nan (avoid invalid except in fistp) +1: sub $12,%esp fld %st(0) - fistpl 8(%esp) - fildl 8(%esp) - fxch %st(1) - fsub %st(1) - mov $0x3fff,%eax - add %eax,8(%esp) - f2xm1 - fld1 - faddp # 2^(x-rint(x)) - fldt (%esp) # 2^rint(x) - fmulp - fstp %st(1) -2: add $12,%esp - ret - -3: fld %st(0) fstpt (%esp) - fld1 mov 8(%esp),%ax and $0x7fff,%ax - cmp $0x7fff,%ax - je 1f # x = +-inf + cmp $0x3fff+13,%ax + jb 4f # |x| < 8192 + cmp $0x3fff+15,%ax + jae 3f # |x| >= 32768 + fsts (%esp) + cmpl $0xc67ff800,(%esp) + jb 2f # x > -16382 + movl $0x5f000000,(%esp) + flds (%esp) # 0x1p63 + fld %st(1) + fsub %st(1) + faddp + fucomp %st(1) + fnstsw + sahf + je 2f # x - 0x1p63 + 0x1p63 == x + movl $1,(%esp) + flds (%esp) # 0x1p-149 + fdiv %st(1) + fstps (%esp) # raise underflow +2: fld1 fld %st(1) frndint fxch %st(2) @@ -141,3 +131,19 @@ exp2: fstp %st(1) add $12,%esp ret +3: xor %eax,%eax +4: cmp $0x3fff-64,%ax + fld1 + jb 1b # |x| < 0x1p-64 + fstpt (%esp) + fistl 8(%esp) + fildl 8(%esp) + fsubrp %st(1) + addl $0x3fff,8(%esp) + f2xm1 + fld1 + faddp # 2^(x-rint(x)) + fldt (%esp) # 2^rint(x) + fmulp + add $12,%esp + ret diff --git a/src/math/x86_64/exp2l.s b/src/math/x86_64/exp2l.s index 1f8ed7bb..e7145881 100644 --- a/src/math/x86_64/exp2l.s +++ b/src/math/x86_64/exp2l.s @@ -26,44 +26,32 @@ expm1l: .type exp2l,@function exp2l: fldt 8(%rsp) -1: mov $0x467ff000,%eax - mov %eax,-16(%rsp) - mov $0x80000000,%eax - mov %eax,-20(%rsp) - xor %eax,%eax - mov %eax,-24(%rsp) - flds -16(%rsp) # 16380 +1: fld %st(0) + sub $16,%rsp + fstpt (%rsp) + mov 8(%rsp),%ax + and $0x7fff,%ax + cmp $0x3fff+13,%ax + jb 4f # |x| < 8192 + cmp $0x3fff+15,%ax + jae 3f # |x| >= 32768 + fsts (%rsp) + cmpl $0xc67ff800,(%rsp) + jb 2f # x > -16382 + movl $0x5f000000,(%rsp) + flds (%rsp) # 0x1p63 fld %st(1) - fabs - fucom %st(1) + fsub %st(1) + faddp + fucomp %st(1) fnstsw - fstp %st(0) - fstp %st(0) sahf - ja 3f # |x| > 16380 - jp 2f # x is nan (avoid invalid except in fistp) - fld %st(0) - fistpl -16(%rsp) - fildl -16(%rsp) - fxch %st(1) - fsub %st(1) - mov $0x3fff,%eax - add %eax,-16(%rsp) - f2xm1 - fld1 - faddp # 2^(x-rint(x)) - fldt -24(%rsp) # 2^rint(x) - fmulp -2: fstp %st(1) - ret - -3: fld %st(0) - fstpt -24(%rsp) - fld1 - mov -15(%rsp),%ax - and $0x7fff,%ax - cmp $0x7fff,%ax - je 1f # x = +-inf + je 2f # x - 0x1p63 + 0x1p63 == x + movl $1,(%rsp) + flds (%rsp) # 0x1p-149 + fdiv %st(1) + fstps (%rsp) # raise underflow +2: fld1 fld %st(1) frndint fxch %st(2) @@ -72,4 +60,21 @@ exp2l: faddp # 2^(x-rint(x)) 1: fscale fstp %st(1) + add $16,%rsp + ret +3: xor %eax,%eax +4: cmp $0x3fff-64,%ax + fld1 + jb 1b # |x| < 0x1p-64 + fstpt (%rsp) + fistl 8(%rsp) + fildl 8(%rsp) + fsubrp %st(1) + addl $0x3fff,8(%rsp) + f2xm1 + fld1 + faddp # 2^(x-rint(x)) + fldt (%rsp) # 2^rint(x) + fmulp + add $16,%rsp ret -- cgit v1.2.1