From 7513d3ecabb998e2c8c4cb9ed5de48c4b64a166b Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 19 Mar 2012 23:53:52 -0400 Subject: remquo asm: return quotient mod 8, as intended by the spec this is a lot more efficient and also what is generally wanted. perhaps the bit shuffling could be more efficient... --- src/math/i386/remquo.s | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'src') diff --git a/src/math/i386/remquo.s b/src/math/i386/remquo.s index 86ae2dc3..37a24450 100644 --- a/src/math/i386/remquo.s +++ b/src/math/i386/remquo.s @@ -2,40 +2,49 @@ .type remquof,@function remquof: mov 12(%esp),%ecx - fldl 4(%esp) fldl 8(%esp) + fldl 4(%esp) + mov 11(%esp),%dh + xor 7(%esp),%dh jmp 1f .global remquol .type remquol,@function remquol: mov 28(%esp),%ecx - fldl 4(%esp) fldl 16(%esp) + fldl 4(%esp) + mov 25(%esp),%dh + xor 13(%esp),%dh jmp 1f .global remquo .type remquo,@function remquo: mov 20(%esp),%ecx - fldl 4(%esp) fldl 12(%esp) -1: fld %st(1) + fldl 4(%esp) + mov 19(%esp),%dh + xor 11(%esp),%dh 1: fprem1 fnstsw %ax sahf jp 1b - fsubr %st(0),%st(2) - fxch %st(2) - fdivp - mov $0x4f000000,%eax - mov %eax,4(%esp) - flds 4(%esp) - fxch %st(1) -1: fprem - fnstsw %ax - sahf - jp 1b - fistpl (%ecx) - fstp %st(0) + fstp %st(1) + mov %ah,%dl + shr %dl + and $1,%dl + mov %ah,%al + shr $5,%al + and $2,%al + or %al,%dl + mov %ah,%al + shl $2,%al + and $4,%al + or %al,%dl + test %dh,%dh + jns 1f + neg %dl +1: movsbl %dl,%edx + mov %edx,(%ecx) ret -- cgit v1.2.1 From baa43bca0a051e8deb0d6a9a8882ceeea5c27249 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Tue, 20 Mar 2012 00:51:32 -0400 Subject: optimize scalbn family the fscale instruction is slow everywhere, probably because it involves a costly and unnecessary integer truncation operation that ends up being a no-op in common usages. instead, construct a floating point scale value with integer arithmetic and simply multiply by it, when possible. 
for float and double, this is always possible by going to the next-larger type. we use some cheap but effective saturating arithmetic tricks to make sure even very large-magnitude exponents fit. for long double, if the scaling exponent is too large to fit in the exponent of a long double value, we simply fall back to the expensive fscale method. on atom cpu, these changes speed up scalbn by over 30%. (min rdtsc timing dropped from 110 cycles to 70 cycles.) --- src/math/i386/scalbn.s | 19 ++++++++++++++++--- src/math/i386/scalbnf.s | 18 +++++++++++++++--- src/math/i386/scalbnl.s | 16 +++++++++++++++- 3 files changed, 46 insertions(+), 7 deletions(-) (limited to 'src') diff --git a/src/math/i386/scalbn.s b/src/math/i386/scalbn.s index e275d14f..8bf302f2 100644 --- a/src/math/i386/scalbn.s +++ b/src/math/i386/scalbn.s @@ -11,10 +11,23 @@ scalbln: .global scalbn .type scalbn,@function scalbn: - fildl 12(%esp) + mov 12(%esp),%eax + add $0x3ffe,%eax + cmp $0x7ffd,%eax + jb 1f + sub $0x3ffe,%eax + sar $31,%eax + xor $0xfff,%eax + add $0x3ffe,%eax +1: inc %eax fldl 4(%esp) - fscale - fstp %st(1) + mov %eax,12(%esp) + mov $0x80000000,%eax + mov %eax,8(%esp) + xor %eax,%eax + mov %eax,4(%esp) + fldt 4(%esp) + fmulp fstpl 4(%esp) fldl 4(%esp) ret diff --git a/src/math/i386/scalbnf.s b/src/math/i386/scalbnf.s index 40232b6a..9cb9ef5f 100644 --- a/src/math/i386/scalbnf.s +++ b/src/math/i386/scalbnf.s @@ -11,10 +11,22 @@ scalblnf: .global scalbnf .type scalbnf,@function scalbnf: - fildl 8(%esp) + mov 8(%esp),%eax + add $0x3fe,%eax + cmp $0x7fd,%eax + jb 1f + sub $0x3fe,%eax + sar $31,%eax + xor $0x1ff,%eax + add $0x3fe,%eax +1: inc %eax + shl $20,%eax flds 4(%esp) - fscale - fstp %st(1) + mov %eax,8(%esp) + xor %eax,%eax + mov %eax,4(%esp) + fldl 4(%esp) + fmulp fstps 4(%esp) flds 4(%esp) ret diff --git a/src/math/i386/scalbnl.s b/src/math/i386/scalbnl.s index 224b1bef..54414c2e 100644 --- a/src/math/i386/scalbnl.s +++ b/src/math/i386/scalbnl.s @@ -11,7 +11,21 @@ scalblnl: 
.global scalbnl .type scalbnl,@function scalbnl: - fildl 16(%esp) + mov 16(%esp),%eax + add $0x3ffe,%eax + cmp $0x7ffd,%eax + jae 1f + inc %eax + fldt 4(%esp) + mov %eax,12(%esp) + mov $0x80000000,%eax + mov %eax,8(%esp) + xor %eax,%eax + mov %eax,4(%esp) + fldt 4(%esp) + fmulp + ret +1: fildl 16(%esp) fldt 4(%esp) fscale fstp %st(1) -- cgit v1.2.1