math: fix exp2l asm on x86 (raise underflow correctly)

there were two problems: * omitted underflow on subnormal results: exp2l(-16383.5) was calculated as sqrt(2)*2^-16384, the last bits of sqrt(2) are zero so the down scaling does not underflow eventhough the result is in subnormal range * spurious underflow for subnormal inputs: exp2l(0x1p-16400) was evaluated as f2xm1(x)+1 and f2xm1 raised underflow (because inexact subnormal result) the first issue is fixed by raising underflow manually if x is in (-32768,-16382] and not integer (x-0x1p63+0x1p63 != x) the second issue is fixed by treating x in (-0x1p64,0x1p64) specially for these fixes the special case handling was completely rewritten
13 years ago · 07039ed856
2 changed files with 78 additions and 67 deletions
--- a/src/math/i386/exp.s
+++ b/src/math/i386/exp.s
@ -95,42 +95,32 @@ exp:
 .type exp2,@function
 exp2:
 	fldl 4(%esp)
-1:	pushl $0x467ff000
-	flds (%esp)       # 16380
-	xorl %eax,%eax
-	pushl $0x80000000
-	push %eax
-	fld %st(1)
-	fabs
-	fucomp %st(1)
-	fnstsw
-	fstp %st(0)
-	sahf
-	ja 3f             # |x| > 16380
-	jp 2f             # x is nan (avoid invalid except in fistp)
+1:	sub $12,%esp
 	fld %st(0)
-	fistpl 8(%esp)
-	fildl 8(%esp)
-	fxch %st(1)
-	fsub %st(1)
-	mov $0x3fff,%eax
-	add %eax,8(%esp)
-	f2xm1
-	fld1
-	faddp             # 2^(x-rint(x))
-	fldt (%esp)       # 2^rint(x)
-	fmulp
-	fstp %st(1)
-2:	add $12,%esp
-	ret
-
-3:	fld %st(0)
 	fstpt (%esp)
-	fld1
 	mov 8(%esp),%ax
 	and $0x7fff,%ax
-	cmp $0x7fff,%ax
-	je 1f             # x = +-inf
+	cmp $0x3fff+13,%ax
+	jb 4f             # |x| < 8192
+	cmp $0x3fff+15,%ax
+	jae 3f            # |x| >= 32768
+	fsts (%esp)
+	cmpl $0xc67ff800,(%esp)
+	jb 2f             # x > -16382
+	movl $0x5f000000,(%esp)
+	flds (%esp)       # 0x1p63
+	fld %st(1)
+	fsub %st(1)
+	faddp
+	fucomp %st(1)
+	fnstsw
+	sahf
+	je 2f             # x - 0x1p63 + 0x1p63 == x
+	movl $1,(%esp)
+	flds (%esp)       # 0x1p-149
+	fdiv %st(1)
+	fstps (%esp)      # raise underflow
+2:	fld1
 	fld %st(1)
 	frndint
 	fxch %st(2)
@ -141,3 +131,19 @@ exp2:
 	fstp %st(1)
 	add $12,%esp
 	ret
+3:	xor %eax,%eax
+4:	cmp $0x3fff-64,%ax
+	fld1
+	jb 1b             # |x| < 0x1p-64
+	fstpt (%esp)
+	fistl 8(%esp)
+	fildl 8(%esp)
+	fsubrp %st(1)
+	addl $0x3fff,8(%esp)
+	f2xm1
+	fld1
+	faddp             # 2^(x-rint(x))
+	fldt (%esp)       # 2^rint(x)
+	fmulp
+	add $12,%esp
+	ret
--- a/src/math/x86_64/exp2l.s
+++ b/src/math/x86_64/exp2l.s
@ -26,44 +26,32 @@ expm1l:
 .type exp2l,@function
 exp2l:
 	fldt 8(%rsp)
-1:	mov $0x467ff000,%eax
-	mov %eax,-16(%rsp)
-	mov $0x80000000,%eax
-	mov %eax,-20(%rsp)
-	xor %eax,%eax
-	mov %eax,-24(%rsp)
-	flds -16(%rsp)    # 16380
+1:	fld %st(0)
+	sub $16,%rsp
+	fstpt (%rsp)
+	mov 8(%rsp),%ax
+	and $0x7fff,%ax
+	cmp $0x3fff+13,%ax
+	jb 4f             # |x| < 8192
+	cmp $0x3fff+15,%ax
+	jae 3f            # |x| >= 32768
+	fsts (%rsp)
+	cmpl $0xc67ff800,(%rsp)
+	jb 2f             # x > -16382
+	movl $0x5f000000,(%rsp)
+	flds (%rsp)       # 0x1p63
 	fld %st(1)
-	fabs
-	fucom %st(1)
+	fsub %st(1)
+	faddp
+	fucomp %st(1)
 	fnstsw
-	fstp %st(0)
-	fstp %st(0)
 	sahf
-	ja 3f             # |x| > 16380
-	jp 2f             # x is nan (avoid invalid except in fistp)
-	fld %st(0)
-	fistpl -16(%rsp)
-	fildl -16(%rsp)
-	fxch %st(1)
-	fsub %st(1)
-	mov $0x3fff,%eax
-	add %eax,-16(%rsp)
-	f2xm1
-	fld1
-	faddp             # 2^(x-rint(x))
-	fldt -24(%rsp)    # 2^rint(x)
-	fmulp
-2:	fstp %st(1)
-	ret
-
-3:	fld %st(0)
-	fstpt -24(%rsp)
-	fld1
-	mov -15(%rsp),%ax
-	and $0x7fff,%ax
-	cmp $0x7fff,%ax
-	je 1f             # x = +-inf
+	je 2f             # x - 0x1p63 + 0x1p63 == x
+	movl $1,(%rsp)
+	flds (%rsp)       # 0x1p-149
+	fdiv %st(1)
+	fstps (%rsp)      # raise underflow
+2:	fld1
 	fld %st(1)
 	frndint
 	fxch %st(2)
@ -72,4 +60,21 @@ exp2l:
 	faddp             # 2^(x-rint(x))
 1:	fscale
 	fstp %st(1)
+	add $16,%rsp
+	ret
+3:	xor %eax,%eax
+4:	cmp $0x3fff-64,%ax
+	fld1
+	jb 1b             # |x| < 0x1p-64
+	fstpt (%rsp)
+	fistl 8(%rsp)
+	fildl 8(%rsp)
+	fsubrp %st(1)
+	addl $0x3fff,8(%rsp)
+	f2xm1
+	fld1
+	faddp             # 2^(x-rint(x))
+	fldt (%rsp)       # 2^rint(x)
+	fmulp
+	add $16,%rsp
 	ret