NetBSD Problem Report #51645
From www@NetBSD.org Wed Nov 23 10:54:12 2016
Return-Path: <www@NetBSD.org>
Received: from mail.netbsd.org (mail.netbsd.org [199.233.217.200])
(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
(Client CN "mail.netbsd.org", Issuer "Postmaster NetBSD.org" (verified OK))
by mollari.NetBSD.org (Postfix) with ESMTPS id 8C1F77A316
for <gnats-bugs@gnats.NetBSD.org>; Wed, 23 Nov 2016 10:54:12 +0000 (UTC)
Message-Id: <20161123105411.52A6B7A329@mollari.NetBSD.org>
Date: Wed, 23 Nov 2016 10:54:11 +0000 (UTC)
From: rokuyama@rk.phys.keio.ac.jp
Reply-To: rokuyama@rk.phys.keio.ac.jp
To: gnats-bugs@NetBSD.org
Subject: exponential and hyperbolic functions are slow on m68k FPU emulator
X-Send-Pr-Version: www-1.0
>Number: 51645
>Category: port-m68k
>Synopsis: exponential and hyperbolic functions are slow on m68k FPU emulator
>Confidential: no
>Severity: serious
>Priority: medium
>Responsible: isaki
>State: closed
>Class: change-request
>Submitter-Id: net
>Arrival-Date: Wed Nov 23 10:55:00 +0000 2016
>Closed-Date: Mon Dec 05 15:39:58 +0000 2016
>Last-Modified: Mon Dec 05 15:39:58 +0000 2016
>Originator: Rin Okuyama
>Release: 7.99.42
>Organization:
Faculty of Science and Technology, Keio University
>Environment:
NetBSD x68k 7.99.42 NetBSD 7.99.42 (GENERIC) #0: Wed Nov 23 18:04:04 JST 2016 rin@xxx:xxx x68k
>Description:
On FPU emulator for m68k, the exponential and hyperbolic functions
are evaluated by Taylor expansion around x = 0. Therefore, they get
slower as |x| increases.
In the attached patch, exp(x) is factorized into 2^k * exp(r) with
k = round(x / ln2) and r = x - k * ln2. Since |r| < 0.5, exp(r) is
easily evaluated by Taylor expansion. The hyperbolic functions are
calculated using exp(x). Then, calculation time is significantly
reduced for |x| >> 1.
Note that this algorithm is partially taken from libm, where exp(r)
is approximated by a rational function of r:
https://nxr.netbsd.org/source/xref/src/lib/libm/src/e_exp.c
>How-To-Repeat:
I prepared a test program:
http://www.netbsd.org/~rin/20161123_m68k_fpe/test.c
The usage is as follows.
% cc -O2 -lm test.c -o test
% ./test -{e|h} range count
Then it calculates exp(x) (with -e flag), or cosh(x) and sinh(x)
(with -f) for -range <= x <= range, divided into 2*count regions.
On 68040 @ 100MHz emulated by XM6i (http://xm6i.org/), the total
elapsed time is measured for the following commands:
(a) test -e 1000 1000
(b) test -h 1000 1000
(c) test -e 1 1000
(d) test -h 1 1000
Before applying the patch:
(a) 360 s, (b) 390 s, (c) 22 s, (d) 29 s
After applying the patch:
(a) 23 s, (b) 37 s, (c) 21 s, (d) 35 s
The calculation results with my patch are in good agreement with
those obtained without the patch.
>Fix:
--- src/sys/arch/m68k/fpe/fpu_exp.c.orig 2016-11-22 22:46:33.910890342 +0900
+++ src/sys/arch/m68k/fpe/fpu_exp.c 2016-11-23 01:25:37.345265106 +0900
@@ -100,12 +100,16 @@
}
/*
- * exp(x)
+ * exp(x) = 2^k * exp(r) with k = round(x / ln2) and r = x - k * ln2
+ *
+ * Algorithm partially taken from libm, where exp(r) is approximated by a
+ * rational function of r. We use the Taylor expansion instead.
*/
struct fpn *
fpu_etox(struct fpemu *fe)
{
- struct fpn *fp;
+ struct fpn x, *fp;
+ int j, k;
if (ISNAN(&fe->fe_f2))
return &fe->fe_f2;
@@ -115,19 +119,46 @@
return &fe->fe_f2;
}
- if (fe->fe_f2.fp_sign == 0) {
- /* exp(x) */
- fp = fpu_etox_taylor(fe);
- } else {
- /* 1/exp(-x) */
- fe->fe_f2.fp_sign = 0;
- fp = fpu_etox_taylor(fe);
+ CPYFPN(&x, &fe->fe_f2);
- CPYFPN(&fe->fe_f2, fp);
- fpu_const(&fe->fe_f1, FPU_CONST_1);
- fp = fpu_div(fe);
+ /* k = round(x / ln2) */
+ CPYFPN(&fe->fe_f1, &fe->fe_f2);
+ fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+ fp = fpu_div(fe);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_int(fe);
+ if (ISZERO(fp)) {
+ /* k = 0 */
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_etox_taylor(fe);
+ return fp;
+ }
+ j = FP_LG - fp->fp_exp;
+ if (j < 0) {
+ if (fp->fp_sign)
+ fpu_const(&fe->fe_f2, FPU_CONST_0); /* k < -2^18 */
+ else
+ fp->fp_class = FPC_INF; /* k > 2^18 */
+ return fp;
}
-
+ k = fp->fp_mant[0] >> j;
+ if (fp->fp_sign)
+ k *= -1;
+
+ /* exp(r) = exp(x - k * ln2) */
+ CPYFPN(&fe->fe_f1, fp);
+ fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+ fp = fpu_mul(fe);
+ fp->fp_sign = !fp->fp_sign;
+ CPYFPN(&fe->fe_f1, fp);
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_add(fe);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_etox_taylor(fe);
+
+ /* 2^k */
+ fp->fp_exp += k;
+
return fp;
}
--- src/sys/arch/m68k/fpe/fpu_hyperb.c.orig 2016-11-22 23:20:06.260199530 +0900
+++ src/sys/arch/m68k/fpe/fpu_hyperb.c 2016-11-23 01:38:13.988524951 +0900
@@ -137,71 +137,14 @@
}
/*
- * taylor expansion used by sinh(), cosh().
+ * exp(x) + exp(-x)
+ * cosh(x) = ------------------
+ * 2
*/
-static struct fpn *
-__fpu_sinhcosh_taylor(struct fpemu *fe, struct fpn *s0, uint32_t f)
-{
- struct fpn res;
- struct fpn x2;
- struct fpn *s1;
- struct fpn *r;
- int sign;
- uint32_t k;
-
- /* x2 := x * x */
- CPYFPN(&fe->fe_f1, &fe->fe_f2);
- r = fpu_mul(fe);
- CPYFPN(&x2, r);
-
- /* res := s0 */
- CPYFPN(&res, s0);
-
- sign = 1; /* sign := (-1)^n */
-
- for (; f < (2 * MAX_ITEMS); ) {
- /* (f1 :=) s0 * x^2 */
- CPYFPN(&fe->fe_f1, s0);
- CPYFPN(&fe->fe_f2, &x2);
- r = fpu_mul(fe);
- CPYFPN(&fe->fe_f1, r);
-
- /*
- * for sinh(), s1 := s0 * x^2 / (2n+1)2n
- * for cosh(), s1 := s0 * x^2 / 2n(2n-1)
- */
- k = f * (f + 1);
- fpu_explode(fe, &fe->fe_f2, FTYPE_LNG, &k);
- s1 = fpu_div(fe);
-
- /* break if s1 is enough small */
- if (ISZERO(s1))
- break;
- if (res.fp_exp - s1->fp_exp >= EXT_FRACBITS)
- break;
-
- /* s0 := s1 for next loop */
- CPYFPN(s0, s1);
-
- /* res += s1 */
- CPYFPN(&fe->fe_f2, s1);
- CPYFPN(&fe->fe_f1, &res);
- r = fpu_add(fe);
- CPYFPN(&res, r);
-
- f += 2;
- sign ^= 1;
- }
-
- CPYFPN(&fe->fe_f2, &res);
- return &fe->fe_f2;
-}
-
struct fpn *
fpu_cosh(struct fpemu *fe)
{
- struct fpn s0;
- struct fpn *r;
+ struct fpn x, *fp;
if (ISNAN(&fe->fe_f2))
return &fe->fe_f2;
@@ -211,17 +154,37 @@
return &fe->fe_f2;
}
- fpu_const(&s0, FPU_CONST_1);
- r = __fpu_sinhcosh_taylor(fe, &s0, 1);
+ /* if x is +0/-0, return 1 */ /* XXX is this necessary? */
+ if (ISZERO(&fe->fe_f2)) {
+ fpu_const(&fe->fe_f2, FPU_CONST_1);
+ return &fe->fe_f2;
+ }
- return r;
+ fp = fpu_etox(fe);
+ CPYFPN(&x, fp);
+
+ fpu_const(&fe->fe_f1, FPU_CONST_1);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_div(fe);
+
+ CPYFPN(&fe->fe_f1, fp);
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_add(fe);
+
+ fp->fp_exp--;
+
+ return fp;
}
+/*
+ * exp(x) - exp(-x)
+ * sinh(x) = ------------------
+ * 2
+ */
struct fpn *
fpu_sinh(struct fpemu *fe)
{
- struct fpn s0;
- struct fpn *r;
+ struct fpn x, *fp;
if (ISNAN(&fe->fe_f2))
return &fe->fe_f2;
@@ -232,12 +195,28 @@
if (ISZERO(&fe->fe_f2))
return &fe->fe_f2;
- CPYFPN(&s0, &fe->fe_f2);
- r = __fpu_sinhcosh_taylor(fe, &s0, 2);
+ fp = fpu_etox(fe);
+ CPYFPN(&x, fp);
- return r;
+ fpu_const(&fe->fe_f1, FPU_CONST_1);
+ CPYFPN(&fe->fe_f2, fp);
+ fp = fpu_div(fe);
+
+ fp->fp_sign = 1;
+ CPYFPN(&fe->fe_f1, fp);
+ CPYFPN(&fe->fe_f2, &x);
+ fp = fpu_add(fe);
+
+ fp->fp_exp--;
+
+ return fp;
}
+/*
+ * sinh(x)
+ * tanh(x) = ---------
+ * cosh(x)
+ */
struct fpn *
fpu_tanh(struct fpemu *fe)
{
>Release-Note:
>Audit-Trail:
Responsible-Changed-From-To: port-m68k-maintainer->isaki
Responsible-Changed-By: isaki@NetBSD.org
Responsible-Changed-When: Mon, 28 Nov 2016 06:40:28 +0000
Responsible-Changed-Why:
I'll take it.
State-Changed-From-To: open->analyzed
State-Changed-By: isaki@NetBSD.org
State-Changed-When: Mon, 28 Nov 2016 06:40:28 +0000
State-Changed-Why:
From: Tetsuya Isaki <isaki@pastel-flower.jp>
To: gnats-bugs@NetBSD.org
Cc: isaki@NetBSD.org, port-m68k-maintainer@netbsd.org,
netbsd-bugs@netbsd.org,
gnats-admin@netbsd.org,
rokuyama@rk.phys.keio.ac.jp
Subject: Re: port-m68k/51645 (exponential and hyperbolic functions are slow on m68k FPU emulator)
Date: Mon, 05 Dec 2016 21:49:10 +0900
It's a nice patch!
It's 10..100 times faster as you wrote.
The accuracy decreased slightly in extended precision but
I think it's not a big problem because faster is better.
I'll add the following diffs to your patch and commit.
--- fpu_exp.c 2016-12-02 15:50:41.000000000 +0900
+++ fpu_exp.c 2016-12-02 15:52:14.000000000 +0900
@@ -133,10 +133,11 @@
fp = fpu_etox_taylor(fe);
return fp;
}
+ /* extract k as integer format from fpn format */
j = FP_LG - fp->fp_exp;
if (j < 0) {
if (fp->fp_sign)
- fpu_const(&fe->fe_f2, FPU_CONST_0); /* k < -2^18 */
+ fp->fp_class = FPC_ZERO; /* k < -2^18 */
else
fp->fp_class = FPC_INF; /* k > 2^18 */
return fp;
--- fpu_hyperb.c 2016-11-28 13:55:22.000000000 +0900
+++ fpu_hyperb.c 2016-12-02 15:54:43.000000000 +0900
@@ -63,9 +63,6 @@
#include "fpu_emulate.h"
-/* The number of items to terminate the Taylor expansion */
-#define MAX_ITEMS (2000)
-
/*
* fpu_hyperb.c: defines the following functions
*
---
Tetsuya Isaki <isaki@pastel-flower.jp / isaki@NetBSD.org>
From: "Tetsuya Isaki" <isaki@netbsd.org>
To: gnats-bugs@gnats.NetBSD.org
Cc:
Subject: PR/51645 CVS commit: src/sys/arch/m68k/fpe
Date: Mon, 5 Dec 2016 15:31:01 +0000
Module Name: src
Committed By: isaki
Date: Mon Dec 5 15:31:01 UTC 2016
Modified Files:
src/sys/arch/m68k/fpe: fpu_exp.c fpu_hyperb.c
Log Message:
Improve the exponential and hyperbolic function's performance
10..100 times faster.
PR port-m68k/51645 from rin@ (and modified by me)
To generate a diff of this commit:
cvs rdiff -u -r1.8 -r1.9 src/sys/arch/m68k/fpe/fpu_exp.c
cvs rdiff -u -r1.16 -r1.17 src/sys/arch/m68k/fpe/fpu_hyperb.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
State-Changed-From-To: analyzed->closed
State-Changed-By: isaki@NetBSD.org
State-Changed-When: Mon, 05 Dec 2016 15:39:58 +0000
State-Changed-Why:
commited.
>Unformatted:
(Contact us)
$NetBSD: query-full-pr,v 1.39 2013/11/01 18:47:49 spz Exp $
$NetBSD: gnats_config.sh,v 1.8 2006/05/07 09:23:38 tsutsui Exp $
Copyright © 1994-2014
The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.