NetBSD Problem Report #51645

From www@NetBSD.org  Wed Nov 23 10:54:12 2016
Return-Path: <www@NetBSD.org>
Received: from mail.netbsd.org (mail.netbsd.org [199.233.217.200])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(Client CN "mail.netbsd.org", Issuer "Postmaster NetBSD.org" (verified OK))
	by mollari.NetBSD.org (Postfix) with ESMTPS id 8C1F77A316
	for <gnats-bugs@gnats.NetBSD.org>; Wed, 23 Nov 2016 10:54:12 +0000 (UTC)
Message-Id: <20161123105411.52A6B7A329@mollari.NetBSD.org>
Date: Wed, 23 Nov 2016 10:54:11 +0000 (UTC)
From: rokuyama@rk.phys.keio.ac.jp
Reply-To: rokuyama@rk.phys.keio.ac.jp
To: gnats-bugs@NetBSD.org
Subject: exponential and hyperbolic functions are slow on m68k FPU emulator
X-Send-Pr-Version: www-1.0

>Number:         51645
>Category:       port-m68k
>Synopsis:       exponential and hyperbolic functions are slow on m68k FPU emulator
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    isaki
>State:          closed
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Wed Nov 23 10:55:00 +0000 2016
>Closed-Date:    Mon Dec 05 15:39:58 +0000 2016
>Last-Modified:  Mon Dec 05 15:39:58 +0000 2016
>Originator:     Rin Okuyama
>Release:        7.99.42
>Organization:
Faculty of Science and Technology, Keio University
>Environment:
NetBSD x68k 7.99.42 NetBSD 7.99.42 (GENERIC) #0: Wed Nov 23 18:04:04 JST 2016  rin@xxx:xxx x68k
>Description:
On FPU emulator for m68k, the exponential and hyperbolic functions
are evaluated by Taylor expansion around x = 0. Therefore, they get
slower as |x| increases.

In the attached patch, exp(x) is factorized into 2^k * exp(r) with
k = round(x / ln2) and r = x - k * ln2. Since |r| < 0.5, exp(r) is
easily evaluated by Taylor expansion. The hyperbolic functions are
calculated using exp(x). Then, calculation time is significantly
reduced for |x| >> 1.

Note that this algorithm is partially taken from libm, where exp(r)
is approximated by a rational function of r:

  https://nxr.netbsd.org/source/xref/src/lib/libm/src/e_exp.c
>How-To-Repeat:
I prepared a test program:

  http://www.netbsd.org/~rin/20161123_m68k_fpe/test.c

The usage is as follows.

  % cc -O2 -lm test.c -o test
  % ./test -{e|h} range count

Then it calculates exp(x) (with -e flag), or cosh(x) and sinh(x)
(with -f) for -range <= x <= range, divided into 2*count regions.

On 68040 @ 100MHz emulated by XM6i (http://xm6i.org/), the total
elapsed time is measured for the following commands:

  (a) test -e 1000 1000
  (b) test -h 1000 1000
  (c) test -e 1    1000
  (d) test -h 1    1000

Before applying the patch:

  (a) 360 s, (b) 390 s, (c) 22 s, (d) 29 s

After applying the patch:

  (a)  23 s, (b)  37 s, (c) 21 s, (d) 35 s

The calculation results with my patch are in good agreement with
those obtained without the patch.
>Fix:
--- src/sys/arch/m68k/fpe/fpu_exp.c.orig	2016-11-22 22:46:33.910890342 +0900
+++ src/sys/arch/m68k/fpe/fpu_exp.c	2016-11-23 01:25:37.345265106 +0900
@@ -100,12 +100,16 @@
 }

 /*
- * exp(x)
+ * exp(x) = 2^k * exp(r) with k = round(x / ln2) and r = x - k * ln2
+ *
+ * Algorithm partially taken from libm, where exp(r) is approximated by a
+ * rational function of r. We use the Taylor expansion instead.
  */
 struct fpn *
 fpu_etox(struct fpemu *fe)
 {
-	struct fpn *fp;
+	struct fpn x, *fp;
+	int j, k;

 	if (ISNAN(&fe->fe_f2))
 		return &fe->fe_f2;
@@ -115,19 +119,46 @@
 		return &fe->fe_f2;
 	}

-	if (fe->fe_f2.fp_sign == 0) {
-		/* exp(x) */
-		fp = fpu_etox_taylor(fe);
-	} else {
-		/* 1/exp(-x) */
-		fe->fe_f2.fp_sign = 0;
-		fp = fpu_etox_taylor(fe);
+	CPYFPN(&x, &fe->fe_f2);

-		CPYFPN(&fe->fe_f2, fp);
-		fpu_const(&fe->fe_f1, FPU_CONST_1);
-		fp = fpu_div(fe);
+	/* k = round(x / ln2) */
+	CPYFPN(&fe->fe_f1, &fe->fe_f2);
+	fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+	fp = fpu_div(fe);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_int(fe);
+	if (ISZERO(fp)) {
+		/* k = 0 */
+		CPYFPN(&fe->fe_f2, &x);
+		fp = fpu_etox_taylor(fe);
+		return fp;
+	}
+	j = FP_LG - fp->fp_exp;
+	if (j < 0) {
+		if (fp->fp_sign)
+			fpu_const(&fe->fe_f2, FPU_CONST_0);	/* k < -2^18 */
+		else
+			fp->fp_class = FPC_INF;			/* k >  2^18 */
+		return fp;
 	}
-	
+	k = fp->fp_mant[0] >> j;
+	if (fp->fp_sign)
+		k *= -1;
+
+	/* exp(r) = exp(x - k * ln2) */
+	CPYFPN(&fe->fe_f1, fp);
+	fpu_const(&fe->fe_f2, FPU_CONST_LN_2);
+	fp = fpu_mul(fe);
+	fp->fp_sign = !fp->fp_sign;
+	CPYFPN(&fe->fe_f1, fp);
+	CPYFPN(&fe->fe_f2, &x);
+	fp = fpu_add(fe);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_etox_taylor(fe);
+
+	/* 2^k */
+	fp->fp_exp += k;
+
 	return fp;
 }

--- src/sys/arch/m68k/fpe/fpu_hyperb.c.orig	2016-11-22 23:20:06.260199530 +0900
+++ src/sys/arch/m68k/fpe/fpu_hyperb.c	2016-11-23 01:38:13.988524951 +0900
@@ -137,71 +137,14 @@
 }

 /*
- * taylor expansion used by sinh(), cosh().
+ *            exp(x) + exp(-x)
+ * cosh(x) = ------------------
+ *                   2
  */
-static struct fpn *
-__fpu_sinhcosh_taylor(struct fpemu *fe, struct fpn *s0, uint32_t f)
-{
-	struct fpn res;
-	struct fpn x2;
-	struct fpn *s1;
-	struct fpn *r;
-	int sign;
-	uint32_t k;
-
-	/* x2 := x * x */
-	CPYFPN(&fe->fe_f1, &fe->fe_f2);
-	r = fpu_mul(fe);
-	CPYFPN(&x2, r);
-
-	/* res := s0 */
-	CPYFPN(&res, s0);
-
-	sign = 1;	/* sign := (-1)^n */
-
-	for (; f < (2 * MAX_ITEMS); ) {
-		/* (f1 :=) s0 * x^2 */
-		CPYFPN(&fe->fe_f1, s0);
-		CPYFPN(&fe->fe_f2, &x2);
-		r = fpu_mul(fe);
-		CPYFPN(&fe->fe_f1, r);
-
-		/*
-		 * for sinh(),  s1 := s0 * x^2 / (2n+1)2n
-		 * for cosh(),  s1 := s0 * x^2 / 2n(2n-1)
-		 */
-		k = f * (f + 1);
-		fpu_explode(fe, &fe->fe_f2, FTYPE_LNG, &k);
-		s1 = fpu_div(fe);
-
-		/* break if s1 is enough small */
-		if (ISZERO(s1))
-			break;
-		if (res.fp_exp - s1->fp_exp >= EXT_FRACBITS)
-			break;
-
-		/* s0 := s1 for next loop */
-		CPYFPN(s0, s1);
-
-		/* res += s1 */
-		CPYFPN(&fe->fe_f2, s1);
-		CPYFPN(&fe->fe_f1, &res);
-		r = fpu_add(fe);
-		CPYFPN(&res, r);
-
-		f += 2;
-		sign ^= 1;
-	}
-
-	CPYFPN(&fe->fe_f2, &res);
-	return &fe->fe_f2;
-}
-
 struct fpn *
 fpu_cosh(struct fpemu *fe)
 {
-	struct fpn s0;
-	struct fpn *r;
+	struct fpn x, *fp;

 	if (ISNAN(&fe->fe_f2))
 		return &fe->fe_f2;
@@ -211,17 +154,37 @@
 		return &fe->fe_f2;
 	}

-	fpu_const(&s0, FPU_CONST_1);
-	r = __fpu_sinhcosh_taylor(fe, &s0, 1);
+	/* if x is +0/-0, return 1 */ /* XXX is this necessary? */
+	if (ISZERO(&fe->fe_f2)) {
+		fpu_const(&fe->fe_f2, FPU_CONST_1);
+		return &fe->fe_f2;
+	}

-	return r;
+	fp = fpu_etox(fe);
+	CPYFPN(&x, fp);
+
+	fpu_const(&fe->fe_f1, FPU_CONST_1);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_div(fe);
+
+	CPYFPN(&fe->fe_f1, fp);
+	CPYFPN(&fe->fe_f2, &x);
+	fp = fpu_add(fe);
+
+	fp->fp_exp--;
+
+	return fp;
 }

+/*
+ *            exp(x) - exp(-x)
+ * sinh(x) = ------------------
+ *                   2
+ */
 struct fpn *
 fpu_sinh(struct fpemu *fe)
 {
-	struct fpn s0;
-	struct fpn *r;
+	struct fpn x, *fp;

 	if (ISNAN(&fe->fe_f2))
 		return &fe->fe_f2;
@@ -232,12 +195,28 @@
 	if (ISZERO(&fe->fe_f2))
 		return &fe->fe_f2;

-	CPYFPN(&s0, &fe->fe_f2);
-	r = __fpu_sinhcosh_taylor(fe, &s0, 2);
+	fp = fpu_etox(fe);
+	CPYFPN(&x, fp);

-	return r;
+	fpu_const(&fe->fe_f1, FPU_CONST_1);
+	CPYFPN(&fe->fe_f2, fp);
+	fp = fpu_div(fe);
+
+	fp->fp_sign = 1;
+	CPYFPN(&fe->fe_f1, fp);
+	CPYFPN(&fe->fe_f2, &x);
+	fp = fpu_add(fe);
+
+	fp->fp_exp--;
+
+	return fp;
 }

+/*
+ *            sinh(x)
+ * tanh(x) = ---------
+ *            cosh(x)
+ */
 struct fpn *
 fpu_tanh(struct fpemu *fe)
 {

>Release-Note:

>Audit-Trail:

Responsible-Changed-From-To: port-m68k-maintainer->isaki
Responsible-Changed-By: isaki@NetBSD.org
Responsible-Changed-When: Mon, 28 Nov 2016 06:40:28 +0000
Responsible-Changed-Why:
I'll take it.


State-Changed-From-To: open->analyzed
State-Changed-By: isaki@NetBSD.org
State-Changed-When: Mon, 28 Nov 2016 06:40:28 +0000
State-Changed-Why:


From: Tetsuya Isaki <isaki@pastel-flower.jp>
To: gnats-bugs@NetBSD.org
Cc: isaki@NetBSD.org, port-m68k-maintainer@netbsd.org,
	netbsd-bugs@netbsd.org,
	gnats-admin@netbsd.org,
	rokuyama@rk.phys.keio.ac.jp
Subject: Re: port-m68k/51645 (exponential and hyperbolic functions are slow on m68k FPU emulator)
Date: Mon, 05 Dec 2016 21:49:10 +0900

 It's a nice patch!
 It's 10..100 times faster as you wrote.
 The accuracy decreased slightly in extended precision but
 I think it's not a big problem because faster is better.

 I'll add the following diffs to your patch and commit.

 --- fpu_exp.c	2016-12-02 15:50:41.000000000 +0900
 +++ fpu_exp.c	2016-12-02 15:52:14.000000000 +0900
 @@ -133,10 +133,11 @@
  		fp = fpu_etox_taylor(fe);
  		return fp;
  	}
 +	/* extract k as integer format from fpn format */
  	j = FP_LG - fp->fp_exp;
  	if (j < 0) {
  		if (fp->fp_sign)
 -			fpu_const(&fe->fe_f2, FPU_CONST_0);	/* k < -2^18 */
 +			fp->fp_class = FPC_ZERO;		/* k < -2^18 */
  		else
  			fp->fp_class = FPC_INF;			/* k >  2^18 */
  		return fp;
 --- fpu_hyperb.c	2016-11-28 13:55:22.000000000 +0900
 +++ fpu_hyperb.c	2016-12-02 15:54:43.000000000 +0900
 @@ -63,9 +63,6 @@

  #include "fpu_emulate.h"

 -/* The number of items to terminate the Taylor expansion */
 -#define MAX_ITEMS	(2000)
 -
  /*
   * fpu_hyperb.c: defines the following functions
   *

 ---
 Tetsuya Isaki <isaki@pastel-flower.jp / isaki@NetBSD.org>

From: "Tetsuya Isaki" <isaki@netbsd.org>
To: gnats-bugs@gnats.NetBSD.org
Cc: 
Subject: PR/51645 CVS commit: src/sys/arch/m68k/fpe
Date: Mon, 5 Dec 2016 15:31:01 +0000

 Module Name:	src
 Committed By:	isaki
 Date:		Mon Dec  5 15:31:01 UTC 2016

 Modified Files:
 	src/sys/arch/m68k/fpe: fpu_exp.c fpu_hyperb.c

 Log Message:
 Improve the exponential and hyperbolic function's performance
 10..100 times faster.
 PR port-m68k/51645 from rin@ (and modified by me)


 To generate a diff of this commit:
 cvs rdiff -u -r1.8 -r1.9 src/sys/arch/m68k/fpe/fpu_exp.c
 cvs rdiff -u -r1.16 -r1.17 src/sys/arch/m68k/fpe/fpu_hyperb.c

 Please note that diffs are not public domain; they are subject to the
 copyright notices on the relevant files.

State-Changed-From-To: analyzed->closed
State-Changed-By: isaki@NetBSD.org
State-Changed-When: Mon, 05 Dec 2016 15:39:58 +0000
State-Changed-Why:
commited.


>Unformatted:

NetBSD Home
NetBSD PR Database Search

(Contact us) $NetBSD: query-full-pr,v 1.39 2013/11/01 18:47:49 spz Exp $
$NetBSD: gnats_config.sh,v 1.8 2006/05/07 09:23:38 tsutsui Exp $
Copyright © 1994-2014 The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.