dead code in lgamma_r[f]?
Steve Kargl
sgk at troutmask.apl.washington.edu
Mon Dec 9 22:37:28 UTC 2013
On Sun, Dec 08, 2013 at 10:59:41AM -0800, Steve Kargl wrote:
> On Sun, Dec 08, 2013 at 10:33:39AM -0800, Steve Kargl wrote:
> >
> > fdlibm's lgammaf_r does not appear to have an issue near 1 or 2.
> > Starting at the lower bound of each interval and using nextafterf
> > to scan up to the upper bound, I'm seeing
> >
> > % make testf && ./testf
> > Interval: Time ULP Value
> > [4.7683716e-07, 2.0000000e+00): 0.04590 0.98491 1.231645e+00
> > [2.0000000e+00, 8.0000000e+00): 0.08754 0.82825 4.012439e+00
> > [8.0000000e+00, 2.8823038e+17): 0.05841 1.43789 8.942273e+06
> > [2.8823038e+17, 2.6584560e+36): 0.05095 0.50000 1.491249e+23
> >
> > where the reference value is from lgamma_r. The different
> > intervals test specific branches in lgammaf_r. The upper
> > bound of 2.6584560e36 is 0x1p121. Time is the average value
> > for all calls in the interval in usec/call.
>
> Correction. The timing is for 1 million calls uniformly
> distributed over the interval.
>
I would like to commit the attached patch. I will do it in
3 passes:
1) whitespace fixes,
2) literal constants changes (checked by md5),
3) the code re-organization, threshold change, and dead code removal.
* lib/msun/src/e_lgamma_r.c:
. Remove trailing space and trailing blank line in Copyright.
. Fix prototype for sin_pi to agree with the intended commit
r97413 done some 11 years 6 months ago.
. Remove dead code in sin_pi and remove 'if(ix<0x43300000)' as it is
always true.
. In the domain [0,2), move the three minimax approximations embedded
within __ieee754_lgamma_r() into three 'static inline double' functions.
. Remove the now unused variables p1, p2, and p3.
. Use integer literal constants instead of double literal constants
where possible (checked with md5).
. Remove explicit cast of the int 'i' to double (checked with md5).
* lib/msun/src/e_lgammaf_r.c:
. The minimax polynomials have more terms than required for the
precision. Remove a8-a11, t10-t14, and w3-w6.
. Fix prototype for sin_pif to agree with the intended commit
r97413 done some 11 years 6 months ago.
. Remove dead code in sin_pif and remove 'if(ix<0x4b000000)' as it is
always true.
. In the domain [0,2), move the three minimax approximations embedded
within __ieee754_lgammaf_r() into three 'static inline float' functions.
. Remove the now unused variables p1, p2, and p3.
. Use integer literal constants instead of double literal constants
where possible (checked with md5).
. Remove explicit cast of the int 'i' to float (checked with md5).
. Reduce the 2**58 threshold copied from e_lgamma_r.c to 2**30.
As a before-and-after comparison, I offer
lgammaf_r() without patch
Interval: Time ULP Value
[4.7683716e-07, 2.0000000e+00): 0.05381 2.40423 7.169912e-01
[2.0000000e+00, 8.0000000e+00): 0.07758 1.64252 2.763559e+00
[8.0000000e+00, 1.0737418e+09): 0.06948 2.56132 1.122045e+01
[1.0737418e+09, 2.1990233e+12): 0.06967 1.29037 1.693146e+09
[2.1990233e+12, 3.9876840e+36): 0.05704 1.50689 1.424986e+14
lgammaf_r() with patch
Interval: Time ULP Value
[4.7683716e-07, 2.0000000e+00): 0.04888 2.40423 7.169912e-01
[2.0000000e+00, 8.0000000e+00): 0.07471 1.64252 2.763559e+00
[8.0000000e+00, 1.0737418e+09): 0.05819 2.56132 1.122045e+01
[1.0737418e+09, 2.1990233e+12): 0.05338 1.29037 1.693146e+09
[2.1990233e+12, 3.9876840e+36): 0.07397 1.50689 1.424986e+14
lgamma_r() without patch
Interval: Time ULP Value
[8.4703295e-22, 2.0000000e+00): 0.05961 2.09187 7.3247074760047326e-01
[2.0000000e+00, 8.0000000e+00): 0.08758 1.46347 3.9813700222966872e+00
lgamma_r() with patch
Interval: Time ULP Value
[8.4703295e-22, 2.0000000e+00): 0.05761 2.09187 7.3247074760047326e-01
[2.0000000e+00, 8.0000000e+00): 0.08659 1.46347 3.9813700222966872e+00
Timing is in usec/call for 1 million values uniformly distributed in
the interval.
--
Steve
--- /usr/src/lib/msun/src/e_lgamma_r.c 2013-12-06 07:58:39.000000000 -0800
+++ src/e_lgamma_r.c 2013-12-09 13:05:22.000000000 -0800
@@ -6,10 +6,9 @@
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
+ * software is freely granted, provided that this notice
* is preserved.
* ====================================================
- *
*/
#include <sys/cdefs.h>
@@ -156,7 +155,8 @@
static const double zero= 0.00000000000000000000e+00;
- static double sin_pi(double x)
+static double
+sin_pi(double x)
{
double y,z;
int n,ix;
@@ -177,15 +177,11 @@
y = 2*(y - floor(y)); /* y = |x| mod 2.0 */
n = (int)(y*4);
} else {
- if(ix>=0x43400000) {
- y = zero; n = 0; /* y must be even */
- } else {
- if(ix<0x43300000) z = y+two52; /* exact */
- GET_LOW_WORD(n,z);
- n &= 1;
- y = n;
- n<<= 2;
- }
+ z = y+two52; /* exact */
+ GET_LOW_WORD(n,z);
+ n &= 1;
+ y = n;
+ n<<= 2;
}
switch (n) {
case 0: y = __kernel_sin(pi*y,zero,0); break;
@@ -201,12 +197,46 @@
}
+static inline double
+func0(double y)
+{
+ double p1, p2, z;
+
+ z = y * y;
+ p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+ return (y * p1 + p2 - y / 2);
+}
+
+static inline double
+func1(double y)
+{
+ double p1, p2, p3, w, z;
+
+ z = y * y;
+ w = z * y;
+ p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+ p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+ p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+ return (z * p1 - (tt - w * (p2 + y * p3)) + tf);
+}
+
+static inline double
+func2(double y)
+{
+ double p1, p2;
+
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+ p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ return (p1 / p2 - y / 2);
+}
+
double
__ieee754_lgamma_r(double x, int *signgamp)
{
- double t,y,z,nadj,p,p1,p2,p3,q,r,w;
+ double t,y,z,nadj,p,q,r,w;
int32_t hx;
- int i,lx,ix;
+ int i,ix,lx;
EXTRACT_WORDS(hx,lx,x);
@@ -235,51 +265,36 @@
if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0;
/* for x < 2.0 */
else if(ix<0x40000000) {
- if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */
+ if (ix <= 0X3FECCCCC) { /* lgamma(x) = lgamma(x+1)-log(x) */
r = -__ieee754_log(x);
- if(ix>=0x3FE76944) {y = one-x; i= 0;}
- else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
- else {y = x; i=2;}
+ if (ix >= 0x3FE76944)
+ r += func0(1 - x);
+ else if (ix >= 0x3FCDA661)
+ r += func1(x - (tc - 1));
+ else
+ r += func2(x);
} else {
- r = zero;
- if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
- else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
- else {y=x-one;i=2;}
- }
- switch(i) {
- case 0:
- z = y*y;
- p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
- p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
- p = y*p1+p2;
- r += (p-0.5*y); break;
- case 1:
- z = y*y;
- w = z*y;
- p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */
- p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
- p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
- p = z*p1-(tt-w*(p2+y*p3));
- r += (tf + p); break;
- case 2:
- p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
- p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
- r += (-0.5*y + p1/p2);
+ if (ix >= 0x3FFBB4C3) /* [1.7316,2] */
+ r = func0(2 - x);
+ else if (ix >= 0x3FF3B4C4) /* [1.23,1.73] */
+ r = func1(x - tc);
+ else
+ r = func2(x - 1);
}
}
else if(ix<0x40200000) { /* x < 8.0 */
i = (int)x;
- y = x-(double)i;
+ y = x-i;
p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
- r = half*y+p/q;
+ r = y/2+p/q;
z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
switch(i) {
- case 7: z *= (y+6.0); /* FALLTHRU */
- case 6: z *= (y+5.0); /* FALLTHRU */
- case 5: z *= (y+4.0); /* FALLTHRU */
- case 4: z *= (y+3.0); /* FALLTHRU */
- case 3: z *= (y+2.0); /* FALLTHRU */
+ case 7: z *= (y+6); /* FALLTHRU */
+ case 6: z *= (y+5); /* FALLTHRU */
+ case 5: z *= (y+4); /* FALLTHRU */
+ case 4: z *= (y+3); /* FALLTHRU */
+ case 3: z *= (y+2); /* FALLTHRU */
r += __ieee754_log(z); break;
}
/* 8.0 <= x < 2**58 */
--- /usr/src/lib/msun/src/e_lgammaf_r.c 2013-12-06 07:57:42.000000000 -0800
+++ src/e_lgammaf_r.c 2013-12-09 13:04:12.000000000 -0800
@@ -32,10 +32,6 @@
a5 = 2.8905137442e-03, /* 0x3b3d6ec6 */
a6 = 1.1927076848e-03, /* 0x3a9c54a1 */
a7 = 5.1006977446e-04, /* 0x3a05b634 */
-a8 = 2.2086278477e-04, /* 0x39679767 */
-a9 = 1.0801156895e-04, /* 0x38e28445 */
-a10 = 2.5214456400e-05, /* 0x37d383a2 */
-a11 = 4.4864096708e-05, /* 0x383c2c75 */
tc = 1.4616321325e+00, /* 0x3fbb16c3 */
tf = -1.2148628384e-01, /* 0xbdf8cdcd */
/* tt = -(tail of tf) */
@@ -50,11 +46,6 @@
t7 = -3.6845202558e-03, /* 0xbb7177fe */
t8 = 2.2596477065e-03, /* 0x3b141699 */
t9 = -1.4034647029e-03, /* 0xbab7f476 */
-t10 = 8.8108185446e-04, /* 0x3a66f867 */
-t11 = -5.3859531181e-04, /* 0xba0d3085 */
-t12 = 3.1563205994e-04, /* 0x39a57b6b */
-t13 = -3.1275415677e-04, /* 0xb9a3f927 */
-t14 = 3.3552918467e-04, /* 0x39afe9f7 */
u0 = -7.7215664089e-02, /* 0xbd9e233f */
u1 = 6.3282704353e-01, /* 0x3f2200f4 */
u2 = 1.4549225569e+00, /* 0x3fba3ae7 */
@@ -81,15 +72,12 @@
r6 = 7.3266842264e-06, /* 0x36f5d7bd */
w0 = 4.1893854737e-01, /* 0x3ed67f1d */
w1 = 8.3333335817e-02, /* 0x3daaaaab */
-w2 = -2.7777778450e-03, /* 0xbb360b61 */
-w3 = 7.9365057172e-04, /* 0x3a500cfd */
-w4 = -5.9518753551e-04, /* 0xba1c065c */
-w5 = 8.3633989561e-04, /* 0x3a5b3dd2 */
-w6 = -1.6309292987e-03; /* 0xbad5c4e8 */
+w2 = -2.7777778450e-03; /* 0xbb360b61 */
static const float zero= 0.0000000000e+00;
- static float sin_pif(float x)
+static float
+sin_pif(float x)
{
float y,z;
int n,ix;
@@ -110,15 +98,11 @@
y = 2*(y - floorf(y)); /* y = |x| mod 2.0 */
n = (int)(y*4);
} else {
- if(ix>=0x4b800000) {
- y = zero; n = 0; /* y must be even */
- } else {
- if(ix<0x4b000000) z = y+two23; /* exact */
- GET_FLOAT_WORD(n,z);
- n &= 1;
- y = n;
- n<<= 2;
- }
+ z = y+two23; /* exact */
+ GET_FLOAT_WORD(n,z);
+ n &= 1;
+ y = n;
+ n<<= 2;
}
switch (n) {
case 0: y = __kernel_sindf(pi*y); break;
@@ -133,11 +117,44 @@
return -y;
}
+static inline float
+func0(float y)
+{
+ float p1, p2, z;
+
+ z = y * y;
+ p1 = a0 + z * (a2 + z * (a4 + z * a6));
+ p2 = z * (a1 + z * (a3 + z * (a5 + z * a7)));
+ return (y * p1 + p2 - y / 2);
+}
+
+static inline float
+func1(float y)
+{
+ float p1, p2, p3, w, z;
+
+ z = y * y;
+ w = z * y;
+ p1 = t0 + w * (t3 + w * (t6 + w * t9));
+ p2 = t1 + w * (t4 + w * t7);
+ p3 = t2 + w * (t5 + w * t8);
+ return (z * p1 - (tt - w * (p2 + y * p3)) + tf);
+}
+
+static inline float
+func2(float y)
+{
+ float p1, p2;
+
+ p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+ p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+ return (p1 / p2 - y / 2);
+}
float
__ieee754_lgammaf_r(float x, int *signgamp)
{
- float t,y,z,nadj,p,p1,p2,p3,q,r,w;
+ float t,y,z,nadj,p,q,r,w;
int32_t hx;
int i,ix;
@@ -168,62 +185,47 @@
if (ix==0x3f800000||ix==0x40000000) r = 0;
/* for x < 2.0 */
else if(ix<0x40000000) {
- if(ix<=0x3f666666) { /* lgamma(x) = lgamma(x+1)-log(x) */
+ if (ix <= 0x3f666666) { /* lgamma(x) = lgamma(x+1)-log(x) */
r = -__ieee754_logf(x);
- if(ix>=0x3f3b4a20) {y = one-x; i= 0;}
- else if(ix>=0x3e6d3308) {y= x-(tc-one); i=1;}
- else {y = x; i=2;}
+ if (ix >= 0x3f3b4a20)
+ r += func0(1 - x);
+ else if (ix >= 0x3e6d3308)
+ r += func1(x - (tc - 1));
+ else
+ r += func2(x);
} else {
- r = zero;
- if(ix>=0x3fdda618) {y=(float)2.0-x;i=0;} /* [1.7316,2] */
- else if(ix>=0x3F9da620) {y=x-tc;i=1;} /* [1.23,1.73] */
- else {y=x-one;i=2;}
- }
- switch(i) {
- case 0:
- z = y*y;
- p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
- p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
- p = y*p1+p2;
- r += (p-(float)0.5*y); break;
- case 1:
- z = y*y;
- w = z*y;
- p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */
- p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
- p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
- p = z*p1-(tt-w*(p2+y*p3));
- r += (tf + p); break;
- case 2:
- p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
- p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
- r += (-(float)0.5*y + p1/p2);
+ if (ix >= 0x3fdda618) /* [1.7316,2] */
+ r = func0(2 - x);
+ else if (ix >= 0x3F9da620) /* [1.23,1.73] */
+ r = func1(x - tc);
+ else
+ r = func2(x - 1);
}
}
else if(ix<0x41000000) { /* x < 8.0 */
i = (int)x;
- y = x-(float)i;
+ y = x-i;
p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
- r = half*y+p/q;
+ r = y/2+p/q;
z = one; /* lgamma(1+s) = log(s) + lgamma(s) */
switch(i) {
- case 7: z *= (y+(float)6.0); /* FALLTHRU */
- case 6: z *= (y+(float)5.0); /* FALLTHRU */
- case 5: z *= (y+(float)4.0); /* FALLTHRU */
- case 4: z *= (y+(float)3.0); /* FALLTHRU */
- case 3: z *= (y+(float)2.0); /* FALLTHRU */
+ case 7: z *= (y+6); /* FALLTHRU */
+ case 6: z *= (y+5); /* FALLTHRU */
+ case 5: z *= (y+4); /* FALLTHRU */
+ case 4: z *= (y+3); /* FALLTHRU */
+ case 3: z *= (y+2); /* FALLTHRU */
r += __ieee754_logf(z); break;
}
- /* 8.0 <= x < 2**58 */
- } else if (ix < 0x5c800000) {
+ /* 8.0 <= x < 2**30 */
+ } else if (ix < 0x4e800000) {
t = __ieee754_logf(x);
z = one/x;
y = z*z;
- w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6)))));
+ w = w0+z*(w1+y*w2);
r = (x-half)*(t-one)+w;
} else
- /* 2**58 <= x <= inf */
+ /* 2**30 <= x <= inf */
r = x*(__ieee754_logf(x)-one);
if(hx<0) r = nadj - r;
return r;
More information about the freebsd-numerics
mailing list