dead code in lgamma_r[f]?

Mon Dec 9 22:37:28 UTC 2013

On Sun, Dec 08, 2013 at 10:59:41AM -0800, Steve Kargl wrote:
> On Sun, Dec 08, 2013 at 10:33:39AM -0800, Steve Kargl wrote:
> > 
> > fdlibm's lgammaf_r does not appear to have an issue near 1 or 2.
> > Starting at the lower bound of each interval and using nextafterf
> > to scan up to the upper bound, I'm seeing
> > 
> > % make testf && ./testf
> > Interval:                       Time    ULP      Value
> > [4.7683716e-07, 2.0000000e+00): 0.04590 0.98491  1.231645e+00
> > [2.0000000e+00, 8.0000000e+00): 0.08754 0.82825  4.012439e+00
> > [8.0000000e+00, 2.8823038e+17): 0.05841 1.43789  8.942273e+06
> > [2.8823038e+17, 2.6584560e+36): 0.05095 0.50000  1.491249e+23
> > 
> > where the reference value is from lgamma_r.  The different 
> > intervals test specific branches in lgammaf_r.  The upper
> > bound of 2.6584560e36 is 0x1p121.  Time is the average value
> > for all calls in the interval in usec/call.
> 
> Correction.  The timing is for 1 million calls uniformly
> distributed over the interval.
> 

I would like to commit the attached patch.  I will do it in 
3 passes:
   1) whitespace fixes,
   2) literal constant changes (checked by md5),
   3) the code re-organization, threshold change, and dead code removal.

* lib/msun/src/e_lgamma_r.c
  . Remove trailing space and trailing blank line in Copyright.
  . Fix prototype for sin_pi to agree with the intended commit
    r97413 done some 11 years 6 months ago.
  . Remove dead code in sin_pi and remove 'if(ix<0x43300000)' as it is
    always true.
  . In the domain [0,2), move the three minimax approximations embedded
    within __ieee754_lgamma_r() into three 'static inline double' functions.
  . Remove the now unused variables p1, p2, and p3.
  . Use integer literal constants instead of double literal constants
    where possible (checked with md5).
  . Remove explicit cast of the int 'i' to double (checked with md5).

* lib/msun/src/e_lgammaf_r.c:
  . The minimax polynomials have more terms than required for the 
    precision.  Remove a8-a11, t10-t14, and w3-w6.
  . Fix prototype for sin_pif to agree with the intended commit
    r97413 done some 11 years 6 months ago.
  . Remove dead code in sin_pif and remove 'if(ix<0x4b000000)' as it is
    always true.
  . In the domain [0,2), move the three minimax approximations embedded
    within __ieee754_lgammaf_r() into three 'static inline float' functions.
  . Remove the now unused variables p1, p2, and p3.
  . Use integer literal constants instead of double literal constants
    where possible (checked with md5).
  . Remove explicit cast of the int 'i' to float (checked with md5).
  . Reduce the 2**58 threshold copied from e_lgamma_r.c to 2**30.


As a before-and-after comparison, I offer

lgammaf_r() without patch
Interval:                       Time    ULP      Value
[4.7683716e-07, 2.0000000e+00): 0.05381 2.40423  7.169912e-01
[2.0000000e+00, 8.0000000e+00): 0.07758 1.64252  2.763559e+00
[8.0000000e+00, 1.0737418e+09): 0.06948 2.56132  1.122045e+01
[1.0737418e+09, 2.1990233e+12): 0.06967 1.29037  1.693146e+09
[2.1990233e+12, 3.9876840e+36): 0.05704 1.50689  1.424986e+14

lgammaf_r() with patch
Interval:                       Time    ULP      Value
[4.7683716e-07, 2.0000000e+00): 0.04888 2.40423  7.169912e-01
[2.0000000e+00, 8.0000000e+00): 0.07471 1.64252  2.763559e+00
[8.0000000e+00, 1.0737418e+09): 0.05819 2.56132  1.122045e+01
[1.0737418e+09, 2.1990233e+12): 0.05338 1.29037  1.693146e+09
[2.1990233e+12, 3.9876840e+36): 0.07397 1.50689  1.424986e+14

lgamma_r() without patch
Interval:                       Time    ULP      Value
[8.4703295e-22, 2.0000000e+00): 0.05961 2.09187  7.3247074760047326e-01
[2.0000000e+00, 8.0000000e+00): 0.08758 1.46347  3.9813700222966872e+00

lgamma_r() with patch
Interval:                       Time    ULP      Value
[8.4703295e-22, 2.0000000e+00): 0.05761 2.09187  7.3247074760047326e-01
[2.0000000e+00, 8.0000000e+00): 0.08659 1.46347  3.9813700222966872e+00

Timing is in usec/call for 1 million values uniformly distributed in
the interval.

-- 
Steve

--- /usr/src/lib/msun/src/e_lgamma_r.c	2013-12-06 07:58:39.000000000 -0800
+++ src/e_lgamma_r.c	2013-12-09 13:05:22.000000000 -0800
@@ -6,10 +6,9 @@
  *
  * Developed at SunSoft, a Sun Microsystems, Inc. business.
  * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice 
+ * software is freely granted, provided that this notice
  * is preserved.
  * ====================================================
- *
  */
 
 #include <sys/cdefs.h>
@@ -156,7 +155,8 @@
 
 static const double zero=  0.00000000000000000000e+00;
 
-	static double sin_pi(double x)
+static double
+sin_pi(double x)
 {
 	double y,z;
 	int n,ix;
@@ -177,15 +177,11 @@
 	    y  = 2*(y - floor(y));		/* y = |x| mod 2.0 */
 	    n  = (int)(y*4);
 	} else {
-            if(ix>=0x43400000) {
-                y = zero; n = 0;                 /* y must be even */
-            } else {
-                if(ix<0x43300000) z = y+two52;	/* exact */
-		GET_LOW_WORD(n,z);
-		n &= 1;
-                y  = n;
-                n<<= 2;
-            }
+	    z  = y+two52;			/* exact */
+	    GET_LOW_WORD(n,z);
+	    n &= 1;
+	    y  = n;
+	    n<<= 2;
         }
 	switch (n) {
 	    case 0:   y =  __kernel_sin(pi*y,zero,0); break;
@@ -201,12 +197,46 @@
 }
 
 
+static inline double
+func0(double y)
+{
+	double p1, p2, z;
+
+	z = y * y;
+	p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+	p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+	return (y * p1 + p2 - y / 2);
+}
+
+static inline double
+func1(double y)
+{
+	double p1, p2, p3, w, z;
+
+	z = y * y;
+	w = z * y;
+	p1 = t0 + w * (t3 + w * (t6 + w * (t9  + w * t12)));
+	p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+	p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+	return (z * p1 - (tt - w * (p2 + y * p3)) + tf);
+}
+
+static inline double
+func2(double y)
+{
+	double p1, p2;
+
+	p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+	p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+	return (p1 / p2 - y / 2);
+}
+
 double
 __ieee754_lgamma_r(double x, int *signgamp)
 {
-	double t,y,z,nadj,p,p1,p2,p3,q,r,w;
+	double t,y,z,nadj,p,q,r,w;
 	int32_t hx;
-	int i,lx,ix;
+	int i,ix,lx;
 
 	EXTRACT_WORDS(hx,lx,x);
 
@@ -235,51 +265,36 @@
 	if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0;
     /* for x < 2.0 */
 	else if(ix<0x40000000) {
-	    if(ix<=0x3feccccc) { 	/* lgamma(x) = lgamma(x+1)-log(x) */
+	    if (ix <= 0X3FECCCCC) { 	/* lgamma(x) = lgamma(x+1)-log(x) */
 		r = -__ieee754_log(x);
-		if(ix>=0x3FE76944) {y = one-x; i= 0;}
-		else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;}
-	  	else {y = x; i=2;}
+		if (ix >= 0x3FE76944)
+		    r += func0(1 - x);
+		else if (ix >= 0x3FCDA661)
+		    r += func1(x - (tc - 1));
+	  	else
+		    r += func2(x);
 	    } else {
-	  	r = zero;
-	        if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */
-	        else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */
-		else {y=x-one;i=2;}
-	    }
-	    switch(i) {
-	      case 0:
-		z = y*y;
-		p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
-		p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
-		p  = y*p1+p2;
-		r  += (p-0.5*y); break;
-	      case 1:
-		z = y*y;
-		w = z*y;
-		p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12)));	/* parallel comp */
-		p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
-		p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
-		p  = z*p1-(tt-w*(p2+y*p3));
-		r += (tf + p); break;
-	      case 2:	
-		p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
-		p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
-		r += (-0.5*y + p1/p2);
+	        if (ix >= 0x3FFBB4C3)		/* [1.7316,2] */
+		    r = func0(2 - x);
+	        else if (ix >= 0x3FF3B4C4)	/* [1.23,1.73] */
+		    r = func1(x - tc);
+		else
+		    r = func2(x - 1);
 	    }
 	}
 	else if(ix<0x40200000) { 			/* x < 8.0 */
 	    i = (int)x;
-	    y = x-(double)i;
+	    y = x-i;
 	    p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
 	    q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
-	    r = half*y+p/q;
+	    r = y/2+p/q;
 	    z = one;	/* lgamma(1+s) = log(s) + lgamma(s) */
 	    switch(i) {
-	    case 7: z *= (y+6.0);	/* FALLTHRU */
-	    case 6: z *= (y+5.0);	/* FALLTHRU */
-	    case 5: z *= (y+4.0);	/* FALLTHRU */
-	    case 4: z *= (y+3.0);	/* FALLTHRU */
-	    case 3: z *= (y+2.0);	/* FALLTHRU */
+	    case 7: z *= (y+6);	/* FALLTHRU */
+	    case 6: z *= (y+5);	/* FALLTHRU */
+	    case 5: z *= (y+4);	/* FALLTHRU */
+	    case 4: z *= (y+3);	/* FALLTHRU */
+	    case 3: z *= (y+2);	/* FALLTHRU */
 		    r += __ieee754_log(z); break;
 	    }
     /* 8.0 <= x < 2**58 */
--- /usr/src/lib/msun/src/e_lgammaf_r.c	2013-12-06 07:57:42.000000000 -0800
+++ src/e_lgammaf_r.c	2013-12-09 13:04:12.000000000 -0800
@@ -32,10 +32,6 @@
 a5  =  2.8905137442e-03, /* 0x3b3d6ec6 */
 a6  =  1.1927076848e-03, /* 0x3a9c54a1 */
 a7  =  5.1006977446e-04, /* 0x3a05b634 */
-a8  =  2.2086278477e-04, /* 0x39679767 */
-a9  =  1.0801156895e-04, /* 0x38e28445 */
-a10 =  2.5214456400e-05, /* 0x37d383a2 */
-a11 =  4.4864096708e-05, /* 0x383c2c75 */
 tc  =  1.4616321325e+00, /* 0x3fbb16c3 */
 tf  = -1.2148628384e-01, /* 0xbdf8cdcd */
 /* tt = -(tail of tf) */
@@ -50,11 +46,6 @@
 t7  = -3.6845202558e-03, /* 0xbb7177fe */
 t8  =  2.2596477065e-03, /* 0x3b141699 */
 t9  = -1.4034647029e-03, /* 0xbab7f476 */
-t10 =  8.8108185446e-04, /* 0x3a66f867 */
-t11 = -5.3859531181e-04, /* 0xba0d3085 */
-t12 =  3.1563205994e-04, /* 0x39a57b6b */
-t13 = -3.1275415677e-04, /* 0xb9a3f927 */
-t14 =  3.3552918467e-04, /* 0x39afe9f7 */
 u0  = -7.7215664089e-02, /* 0xbd9e233f */
 u1  =  6.3282704353e-01, /* 0x3f2200f4 */
 u2  =  1.4549225569e+00, /* 0x3fba3ae7 */
@@ -81,15 +72,12 @@
 r6  =  7.3266842264e-06, /* 0x36f5d7bd */
 w0  =  4.1893854737e-01, /* 0x3ed67f1d */
 w1  =  8.3333335817e-02, /* 0x3daaaaab */
-w2  = -2.7777778450e-03, /* 0xbb360b61 */
-w3  =  7.9365057172e-04, /* 0x3a500cfd */
-w4  = -5.9518753551e-04, /* 0xba1c065c */
-w5  =  8.3633989561e-04, /* 0x3a5b3dd2 */
-w6  = -1.6309292987e-03; /* 0xbad5c4e8 */
+w2  = -2.7777778450e-03; /* 0xbb360b61 */
 
 static const float zero=  0.0000000000e+00;
 
-	static float sin_pif(float x)
+static float
+sin_pif(float x)
 {
 	float y,z;
 	int n,ix;
@@ -110,15 +98,11 @@
 	    y  = 2*(y - floorf(y));		/* y = |x| mod 2.0 */
 	    n  = (int)(y*4);
 	} else {
-            if(ix>=0x4b800000) {
-                y = zero; n = 0;                 /* y must be even */
-            } else {
-                if(ix<0x4b000000) z = y+two23;	/* exact */
-		GET_FLOAT_WORD(n,z);
-		n &= 1;
-                y  = n;
-                n<<= 2;
-            }
+	    z  = y+two23;			/* exact */
+	    GET_FLOAT_WORD(n,z);
+	    n &= 1;
+	    y  = n;
+	    n<<= 2;
         }
 	switch (n) {
 	    case 0:   y =  __kernel_sindf(pi*y); break;
@@ -133,11 +117,44 @@
 	return -y;
 }
 
+static inline float
+func0(float y)
+{
+	float p1, p2, z;
+
+	z = y * y;
+	p1 = a0 + z * (a2 + z * (a4 + z * a6));
+	p2 = z * (a1 + z * (a3 + z * (a5 + z * a7)));
+	return (y * p1 + p2 - y / 2);
+}
+
+static inline float
+func1(float y)
+{
+	float p1, p2, p3, w, z;
+
+	z = y * y;
+	w = z * y;
+	p1 = t0 + w * (t3 + w * (t6 + w * t9));
+	p2 = t1 + w * (t4 + w * t7);
+	p3 = t2 + w * (t5 + w * t8);
+	return (z * p1 - (tt - w * (p2 + y * p3)) + tf);
+}
+
+static inline float
+func2(float y)
+{
+	float p1, p2;
+
+	p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+	p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+	return (p1 / p2 - y / 2);
+}
 
 float
 __ieee754_lgammaf_r(float x, int *signgamp)
 {
-	float t,y,z,nadj,p,p1,p2,p3,q,r,w;
+	float t,y,z,nadj,p,q,r,w;
 	int32_t hx;
 	int i,ix;
 
@@ -168,62 +185,47 @@
 	if (ix==0x3f800000||ix==0x40000000) r = 0;
     /* for x < 2.0 */
 	else if(ix<0x40000000) {
-	    if(ix<=0x3f666666) { 	/* lgamma(x) = lgamma(x+1)-log(x) */
+	    if (ix <= 0x3f666666) { 	/* lgamma(x) = lgamma(x+1)-log(x) */
 		r = -__ieee754_logf(x);
-		if(ix>=0x3f3b4a20) {y = one-x; i= 0;}
-		else if(ix>=0x3e6d3308) {y= x-(tc-one); i=1;}
-	  	else {y = x; i=2;}
+		if (ix >= 0x3f3b4a20)
+		    r += func0(1 - x);
+		else if (ix >= 0x3e6d3308)
+		    r += func1(x - (tc - 1));
+	  	else
+		    r += func2(x);
 	    } else {
-	  	r = zero;
-	        if(ix>=0x3fdda618) {y=(float)2.0-x;i=0;} /* [1.7316,2] */
-	        else if(ix>=0x3F9da620) {y=x-tc;i=1;} /* [1.23,1.73] */
-		else {y=x-one;i=2;}
-	    }
-	    switch(i) {
-	      case 0:
-		z = y*y;
-		p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10))));
-		p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11)))));
-		p  = y*p1+p2;
-		r  += (p-(float)0.5*y); break;
-	      case 1:
-		z = y*y;
-		w = z*y;
-		p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12)));	/* parallel comp */
-		p2 = t1+w*(t4+w*(t7+w*(t10+w*t13)));
-		p3 = t2+w*(t5+w*(t8+w*(t11+w*t14)));
-		p  = z*p1-(tt-w*(p2+y*p3));
-		r += (tf + p); break;
-	      case 2:
-		p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5)))));
-		p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5))));
-		r += (-(float)0.5*y + p1/p2);
+	        if (ix >= 0x3fdda618)		/* [1.7316,2] */
+		    r = func0(2 - x);
+		else if (ix >= 0x3F9da620)	/* [1.23,1.73] */
+		    r = func1(x - tc);
+		else
+		    r = func2(x - 1);
 	    }
 	}
 	else if(ix<0x41000000) { 			/* x < 8.0 */
 	    i = (int)x;
-	    y = x-(float)i;
+	    y = x-i;
 	    p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
 	    q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6)))));
-	    r = half*y+p/q;
+	    r = y/2+p/q;
 	    z = one;	/* lgamma(1+s) = log(s) + lgamma(s) */
 	    switch(i) {
-	    case 7: z *= (y+(float)6.0);	/* FALLTHRU */
-	    case 6: z *= (y+(float)5.0);	/* FALLTHRU */
-	    case 5: z *= (y+(float)4.0);	/* FALLTHRU */
-	    case 4: z *= (y+(float)3.0);	/* FALLTHRU */
-	    case 3: z *= (y+(float)2.0);	/* FALLTHRU */
+	    case 7: z *= (y+6);	/* FALLTHRU */
+	    case 6: z *= (y+5);	/* FALLTHRU */
+	    case 5: z *= (y+4);	/* FALLTHRU */
+	    case 4: z *= (y+3);	/* FALLTHRU */
+	    case 3: z *= (y+2);	/* FALLTHRU */
 		    r += __ieee754_logf(z); break;
 	    }
-    /* 8.0 <= x < 2**58 */
-	} else if (ix < 0x5c800000) {
+    /* 8.0 <= x < 2**30 */
+	} else if (ix < 0x4e800000) {
 	    t = __ieee754_logf(x);
 	    z = one/x;
 	    y = z*z;
-	    w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6)))));
+	    w = w0+z*(w1+y*w2);
 	    r = (x-half)*(t-one)+w;
 	} else
-    /* 2**58 <= x <= inf */
+    /* 2**30 <= x <= inf */
 	    r =  x*(__ieee754_logf(x)-one);
 	if(hx<0) r = nadj - r;
 	return r;