i386/67469: src/lib/msun/i387/s_tan.S gives incorrect results for large inputs

David Schultz das at FreeBSD.ORG
Tue Feb 8 21:14:21 PST 2005


I ran some careful performance comparisons between the version of
i387 tan() I posted earlier and the fdlibm tan().  Executive
summary: the fdlibm tan() is faster for virtually all inputs on a
Pentium 4.  Pentium 3s seem to have lower-latency FPUs, but fdlibm
still beats the fptan instruction for the important cases where
fptan actually gets the right answer.  I used the following sets
of inputs:

tbl1: small numbers
-7.910377e-286
-5.142120e-78
-8.305257e-262
-2.089491e-228
9.625342e-237
-5.867161e-64
9.000611e-34
5.192603e-280
-1.255581e-153
1.275985e-153

tbl2: numbers on [-8pi,8pi] greater in magnitude than 2^-18
-2.410568e-05
-6.482504e-01
-1.971384e-03
-1.686721e-04
-3.629842e-04
1.036818e-03
2.987541e-04
6.186103e+00
2.165271e-02
1.032138e-04

tbl3: large numbers
2.379610e+172
3.238483e+204
-4.680033e+194
-1.442727e+225
3.090948e+48
1.778800e+185
5.177174e+295
-1.237869e+204
-4.577895e+223
-6.735385e+171

tbl4: special cases
nan
inf
-inf
-0.0
+0.0

The results below are divided into four columns.  The first is the
average number of clock cycles taken by the fdlibm tan() for the
corresponding table input above on a Pentium 4, the second is the
clock cycles for the assembly tan(), the third is the difference,
and the fourth is the percentage difference relative to column 1.

das at VARK:/home/t/freebsd> paste perf1 perf1md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
1259.000000     1697.000000     438.000000      +35%
1162.000000     1488.000000     326.000000      +28%
1060.000000     1445.000000     385.000000      +36%
1059.000000     1453.000000     394.000000      +37%
1065.000000     1459.000000     394.000000      +37%
1059.000000     1458.000000     399.000000      +38%
1031.000000     1461.000000     430.000000      +42%
1054.000000     1436.000000     382.000000      +36%
1056.000000     1455.000000     399.000000      +38%
1073.000000     1459.000000     386.000000      +36%
das at VARK:/home/t/freebsd> paste perf2 perf2md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
2018.000000     1985.000000     -33.000000      -2%
1713.000000     1821.000000     108.000000      +6%
1694.000000     1730.000000     36.000000       +2%
1708.000000     1737.000000     29.000000       +2%
1703.000000     1745.000000     42.000000       +2%
1696.000000     1762.000000     66.000000       +4%
1718.000000     1774.000000     56.000000       +3%
1890.000000     2108.000000     218.000000      +12%
1693.000000     1744.000000     51.000000       +3%
1702.000000     1714.000000     12.000000       +1%
das at VARK:/home/t/freebsd> paste perf3 perf3md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
5737.000000     6078.000000     341.000000      +6%
5110.000000     5200.000000     90.000000       +2%
6726.000000     7030.000000     304.000000      +5%
5370.000000     5403.000000     33.000000       +1%
5032.000000     5206.000000     174.000000      +3%
5229.000000     5199.000000     -30.000000      -1%
6313.000000     6523.000000     210.000000      +3%
5201.000000     5583.000000     382.000000      +7%
6443.000000     6560.000000     117.000000      +2%
5172.000000     5356.000000     184.000000      +4%
das at VARK:/home/t/freebsd> paste perf4 perf4md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
4726.000000     3234.000000     -1492.000000    -32%
3181.000000     2850.000000     -331.000000     -10%
3149.000000     2842.000000     -307.000000     -10%
1045.000000     1091.000000     46.000000       +4%
1076.000000     1114.000000     38.000000       +4%

(P.S.: Oops, forgot to compile s_sin.c with -O.)

I also ran the first three tests on freefall (Pentium III, using
the old reduction code), and got results that aren't as favorable
for the fdlibm version:

das at freefall:~> paste perf1 perf1md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
1384.000000     442.000000      -942.000000     -68%
584.000000      440.000000      -144.000000     -25%
589.000000      441.000000      -148.000000     -25%
584.000000      441.000000      -143.000000     -24%
585.000000      441.000000      -144.000000     -25%
585.000000      440.000000      -145.000000     -25%
585.000000      441.000000      -144.000000     -25%
584.000000      440.000000      -144.000000     -25%
584.000000      441.000000      -143.000000     -24%
584.000000      441.000000      -143.000000     -24%
das at freefall:~> paste perf2 perf2md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
639.000000      656.000000      17.000000       +3%
820.000000      648.000000      -172.000000     -21%
640.000000      654.000000      14.000000       +2%
639.000000      702.000000      63.000000       +10%
638.000000      654.000000      16.000000       +3%
639.000000      658.000000      19.000000       +3%
638.000000      655.000000      17.000000       +3%
1789.000000     654.000000      -1135.000000    -63%
639.000000      654.000000      15.000000       +2%
638.000000      656.000000      18.000000       +3%
das at freefall:~> paste perf3 perf3md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
5751.000000     1918.000000     -3833.000000    -67%
4383.000000     2351.000000     -2032.000000    -46%
6241.000000     2150.000000     -4091.000000    -66%
4401.000000     2483.000000     -1918.000000    -44%
4387.000000     1182.000000     -3205.000000    -73%
4276.000000     2004.000000     -2272.000000    -53%
5814.000000     2735.000000     -3079.000000    -53%
4389.000000     2180.000000     -2209.000000    -50%
5779.000000     2270.000000     -3509.000000    -61%
4379.000000     2020.000000     -2359.000000    -54%

Here, fdlibm usually wins for tbl2, which is the most important
class of inputs.  It is slower for the two inputs in tbl2 that are
close to multiples of 2pi and for large inputs, but in all
fairness, the i387 gets the wrong answer in those cases---hence,
this PR.  The i387 legitimately beats fdlibm for the small inputs,
for which tan(x) == x, so a special case for those earlier in
fdlibm would probably be beneficial.

Conclusion: We should toss out the assembly versions of tan() and
tanf(), and possibly special-case small inputs in fdlibm tan().

I found similar results for sin(), so we should probably do the
same for that, too.  Again, this is on my Pentium 4; YMMV.

das at VARK:/home/t/freebsd> paste perf1 perf1md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
1254.000000     1981.000000     727.000000      +58%
1109.000000     1603.000000     494.000000      +45%
1056.000000     1579.000000     523.000000      +50%
1051.000000     1565.000000     514.000000      +49%
1043.000000     1569.000000     526.000000      +50%
1077.000000     1572.000000     495.000000      +46%
1050.000000     1570.000000     520.000000      +50%
1051.000000     1566.000000     515.000000      +49%
1042.000000     1569.000000     527.000000      +51%
1048.000000     1568.000000     520.000000      +50%
das at VARK:/home/t/freebsd> paste perf2 perf2md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
1404.000000     1656.000000     252.000000      +18%
1222.000000     1563.000000     341.000000      +28%
1205.000000     1614.000000     409.000000      +34%
1210.000000     1603.000000     393.000000      +32%
1206.000000     1616.000000     410.000000      +34%
1218.000000     1636.000000     418.000000      +34%
1207.000000     1645.000000     438.000000      +36%
1772.000000     1610.000000     -162.000000     -9%
1213.000000     1615.000000     402.000000      +33%
1208.000000     1617.000000     409.000000      +34%
das at VARK:/home/t/freebsd> paste perf3 perf3md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
5057.000000     5260.000000     203.000000      +4%
5076.000000     6708.000000     1632.000000     +32%
6528.000000     5978.000000     -550.000000     -8%
5074.000000     6922.000000     1848.000000     +36%
5116.000000     2889.000000     -2227.000000    -44%
4702.000000     5190.000000     488.000000      +10%
5884.000000     7223.000000     1339.000000     +23%
5138.000000     5927.000000     789.000000      +15%
5834.000000     5923.000000     89.000000       +2%
5051.000000     5618.000000     567.000000      +11%
das at VARK:/home/t/freebsd> paste perf4 perf4md | awk '{printf("%f\t%f\t%f\t%+.0f%%\n", $1, $2, $2-$1, ($2-$1)*100/$1);}'
3536.000000     3576.000000     40.000000       +1%
3190.000000     2705.000000     -485.000000     -15%
3189.000000     2698.000000     -491.000000     -15%
1053.000000     1171.000000     118.000000      +11%
1059.000000     1174.000000     115.000000      +11%

The above data was generated using the program below, executed as
follows:
	./a.out < tblN | grep avg | awk '{print $2}' > perfN
When compiling the program, it is necessary to add
-Dfunc=tan or -Dfunc=itan.

#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	ITER		10000

#define	rdtsc(rv)	__asm __volatile("xor %%ax,%%ax\n\tcpuid\n\trdtsc" \
					 : "=A" (*(rv)) : : "ebx", "ecx")

double itan(double);

static void
runtest(double d)
{
	volatile double result;
	double avg, sd;
	uint64_t start, end;
	int64_t total;
	int t[ITER];
	int i, n;
	int tmax, tmin;

	printf("%a\n", d);
	total = 0;
	for (i = 0; i < ITER; i++) {
		rdtsc(&start);
		result = func(d);
		rdtsc(&end);

		t[i] = end - start;
		total += t[i];
	}

	/* compute initial avg and sd */
	avg = (double)total / ITER;
	sd = 0;
	for (i = 0; i < ITER; i++)
		sd += (avg - t[i]) * (avg - t[i]);
	sd = sqrt(sd / ITER);

	/* recompute avg and sd with outliers removed, find max and min */
	n = 0;
	tmax = 0;
	tmin = INT_MAX;
	for (i = 0; i < ITER; i++) {
		if (fabs(avg - t[i]) <= sd * 2) {
			total += t[i];
			n++;
			if (t[i] > tmax)
				tmax = t[i];
			if (t[i] < tmin)
				tmin = t[i];
		} else {
			t[i] = -1;
		}
	}
	avg = (double)total / n;
	sd = 0;
	for (i = 0; i < ITER; i++) {
		if (t[i] >= 0)
			sd += (avg - t[i]) * (avg - t[i]);
	}
	sd = sqrt(sd / n);

	printf("avg:\t%.0f\nsd:\t%.0f\nmin:\t%d\nmax:\t%d\nout:\t%.02f%%\n\n",
	    avg, sd, tmin, tmax, (double)(ITER - n) * 100 / ITER);
}

int
main(int argc, char *argv[])
{
	double d;

	while (scanf("%lf", &d) > 0)
		runtest(d);

	return (0);
}


More information about the freebsd-i386 mailing list