svn commit: r344133 - head/sys/kern

Thu Feb 14 19:07:09 UTC 2019

Author: bde
Date: Thu Feb 14 19:07:08 2019
New Revision: 344133
URL: https://svnweb.freebsd.org/changeset/base/344133

Log:
  Finish the fix for overflow in calcru1().
  
  The previous fix was unnecessarily very slow up to 105 hours where the
  simple formula used previously worked, and unnecessarily slow by a factor
  of about 5/3 up to 388 days, and didn't work above 388 days.  388 days is
  not a long time, since it is a reasonable uptime, and for processes the
  times being calculated are aggregated over all threads, so with N CPUs
  running the same thread a runtime of 388 days is reachable after only
  388 / N physical days.
  
  The PRs document overflow at 388 days, but don't try to fix it.
  
  Use the simple formula up to 76 hours.  Then use a complicated general
  method that reduces to the simple formula up to a bit less than 105
  hours, then reduces to the previous method without its extra work up
  to almost 388 days, then does more complicated reductions, usually
  many bits at a time so that this is not slow.  This works up to half
  of maximum representable time (292271 years), with accumulated rounding
  errors of at most 32 usec.
  
  amd64 can do all this with no avoidable rounding errors in an inline
  asm with 2 instructions, but this is too special to use.  __uint128_t
  can do the same with 100's of instructions on 64-bit arches.  Long
  doubles with at least 64 bits of precision are the easiest method to
  use on i386 userland, but are hard to use in the kernel.
  
  PR:		76972 and duplicates
  Reviewed by:	kib

Modified:
  head/sys/kern/kern_resource.c

Modified: head/sys/kern/kern_resource.c
==============================================================================

--- head/sys/kern/kern_resource.c	Thu Feb 14 18:02:37 2019	(r344132)
+++ head/sys/kern/kern_resource.c	Thu Feb 14 19:07:08 2019	(r344133)
@@ -863,13 +863,88 @@ rufetchtd(struct thread *td, struct rusage *ru)
 	calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime);
 }
 
+/* XXX: the MI version is too slow to use: */
+#ifndef __HAVE_INLINE_FLSLL
+#define	flsll(x)	(fls((x) >> 32) != 0 ? fls((x) >> 32) + 32 : fls(x))
+#endif
+
 static uint64_t
 mul64_by_fraction(uint64_t a, uint64_t b, uint64_t c)
 {
+	uint64_t acc, bh, bl;
+	int i, s, sa, sb;
+
 	/*
-	 * Compute floor(a * (b / c)) without overflowing, (b / c) <= 1.0.
+	 * Calculate (a * b) / c accurately enough without overflowing.  c
+	 * must be nonzero, and its top bit must be 0.  a or b must be
+	 * <= c, and the implementation is tuned for b <= c.
+	 *
+	 * The comments about times are for use in calcru1() with units of
+	 * microseconds for 'a' and stathz ticks at 128 Hz for b and c.
+	 *
+	 * Let n be the number of top zero bits in c.  Each iteration
+	 * either returns, or reduces b by right shifting it by at least n.
+	 * The number of iterations is at most 1 + 64 / n, and the error is
+	 * at most the number of iterations.
+	 *
+	 * It is very unusual to need even 2 iterations.  Previous
+	 * implementations overflowed essentially by returning early in the
+	 * first iteration, with n = 38 giving overflow at 105+ hours and
+	 * n = 32 giving overlow at at 388+ days despite a more careful
+	 * calculation.  388 days is a reasonable uptime, and the calculation
+	 * needs to work for the uptime times the number of CPUs since 'a'
+	 * is per-process.
 	 */
-	return ((a / c) * b + (a % c) * (b / c) + (a % c) * (b % c) / c);
+	if (a >= (uint64_t)1 << 63)
+		return (0);		/* Unsupported arg -- can't happen. */
+	acc = 0;
+	for (i = 0; i < 128; i++) {
+		sa = flsll(a);
+		sb = flsll(b);
+		if (sa + sb <= 64)
+			/* Up to 105 hours on first iteration. */
+			return (acc + (a * b) / c);
+		if (a >= c) {
+			/*
+			 * This reduction is based on a = q * c + r, with the
+			 * remainder r < c.  'a' may be large to start, and
+			 * moving bits from b into 'a' at the end of the loop
+			 * sets the top bit of 'a', so the reduction makes
+			 * significant progress.
+			 */
+			acc += (a / c) * b;
+			a %= c;
+			sa = flsll(a);
+			if (sa + sb <= 64)
+				/* Up to 388 days on first iteration. */
+				return (acc + (a * b) / c);
+		}
+
+		/*
+		 * This step writes a * b as a * ((bh << s) + bl) =
+		 * a * (bh << s) + a * bl = (a << s) * bh + a * bl.  The 2
+		 * additive terms are handled separately.  Splitting in
+		 * this way is linear except for rounding errors.
+		 *
+		 * s = 64 - sa is the maximum such that a << s fits in 64
+		 * bits.  Since a < c and c has at least 1 zero top bit,
+		 * sa < 64 and s > 0.  Thus this step makes progress by
+		 * reducing b (it increases 'a', but taking remainders on
+		 * the next iteration completes the reduction).
+		 *
+		 * Finally, the choice for s is just what is needed to keep
+		 * a * bl from overflowing, so we don't need complications
+		 * like a recursive call mul64_by_fraction(a, bl, c) to
+		 * handle the second additive term.
+		 */
+		s = 64 - sa;
+		bh = b >> s;
+		bl = b - (bh << s);
+		acc += (a * bl) / c;
+		a <<= s;
+		b = bh;
+	}
+	return (0);		/* Algorithm failure -- can't happen. */
 }
 
 static void
@@ -896,15 +971,23 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struc
 		tu = ruxp->rux_tu;
 	}
 
+	/* Subdivide tu.  Avoid overflow in the multiplications. */
+	if (__predict_true(tu <= ((uint64_t)1 << 38) && tt <= (1 << 26))) {
+		/* Up to 76 hours when stathz is 128. */
+		uu = (tu * ut) / tt;
+		su = (tu * st) / tt;
+	} else {
+		uu = mul64_by_fraction(tu, ut, tt);
+		su = mul64_by_fraction(tu, ut, st);
+	}
+
 	if (tu >= ruxp->rux_tu) {
 		/*
 		 * The normal case, time increased.
 		 * Enforce monotonicity of bucketed numbers.
 		 */
-		uu = mul64_by_fraction(tu, ut, tt);
 		if (uu < ruxp->rux_uu)
 			uu = ruxp->rux_uu;
-		su = mul64_by_fraction(tu, st, tt);
 		if (su < ruxp->rux_su)
 			su = ruxp->rux_su;
 	} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
@@ -933,8 +1016,6 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struc
 		    "to %ju usec for pid %d (%s)\n",
 		    (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
 		    p->p_pid, p->p_comm);
-		uu = mul64_by_fraction(tu, ut, tt);
-		su = mul64_by_fraction(tu, st, tt);
 	}
 
 	ruxp->rux_uu = uu;