git: dee01da58a27 - stable/13 - Correctly measure system load averages > 1024

From: Alan Somers <asomers_at_FreeBSD.org>
Date: Mon, 23 May 2022 19:11:51 UTC
The branch stable/13 has been updated by asomers:

URL: https://cgit.FreeBSD.org/src/commit/?id=dee01da58a275b1cdc21b6211b26223968431449

commit dee01da58a275b1cdc21b6211b26223968431449
Author:     Alan Somers <asomers@FreeBSD.org>
AuthorDate: 2022-05-05 21:35:23 +0000
Commit:     Alan Somers <asomers@FreeBSD.org>
CommitDate: 2022-05-23 19:11:23 +0000

    Correctly measure system load averages > 1024
    
    The old fixed-point arithmetic used for calculating load averages had an
    overflow at 1024.  So on systems with extremely high load, the observed
    load average would actually fall back to 0 and shoot up again, creating
    a kind of sawtooth graph.
    
    Fix this by using 64-bit math internally, while still reporting the load
    average to userspace as a 32-bit number.
    
    Sponsored by:   Axcient
    Reviewed by:    imp
    Differential Revision: https://reviews.freebsd.org/D35134
    
    (cherry picked from commit 1d2421ad8b6d508ef155752bdfc5948f7373bac3)
---
 sys/kern/kern_synch.c | 9 +++++----
 sys/kern/tty_info.c   | 2 +-
 sys/sys/param.h       | 8 ++++----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index 88f47ba78601..5abc38d64296 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -87,7 +87,7 @@ struct loadavg averunnable =
  * Constants for averages over 1, 5, and 15 minutes
  * when sampling at 5 second intervals.
  */
-static fixpt_t cexp[3] = {
+static uint64_t cexp[3] = {
 	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
 	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
 	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
@@ -610,14 +610,15 @@ setrunnable(struct thread *td, int srqflags)
 static void
 loadav(void *arg)
 {
-	int i, nrun;
+	int i;
+	uint64_t nrun;
 	struct loadavg *avg;
 
-	nrun = sched_load();
+	nrun = (uint64_t)sched_load();
 	avg = &averunnable;
 
 	for (i = 0; i < 3; i++)
-		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
+		avg->ldavg[i] = (cexp[i] * (uint64_t)avg->ldavg[i] +
 		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
 
 	/*
diff --git a/sys/kern/tty_info.c b/sys/kern/tty_info.c
index 60675557e4ed..237aa47a18da 100644
--- a/sys/kern/tty_info.c
+++ b/sys/kern/tty_info.c
@@ -302,7 +302,7 @@ tty_info(struct tty *tp)
 	sbuf_set_drain(&sb, sbuf_tty_drain, tp);
 
 	/* Print load average. */
-	load = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
+	load = ((int64_t)averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
 	sbuf_printf(&sb, "%sload: %d.%02d ", tp->t_column == 0 ? "" : "\n",
 	    load / 100, load % 100);
 
diff --git a/sys/sys/param.h b/sys/sys/param.h
index a0f1b9f7945b..24011244449e 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -343,12 +343,12 @@ __END_DECLS
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
- * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
- * can be calculated (assuming 32 bits) can be closely approximated using
- * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
+ * by the formula (1 / (2 ^ (FSHIFT - 11))).  Since the intermediate
+ * calculation is done with 64-bit precision, the maximum load average that can
+ * be calculated is approximately 2^32 / FSCALE.
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
- * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
+ * FSHIFT must be at least 11; at 11 the maximum load avg is ~2 million (2^21).
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)