git: 1e40acb54539 - stable/13 - x86: Implement deferred TSC calibration

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Wed, 29 Dec 2021 15:46:59 UTC
The branch stable/13 has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=1e40acb545391169b8e13fd27724e3699d6824c3

commit 1e40acb545391169b8e13fd27724e3699d6824c3
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2021-11-15 20:31:21 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2021-12-29 15:38:50 +0000

    x86: Implement deferred TSC calibration
    
    There is no universal way to find the TSC frequency.  Newer Intel CPUs
    may report it via CPUID leaves 0x15 and 0x16.  Sometimes it can be
    obtained from the PLATFORM_INFO MSR as well, though we never use that.
    On older platforms we derive the frequency using a DELAY(1000000) call,
    which uses the 8254 PIT.  On some newer platforms the 8254 is apparently
    non-functional, leading to bogus calibration results.  On such platforms
    the TSC frequency must be available from CPUID.  It is also possible to
    disable calibration with a tunable, in which case we try to parse the
    brand string if the TSC freq is not available from CPUID.
    
    CPUID 0x15 provides an authoritative TSC frequency value, but even that
    is not always available on new Intel platforms.  CPUID 0x16 provides the
    specified processor base frequency, which is not the same as the TSC
    frequency.  Empirically, it is close enough for early boot, but too far
    off for timekeeping: on a Comet Lake NUC, CPUID 0x16 yields 1600MHz but
    the TSC frequency is roughly 1608MHz, leading to frequent clock stepping
    when NTP is in use.
    
    Thus we have a situation where we cannot calibrate using the PIT and
    cannot obtain a precise frequency from CPUID (or MSRs).  This change
    seeks to address that by using the CPUID 0x16 value during early boot
    and refining the calibration later once ACPI-based timecounters are
    available.  TSC frequency detection is thus split into two phases:
    
    Early phase:
    - On Intel platforms, query CPUID 0x15 and 0x16 and use that value
      initially if available.
    - Otherwise, get an estimate using the PIT, reducing the delay loop to
      100ms from 1s.
    - Continue to register the TSC as the CPU ticks provider early, even
      though the frequency may be off.  Otherwise any code executed during
      boot that uses cpu_ticks() (e.g., context switching) gets tripped up
      when the ticks provider changes.
    
    Later phase:
    - In SI_SUB_CLOCKS, once the timehands are initialized, load the current
      TSC and timecounter (sbinuptime()) values at the beginning and end of
      a 1s interval and use the timecounter frequency (typically from
      kvmclock, HPET or the ACPI PM timer) to estimate the TSC frequency.
    - Update the TSC timecounter, global tsc_freq and CPU ticker with the
      new frequency and finally register the TSC as a timecounter.
    
    Reviewed by:    kib, jhb (previous version)
    Discussed with: imp, cperciva
    Sponsored by:   The FreeBSD Foundation
    
    (cherry picked from commit 22875f88799e1684febf79b5049541e0f825aaa1)
---
 sys/x86/x86/local_apic.c |   7 +-
 sys/x86/x86/tsc.c        | 202 +++++++++++++++++++++++++++++++----------------
 2 files changed, 141 insertions(+), 68 deletions(-)

diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c
index d0548d06569b..b35c4ab459fa 100644
--- a/sys/x86/x86/local_apic.c
+++ b/sys/x86/x86/local_apic.c
@@ -1007,7 +1007,12 @@ lapic_change_mode(struct eventtimer *et, struct lapic *la,
     enum lat_timer_mode newmode)
 {
 
-	if (la->la_timer_mode == newmode)
+	/*
+	 * The TSC frequency may change during late calibration against other
+	 * timecounters (HPET or ACPI PMTimer).
+	 */
+	if (la->la_timer_mode == newmode &&
+	    (newmode != LAT_MODE_DEADLINE || et->et_frequency == tsc_freq))
 		return;
 	switch (newmode) {
 	case LAT_MODE_PERIODIC:
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index fabc980e2231..53e2c7dcfe42 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -32,12 +32,14 @@ __FBSDID("$FreeBSD$");
 #include "opt_clock.h"
 
 #include <sys/param.h>
+#include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
-#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/timetc.h>
@@ -84,7 +86,7 @@ SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0,
 static int	tsc_skip_calibration;
 SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN,
     &tsc_skip_calibration, 0,
-    "Disable TSC frequency calibration");
+    "Disable early TSC frequency calibration");
 
 static void tsc_freq_changed(void *arg, const struct cf_level *level,
     int status);
@@ -134,14 +136,11 @@ tsc_freq_vmware(void)
 }
 
 /*
- * Calculate TSC frequency using information from the CPUID leaf 0x15
- * 'Time Stamp Counter and Nominal Core Crystal Clock'.  If leaf 0x15
- * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor
- * Frequency Information'.  Leaf 0x16 is described in the SDM as
- * informational only, but if 0x15 did not work, and TSC calibration
- * is disabled, it is the best we can get at all.  It should still be
- * an improvement over the parsing of the CPU model name in
- * tsc_freq_intel(), when available.
+ * Calculate TSC frequency using information from the CPUID leaf 0x15 'Time
+ * Stamp Counter and Nominal Core Crystal Clock'.  If leaf 0x15 is not
+ * functional, as it is on Skylake/Kabylake, try 0x16 'Processor Frequency
+ * Information'.  Leaf 0x16 is described in the SDM as informational only, but
+ * we can use this value until late calibration is complete.
  */
 static bool
 tsc_freq_cpuid(uint64_t *res)
@@ -167,8 +166,8 @@ tsc_freq_cpuid(uint64_t *res)
 	return (false);
 }
 
-static void
-tsc_freq_intel(void)
+static bool
+tsc_freq_intel_brand(uint64_t *res)
 {
 	char brand[48];
 	u_int regs[4];
@@ -205,7 +204,7 @@ tsc_freq_intel(void)
 				i = 1000000;
 				break;
 			default:
-				return;
+				return (false);
 			}
 #define	C2D(c)	((c) - '0')
 			if (p[1] == '.') {
@@ -221,17 +220,39 @@ tsc_freq_intel(void)
 				freq *= i * 1000000;
 			}
 #undef C2D
-			tsc_freq = freq;
+			*res = freq;
+			return (true);
 		}
 	}
+	return (false);
 }
 
 static void
-probe_tsc_freq(void)
+tsc_freq_8254(uint64_t *res)
 {
-	uint64_t tmp_freq, tsc1, tsc2;
-	int no_cpuid_override;
+	uint64_t tsc1, tsc2;
+	int64_t overhead;
+	int count, i;
+
+	overhead = 0;
+	for (i = 0, count = 8; i < count; i++) {
+		tsc1 = rdtsc_ordered();
+		DELAY(0);
+		tsc2 = rdtsc_ordered();
+		if (i > 0)
+			overhead += tsc2 - tsc1;
+	}
+	overhead /= count;
+
+	tsc1 = rdtsc_ordered();
+	DELAY(100000);
+	tsc2 = rdtsc_ordered();
+	tsc_freq = (tsc2 - tsc1 - overhead) * 10;
+}
 
+static void
+probe_tsc_freq(void)
+{
 	if (cpu_power_ecx & CPUID_PERF_STAT) {
 		/*
 		 * XXX Some emulators expose host CPUID without actual support
@@ -287,50 +308,44 @@ probe_tsc_freq(void)
 		break;
 	}
 
-	if (tsc_skip_calibration) {
-		if (tsc_freq_cpuid(&tmp_freq))
-			tsc_freq = tmp_freq;
-		else if (cpu_vendor_id == CPU_VENDOR_INTEL)
-			tsc_freq_intel();
-		if (tsc_freq == 0)
-			tsc_disabled = 1;
-	} else {
+	if (tsc_freq_cpuid(&tsc_freq)) {
+		/*
+		 * If possible, use the value obtained from CPUID as the initial
+		 * frequency.  This will be refined later during boot but is
+		 * good enough for now.  The 8254 PIT is not functional on some
+		 * newer platforms anyway, so don't delay our boot for what
+		 * might be a garbage result.  Late calibration is required if
+		 * the initial frequency was obtained from CPUID.16H, as the
+		 * derived value may be off by as much as 1%.
+		 */
 		if (bootverbose)
-			printf("Calibrating TSC clock ... ");
-		tsc1 = rdtsc();
-		DELAY(1000000);
-		tsc2 = rdtsc();
-		tsc_freq = tsc2 - tsc1;
-
+			printf("Early TSC frequency %juHz derived from CPUID\n",
+			    (uintmax_t)tsc_freq);
+	} else if (tsc_skip_calibration) {
 		/*
-		 * If the difference between calibrated frequency and
-		 * the frequency reported by CPUID 0x15/0x16 leafs
-		 * differ significantly, this probably means that
-		 * calibration is bogus.  It happens on machines
-		 * without 8254 timer.  The BIOS rarely properly
-		 * reports it in FADT boot flags, so just compare the
-		 * frequencies directly.
+		 * Try to parse the brand string to obtain the nominal TSC
+		 * frequency.
 		 */
-		if (tsc_freq_cpuid(&tmp_freq) && qabs(tsc_freq - tmp_freq) >
-		    uqmin(tsc_freq, tmp_freq)) {
-			no_cpuid_override = 0;
-			TUNABLE_INT_FETCH("machdep.disable_tsc_cpuid_override",
-			    &no_cpuid_override);
-			if (!no_cpuid_override) {
-				if (bootverbose) {
-					printf(
-	"TSC clock: calibration freq %ju Hz, CPUID freq %ju Hz%s\n",
-					    (uintmax_t)tsc_freq,
-					    (uintmax_t)tmp_freq,
-					    no_cpuid_override ? "" :
-					    ", doing CPUID override");
-				}
-				tsc_freq = tmp_freq;
-			}
+		if (cpu_vendor_id == CPU_VENDOR_INTEL &&
+		    tsc_freq_intel_brand(&tsc_freq)) {
+			if (bootverbose)
+				printf(
+		    "Early TSC frequency %juHz derived from brand string\n",
+				    (uintmax_t)tsc_freq);
+		} else {
+			tsc_disabled = 1;
 		}
+	} else {
+		/*
+		 * Calibrate against the 8254 PIT.  This estimate will be
+		 * refined later in tsc_calib().
+		 */
+		tsc_freq_8254(&tsc_freq);
+		if (bootverbose)
+			printf(
+		    "Early TSC frequency %juHz calibrated from 8254 PIT\n",
+			    (uintmax_t)tsc_freq);
 	}
-	if (bootverbose)
-		printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq);
 }
 
 void
@@ -372,13 +387,18 @@ init_TSC(void)
 		break;
 	}
 #endif
-		
+
 	probe_tsc_freq();
 
 	/*
 	 * Inform CPU accounting about our boot-time clock rate.  This will
 	 * be updated if someone loads a cpufreq driver after boot that
 	 * discovers a new max frequency.
+	 *
+	 * The frequency may also be updated after late calibration is complete;
+	 * however, we register the TSC as the ticker now to avoid switching
+	 * counters after much of the kernel has already booted and potentially
+	 * sampled the CPU clock.
 	 */
 	if (tsc_freq != 0)
 		set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
@@ -654,11 +674,65 @@ init_TSC_tc(void)
 	if (tsc_freq != 0) {
 		tsc_timecounter.tc_frequency = tsc_freq >> shift;
 		tsc_timecounter.tc_priv = (void *)(intptr_t)shift;
-		tc_init(&tsc_timecounter);
+
+		/*
+		 * Timecounter registration is deferred until after late
+		 * calibration is finished.
+		 */
 	}
 }
 SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
 
+static void
+tsc_update_freq(uint64_t new_freq)
+{
+	atomic_store_rel_64(&tsc_freq, new_freq);
+	atomic_store_rel_64(&tsc_timecounter.tc_frequency,
+	    new_freq >> (int)(intptr_t)tsc_timecounter.tc_priv);
+}
+
+/*
+ * Perform late calibration of the TSC frequency once ACPI-based timecounters
+ * are available.
+ */
+static void
+tsc_calib(void *arg __unused)
+{
+	sbintime_t t_start, t_end;
+	uint64_t freq_khz, tsc_start, tsc_end;
+	register_t flags;
+	int cpu;
+
+	if (tsc_disabled)
+		return;
+
+	flags = intr_disable();
+	cpu = curcpu;
+	tsc_start = rdtsc_ordered();
+	t_start = sbinuptime();
+	intr_restore(flags);
+
+	DELAY(1000000);
+
+	thread_lock(curthread);
+	sched_bind(curthread, cpu);
+
+	flags = intr_disable();
+	tsc_end = rdtsc_ordered();
+	t_end = sbinuptime();
+	intr_restore(flags);
+
+	sched_unbind(curthread);
+	thread_unlock(curthread);
+
+	freq_khz = (SBT_1S / 1024) * (tsc_end - tsc_start) / (t_end - t_start);
+
+	tsc_update_freq(freq_khz * 1024);
+	tc_init(&tsc_timecounter);
+	set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant);
+}
+SYSINIT(tsc_calib, SI_SUB_CLOCKS + 1, SI_ORDER_ANY, tsc_calib, NULL);
+
 void
 resume_TSC(void)
 {
@@ -750,9 +824,7 @@ tsc_freq_changed(void *arg, const struct cf_level *level, int status)
 
 	/* Total setting for this level gives the new frequency in MHz. */
 	freq = (uint64_t)level->total_set.freq * 1000000;
-	atomic_store_rel_64(&tsc_freq, freq);
-	tsc_timecounter.tc_frequency =
-	    freq >> (int)(intptr_t)tsc_timecounter.tc_priv;
+	tsc_update_freq(freq);
 }
 
 static int
@@ -765,14 +837,10 @@ sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS)
 	if (freq == 0)
 		return (EOPNOTSUPP);
 	error = sysctl_handle_64(oidp, &freq, 0, req);
-	if (error == 0 && req->newptr != NULL) {
-		atomic_store_rel_64(&tsc_freq, freq);
-		atomic_store_rel_64(&tsc_timecounter.tc_frequency,
-		    freq >> (int)(intptr_t)tsc_timecounter.tc_priv);
-	}
+	if (error == 0 && req->newptr != NULL)
+		tsc_update_freq(freq);
 	return (error);
 }
-
 SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq,
     CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_machdep_tsc_freq, "QU",