git: fb99512f32c9 - stable/13 - Enable M_TSTMP in Chelsio cxgbe driver by creating a mechanism that can sync the time.

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 11 Nov 2022 01:48:19 UTC
The branch stable/13 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=fb99512f32c92aa4a9c939cd4dc4a6c1facbf0a9

commit fb99512f32c92aa4a9c939cd4dc4a6c1facbf0a9
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2022-09-20 19:13:16 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2022-11-11 01:25:52 +0000

    Enable M_TSTMP in Chelsio cxgbe driver by creating a mechanism that can sync the time.
    
    Chelsio has always been recording a timestamp in the mbuf (rcv_tstmp) but
    not setting the M_TSTMP bit in the mbuf flags. This is because the timestamp
    was just the free-running 60-bit clock. This change fixes that by keeping
    the hardware clock synchronized with system time: periodically (every 30
    seconds after startup) we sample the hardware timestamp together with the
    current nanosecond time. We keep several sets of samples around, and each
    set holds the current pair and the previous pair of timestamps. This allows
    us to set up a ratio between the two so we can correctly translate the
    time. Note that we take special care to split the timestamp into seconds
    (per the clock tick) and nanoseconds; otherwise 64-bit math would overflow.
    
    Reviewed by: np
    Sponsored by: Netflix Inc
    Differential Revision: https://reviews.freebsd.org/D36315
    
    (cherry picked from commit e398922eaf66978b5e556f6b4b095693c865f329)
---
 sys/dev/cxgbe/adapter.h |  14 +++++++
 sys/dev/cxgbe/t4_main.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++-
 sys/dev/cxgbe/t4_sge.c  |  86 +++++++++++++++++++++++++++++++++--------
 3 files changed, 182 insertions(+), 18 deletions(-)

diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index db72a7baf189..1fb0d9ff61c2 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -861,6 +861,15 @@ struct devnames {
 
 struct clip_entry;
 
+#define CNT_CAL_INFO 3
+struct clock_sync {
+	uint64_t hw_cur;
+	uint64_t hw_prev;
+	uint64_t rt_cur;
+	uint64_t rt_prev;
+	uint32_t gen;
+};
+
 struct adapter {
 	SLIST_ENTRY(adapter) link;
 	device_t dev;
@@ -980,6 +989,11 @@ struct adapter {
 	struct mtx sfl_lock;	/* same cache-line as sc_lock? but that's ok */
 	TAILQ_HEAD(, sge_fl) sfl;
 	struct callout sfl_callout;
+	struct callout cal_callout;
+	struct clock_sync cal_info[CNT_CAL_INFO];
+	int cal_current;
+	int cal_count;
+	uint32_t cal_gen;
 
 	/*
 	 * Driver code that can run when the adapter is suspended must use this
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index 3d4e2f3a7a71..a71b35abb5b7 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -325,6 +325,18 @@ static int t4_nofldtxq = -NOFLDTXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0,
     "Number of offload TX queues per port");
 
+static int t4_clocksync_fast = 1;
+SYSCTL_INT(_hw_cxgbe, OID_AUTO, csfast, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_fast, 0,
+    "During initial clock sync how fast do we update in seconds");
+
+static int t4_clocksync_normal = 30;
+SYSCTL_INT(_hw_cxgbe, OID_AUTO, csnormal, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_normal, 0,
+    "During normal clock sync how fast do we update in seconds");
+
+static int t4_fast_2_normal = 30;
+SYSCTL_INT(_hw_cxgbe, OID_AUTO, cscount, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_fast_2_normal, 0,
+    "How many clock syncs do we need to do to transition to slow");
+
 #define NOFLDRXQ 2
 static int t4_nofldrxq = -NOFLDRXQ;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0,
@@ -1110,6 +1122,79 @@ t4_ifnet_unit(struct adapter *sc, struct port_info *pi)
 	return (-1);
 }
 
+static inline uint64_t
+t4_get_ns_timestamp(struct timespec *ts)
+{
+	return ((ts->tv_sec * 1000000000) + ts->tv_nsec);
+}
+
+static void
+t4_calibration(void *arg)
+{
+	struct adapter *sc;
+	struct timespec ts;
+	struct clock_sync *cur, *nex;
+	int next_up;
+
+	sc = (struct adapter *)arg;
+
+	cur = &sc->cal_info[sc->cal_current];
+	next_up = (sc->cal_current + 1) % CNT_CAL_INFO;
+       	nex = &sc->cal_info[next_up];
+	if (__predict_false(sc->cal_count == 0)) {
+		/* First time in, just get the values in */
+		cur->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO);
+		nanouptime(&ts);
+		cur->rt_cur = t4_get_ns_timestamp(&ts);
+		sc->cal_count++;
+		goto done;
+	}
+	nex->hw_prev = cur->hw_cur;
+	nex->rt_prev = cur->rt_cur;
+	KASSERT((hw_off_limits(sc) == 0), "hw_off_limits at t4_calibtration");
+	nex->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO);
+	nanouptime(&ts);	
+	nex->rt_cur = t4_get_ns_timestamp(&ts);
+	if ((nex->hw_cur - nex->hw_prev) == 0) {
+		/* The clock is not advancing? */
+		sc->cal_count = 0;
+		atomic_store_rel_int(&cur->gen, 0);
+		goto done;
+	}
+	atomic_store_rel_int(&cur->gen, 0);
+	sc->cal_current = next_up;
+	sc->cal_gen++;
+	atomic_store_rel_int(&nex->gen, sc->cal_gen);
+	if (sc->cal_count < t4_fast_2_normal)
+		sc->cal_count++;
+done:
+	callout_reset_sbt_curcpu(&sc->cal_callout,
+				 ((sc->cal_count < t4_fast_2_normal)  ?
+				 t4_clocksync_fast : t4_clocksync_normal) * SBT_1S, 0,
+				 t4_calibration, sc, C_DIRECT_EXEC);
+}
+
+
+
+static void
+t4_calibration_start(struct adapter *sc)
+{
+	/*
+	 * Reset all calibration state and take the
+	 * first sample; t4_calibration() then re-arms
+	 * its own callout for subsequent samples.
+	 */
+	int i;
+
+	for (i = 0; i < CNT_CAL_INFO; i++) {
+		sc->cal_info[i].gen = 0;
+	}
+	sc->cal_current = 0;
+	sc->cal_count = 0;
+	sc->cal_gen = 0;
+	t4_calibration(sc);
+}
+
 static int
 t4_attach(device_t dev)
 {
@@ -1178,6 +1263,8 @@ t4_attach(device_t dev)
 
 	callout_init(&sc->ktls_tick, 1);
 
+	callout_init(&sc->cal_callout, 1);
+
 	refcount_init(&sc->vxlan_refcount, 0);
 
 	TASK_INIT(&sc->reset_task, 0, reset_adapter_task, sc);
@@ -1568,6 +1655,7 @@ t4_attach(device_t dev)
 		    "failed to attach all child ports: %d\n", rc);
 		goto done;
 	}
+	t4_calibration_start(sc);
 
 	device_printf(dev,
 	    "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n",
@@ -1744,7 +1832,8 @@ t4_detach_common(device_t dev)
 			free(pi, M_CXGBE);
 		}
 	}
-
+	callout_stop(&sc->cal_callout);
+	callout_drain(&sc->cal_callout);
 	device_delete_children(dev);
 	sysctl_ctx_free(&sc->ctx);
 	adapter_full_uninit(sc);
@@ -1922,7 +2011,6 @@ t4_suspend(device_t dev)
 
 	/* No more DMA or interrupts. */
 	stop_adapter(sc);
-
 	/* Quiesce all activity. */
 	for_each_port(sc, i) {
 		pi = sc->port[i];
@@ -1995,6 +2083,10 @@ t4_suspend(device_t dev)
 		quiesce_iq_fl(sc, &sc->sge.fwq, NULL);
 	}
 
+	/* Stop calibration */
+	callout_stop(&sc->cal_callout);
+	callout_drain(&sc->cal_callout);
+
 	/* Mark the adapter totally off limits. */
 	mtx_lock(&sc->reg_lock);
 	atomic_set_int(&sc->error_flags, HW_OFF_LIMITS);
@@ -2355,6 +2447,10 @@ t4_resume(device_t dev)
 			}
 		}
 	}
+
+	/* Reset all calibration */
+	t4_calibration_start(sc);	
+
 done:
 	if (rc == 0) {
 		sc->incarnation++;
diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c
index da0ac13c90e4..dd3d22020651 100644
--- a/sys/dev/cxgbe/t4_sge.c
+++ b/sys/dev/cxgbe/t4_sge.c
@@ -1510,15 +1510,73 @@ sort_before_lro(struct lro_ctrl *lro)
 }
 #endif
 
+#define CGBE_SHIFT_SCALE 10
+
 static inline uint64_t
-last_flit_to_ns(struct adapter *sc, uint64_t lf)
+t4_tstmp_to_ns(struct adapter *sc, uint64_t lf)
 {
-	uint64_t n = be64toh(lf) & 0xfffffffffffffff;	/* 60b, not 64b. */
+	struct clock_sync *cur, dcur;
+	uint64_t tstmp_sec, tstmp_nsec;
+	uint64_t hw_clocks;
+	uint64_t rt_cur_to_prev, res_s, res_n, res_s_modulo, res;
+	uint64_t hw_clk_div, cclk;
+	uint64_t hw_tstmp = lf & 0xfffffffffffffffULL;	/* 60b, not 64b. */
+	uint32_t gen;
 
-	if (n > UINT64_MAX / 1000000)
-		return (n / sc->params.vpd.cclk * 1000000);
-	else
-		return (n * 1000000 / sc->params.vpd.cclk);
+	do {
+		cur = &sc->cal_info[sc->cal_current];
+		gen = atomic_load_acq_int(&cur->gen);
+		if (gen == 0)
+			return (0);
+		dcur = *cur;
+		atomic_thread_fence_acq();
+	} while (gen != dcur.gen);
+
+	/*
+	 * Our goal here is to have a result that is:
+	 *
+	 * (                             (cur_time - prev_time)   )
+	 * ((hw_tstmp - hw_prev) *  ----------------------------- ) + prev_time
+	 * (                             (hw_cur - hw_prev)       )
+	 *
+	 * With the constraints that we cannot use float and we
+	 * don't want to overflow the uint64_t numbers we are using.
+	 *
+	 * The plan is to take the clocking value of the hw timestamps
+	 * and split them into seconds and nanosecond equivalent portions.
+	 * Then we operate on the two portions separately making sure to
+	 * bring back the carry over from the seconds when we divide.
+	 *
+	 * First up lets get the two divided into separate entities
+	 * i.e. the seconds. We use the clock frequency for this.
+	 * Note that vpd.cclk is in khz, we need it in raw hz so
+	 * convert to hz.
+	 */
+	cclk = sc->params.vpd.cclk * 1000;
+	hw_clocks = hw_tstmp - dcur.hw_prev;
+	tstmp_sec = hw_clocks / cclk;
+	tstmp_nsec = hw_clocks % cclk;
+	/* Now work with them separately */
+	rt_cur_to_prev = (dcur.rt_cur - dcur.rt_prev);
+	res_s = tstmp_sec * rt_cur_to_prev;
+	res_n = tstmp_nsec * rt_cur_to_prev;
+	/* Now lets get our divider */
+	hw_clk_div = dcur.hw_cur - dcur.hw_prev;
+	/* Make sure to save the remainder from the seconds divide */
+	res_s_modulo = res_s % hw_clk_div;
+	res_s /= hw_clk_div;
+	/* scale the remainder to where it should be */
+	res_s_modulo *= cclk;
+	/* Now add in the remainder */
+	res_n += res_s_modulo;
+	/* Now do the divide */
+	res_n /= hw_clk_div;
+	res_s *= cclk;
+	/* Recombine the two */
+	res = res_s + res_n;
+	/* And now add in the base time to get to the real timestamp */
+	res += dcur.rt_prev;
+	return (res);
 }
 
 static inline void
@@ -2066,17 +2124,13 @@ have_mbuf:
 
 	if (rxq->iq.flags & IQ_RX_TIMESTAMP) {
 		/*
-		 * Fill up rcv_tstmp but do not set M_TSTMP.
-		 * rcv_tstmp is not in the format that the
-		 * kernel expects and we don't want to mislead
-		 * it.  For now this is only for custom code
-		 * that knows how to interpret cxgbe's stamp.
+		 * Fill up rcv_tstmp and set M_TSTMP only if we
+		 * get a non-zero value back from t4_tstmp_to_ns().
 		 */
-		m0->m_pkthdr.rcv_tstmp =
-		    last_flit_to_ns(sc, d->rsp.u.last_flit);
-#ifdef notyet
-		m0->m_flags |= M_TSTMP;
-#endif
+		m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc,
+		    be64toh(d->rsp.u.last_flit));
+		if (m0->m_pkthdr.rcv_tstmp != 0)
+			m0->m_flags |= M_TSTMP;
 	}
 
 #ifdef NUMA