git: bfdd5b643d23 - main - tcp: improve testing of HPTS

From: Michael Tuexen <tuexen@FreeBSD.org>
Date: Mon, 13 Oct 2025 20:32:50 UTC
The branch main has been updated by tuexen:

URL: https://cgit.FreeBSD.org/src/commit/?id=bfdd5b643d23171c53920accc2f15f78e984dfae

commit bfdd5b643d23171c53920accc2f15f78e984dfae
Author:     Nick Banks <nickbanks@netflix.com>
AuthorDate: 2025-10-13 20:31:30 +0000
Commit:     Michael Tuexen <tuexen@FreeBSD.org>
CommitDate: 2025-10-13 20:31:30 +0000

    tcp: improve testing of HPTS
    
    Improve the HPTS API to allow testing and add several tests.
    
    Reviewed by:    tuexen
    Sponsored by:   Netflix, Inc.
---
 sys/conf/NOTES              |    1 +
 sys/conf/options            |    1 +
 sys/netinet/tcp_hpts.c      |  758 +++++++++++---------
 sys/netinet/tcp_hpts.h      |   30 +-
 sys/netinet/tcp_hpts_test.c | 1616 ++++++++++++++++++++++++++++++++++++++++++-
 sys/netinet/tcp_lro_hpts.c  |    3 +
 sys/tests/ktest.h           |    7 +
 7 files changed, 2055 insertions(+), 361 deletions(-)

diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index ea9b2667607e..a25ee8f6e1af 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -679,6 +679,7 @@ options 	TCP_OFFLOAD		# TCP offload support.
 options  	TCP_RFC7413		# TCP Fast Open
 
 options  	TCPHPTS
+#options 	TCP_HPTS_KTEST		# Add KTEST support for HPTS
 
 # In order to enable IPSEC you MUST also add device crypto to
 # your kernel configuration
diff --git a/sys/conf/options b/sys/conf/options
index b48ad1cf42cf..0b795a8d28fb 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -231,6 +231,7 @@ SYSVSEM		opt_sysvipc.h
 SYSVSHM		opt_sysvipc.h
 SW_WATCHDOG	opt_watchdog.h
 TCPHPTS
+TCP_HPTS_KTEST	opt_inet.h
 TCP_REQUEST_TRK opt_global.h
 TCP_ACCOUNTING	opt_global.h
 TCP_BBR		opt_inet.h
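
Taken together, the two hunks above wire the new option into the build: sys/conf/NOTES documents it (commented out by default) and sys/conf/options attaches it to opt_inet.h. A minimal kernel configuration fragment to compile the test hooks in would therefore look like this (a sketch only; both lines come straight from the NOTES entry above):

    options 	TCPHPTS
    options 	TCP_HPTS_KTEST		# Add KTEST support for HPTS
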
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 133703a5ede1..2631e79ab034 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -149,27 +149,44 @@
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
 #include <netinet/tcp_log_buf.h>
 
 #ifdef tcp_offload
 #include <netinet/tcp_offload.h>
 #endif
 
-/*
- * The hpts uses a 102400 wheel. The wheel
- * defines the time in 10 usec increments (102400 x 10).
- * This gives a range of 10usec - 1024ms to place
- * an entry within. If the user requests more than
- * 1.024 second, a remainder is attached and the hpts
- * when seeing the remainder will re-insert the
- * inpcb forward in time from where it is until
- * the remainder is zero.
- */
+/* Global instance for TCP HPTS */
+struct tcp_hptsi *tcp_hptsi_pace;
+
+/* Default function table for production use. */
+const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = {
+	.microuptime = microuptime,
+	.swi_add = swi_add,
+	.swi_remove = swi_remove,
+	.swi_sched = swi_sched,
+	.intr_event_bind = intr_event_bind,
+	.intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset,
+	.callout_init = callout_init,
+	.callout_reset_sbt_on = callout_reset_sbt_on,
+	._callout_stop_safe = _callout_stop_safe,
+};
 
-#define NUM_OF_HPTSI_SLOTS 102400
+#ifdef TCP_HPTS_KTEST
+#define microuptime pace->funcs->microuptime
+#define swi_add pace->funcs->swi_add
+#define swi_remove pace->funcs->swi_remove
+#define swi_sched pace->funcs->swi_sched
+#define intr_event_bind pace->funcs->intr_event_bind
+#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset
+#define callout_init pace->funcs->callout_init
+#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on
+#define _callout_stop_safe pace->funcs->_callout_stop_safe
+#endif
 
-/* The number of connections after which the dynamic sleep logic kicks in. */
-#define DEFAULT_CONNECTION_THRESHOLD 100
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+
+static void tcp_hpts_thread(void *ctx);
 
 /*
  * When using the hpts, a TCP stack must make sure
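
This first hunk carries the core of the testability mechanism: tcp_hptsi_default_funcs holds the real kernel entry points, and when TCP_HPTS_KTEST is defined the familiar names (microuptime, swi_sched, callout_reset_sbt_on, and so on) become macros that indirect through the per-instance function table. That is also why later hunks add the otherwise odd-looking "pace = hpts->p_hptsi" assignments under #ifdef TCP_HPTS_KTEST: the macros expand to pace->funcs->..., so each affected function needs a variable named pace in scope. A standalone sketch of the same pattern (illustrative only; struct pacer, real_microuptime, and fake_microuptime are invented for this example and are not part of the commit):

    #include <stdio.h>

    struct my_timeval { long tv_sec, tv_usec; };

    /* Per-instance table of injectable dependencies, like tcp_hptsi_funcs. */
    struct funcs { void (*microuptime)(struct my_timeval *); };
    struct pacer { const struct funcs *funcs; };

    /* Production implementation. */
    static void real_microuptime(struct my_timeval *tv) { tv->tv_sec = 1; tv->tv_usec = 500; }

    static const struct funcs default_funcs = { .microuptime = real_microuptime };

    /*
     * The KTEST trick: the unqualified name is rewritten to go through the
     * table, so call sites stay untouched but expect a "pace" in scope.
     */
    #define microuptime pace->funcs->microuptime

    static long now_usec(struct pacer *pace) {
            struct my_timeval tv;

            microuptime(&tv);   /* expands to pace->funcs->microuptime(&tv) */
            return (tv.tv_sec * 1000000L + tv.tv_usec);
    }

    /* A test injects a fake that returns controlled, virtual time. */
    static void fake_microuptime(struct my_timeval *tv) { tv->tv_sec = 42; tv->tv_usec = 0; }

    int main(void) {
            const struct funcs test_funcs = { .microuptime = fake_microuptime };
            struct pacer prod = { &default_funcs };
            struct pacer test = { &test_funcs };

            printf("prod=%ld usec, test=%ld usec\n", now_usec(&prod), now_usec(&test));
            return (0);
    }
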
@@ -212,75 +229,14 @@
  *
  */
 
-/* Each hpts has its own p_mtx which is used for locking */
-#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
-#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
-#define	HPTS_TRYLOCK(hpts)	mtx_trylock(&(hpts)->p_mtx)
-#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)
-struct tcp_hpts_entry {
-	/* Cache line 0x00 */
-	struct mtx p_mtx;	/* Mutex for hpts */
-	struct timeval p_mysleep;	/* Our min sleep time */
-	uint64_t syscall_cnt;
-	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */
-	uint16_t p_hpts_active; /* Flag that says hpts is awake  */
-	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
-	uint32_t p_runningslot; /* Current slot we are at if we are running */
-	uint32_t p_prev_slot;	/* Previous slot we were on */
-	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
-	uint32_t p_nxt_slot;	/* The next slot outside the current range of
-				 * slots that the hpts is running on. */
-	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
-	uint8_t p_direct_wake :1, /* boolean */
-		p_on_min_sleep:1, /* boolean */
-		p_hpts_wake_scheduled:1, /* boolean */
-		hit_callout_thresh:1,
-		p_avail:4;
-	uint8_t p_fill[3];	  /* Fill to 32 bits */
-	/* Cache line 0x40 */
-	struct hptsh {
-		TAILQ_HEAD(, tcpcb)	head;
-		uint32_t		count;
-		uint32_t		gencnt;
-	} *p_hptss;			/* Hptsi wheel */
-	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
-					 * of 255ms */
-	uint32_t overidden_sleep;	/* what was overrided by min-sleep for logging */
-	uint32_t saved_curslot;		/* for logging */
-	uint32_t saved_prev_slot;       /* for logging */
-	uint32_t p_delayed_by;	/* How much were we delayed by */
-	/* Cache line 0x80 */
-	struct sysctl_ctx_list hpts_ctx;
-	struct sysctl_oid *hpts_root;
-	struct intr_event *ie;
-	void *ie_cookie;
-	uint16_t p_num;		/* The hpts number one per cpu */
-	uint16_t p_cpu;		/* The hpts CPU */
-	/* There is extra space in here */
-	/* Cache line 0x100 */
-	struct callout co __aligned(CACHE_LINE_SIZE);
-}               __aligned(CACHE_LINE_SIZE);
-
-static struct tcp_hptsi {
-	struct cpu_group **grps;
-	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
-	uint32_t *cts_last_ran;
-	uint32_t grp_cnt;
-	uint32_t rp_num_hptss;	/* Number of hpts threads */
-} tcp_pace;
-
-static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 #ifdef RSS
-static int tcp_bind_threads = 1;
+int tcp_bind_threads = 1;
 #else
-static int tcp_bind_threads = 2;
+int tcp_bind_threads = 2;
 #endif
 static int tcp_use_irq_cpu = 0;
 static int hpts_does_tp_logging = 0;
-
-static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
-static void tcp_hpts_thread(void *ctx);
-
+static int32_t tcp_hpts_precision = 120;
 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
 static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
 static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
@@ -291,23 +247,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TCP Hpts statistics");
 
-#define	timersub(tvp, uvp, vvp)						\
-	do {								\
-		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
-		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
-		if ((vvp)->tv_usec < 0) {				\
-			(vvp)->tv_sec--;				\
-			(vvp)->tv_usec += 1000000;			\
-		}							\
-	} while (0)
-
-static int32_t tcp_hpts_precision = 120;
-
-static struct hpts_domain_info {
-	int count;
-	int cpu[MAXCPU];
-} hpts_domains[MAXMEMDOM];
-
 counter_u64_t hpts_hopelessly_behind;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -455,14 +394,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
     &tcp_hpts_no_wake_over_thresh, 0,
     "When we are over the threshold on the pacer do we prohibit wakeups?");
 
-static uint16_t
-hpts_random_cpu(void)
+uint16_t
+tcp_hptsi_random_cpu(struct tcp_hptsi *pace)
 {
 	uint16_t cpuid;
 	uint32_t ran;
 
 	ran = arc4random();
-	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+	cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);
 	return (cpuid);
 }
 
@@ -504,11 +443,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
 	}
 }
 
+/*
+ * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI
+ * for the HPTS entry to run.
+ */
 static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
+tcp_hpts_sleep_timeout(void *arg)
 {
+#ifdef TCP_HPTS_KTEST
+	struct tcp_hptsi *pace;
+#endif
+	struct tcp_hpts_entry *hpts;
+
+	hpts = (struct tcp_hpts_entry *)arg;
+#ifdef TCP_HPTS_KTEST
+	pace = hpts->p_hptsi;
+#endif
+	swi_sched(hpts->ie_cookie, 0);
+}
+
+/*
+ * Reset the HPTS callout timer with the provided timeval. Returns the result
+ * of the callout_reset_sbt_on() function.
+ */
+static int
+tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+#ifdef TCP_HPTS_KTEST
+	struct tcp_hptsi *pace;
+#endif
+	sbintime_t sb;
+
+#ifdef TCP_HPTS_KTEST
+	pace = hpts->p_hptsi;
+#endif
+
+	/* Store off to make visible the actual sleep time */
+	hpts->sleeping = tv->tv_usec;
+
+	sb = tvtosbt(*tv);
+	return (callout_reset_sbt_on(
+		    &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu,
+		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))));
+}
+
+/*
+ * Schedules the SWI for the HPTS entry to run, if not already scheduled or
+ * running.
+ */
+void
+tcp_hpts_wake(struct tcp_hpts_entry *hpts)
+{
+#ifdef TCP_HPTS_KTEST
+	struct tcp_hptsi *pace;
+#endif
+
 	HPTS_MTX_ASSERT(hpts);
 
+#ifdef TCP_HPTS_KTEST
+	pace = hpts->p_hptsi;
+#endif
+
 	if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
 		hpts->p_direct_wake = 0;
 		return;
@@ -519,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)
 	}
 }
 
-static void
-hpts_timeout_swi(void *arg)
-{
-	struct tcp_hpts_entry *hpts;
-
-	hpts = (struct tcp_hpts_entry *)arg;
-	swi_sched(hpts->ie_cookie, 0);
-}
-
 static void
 tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
 {
@@ -556,13 +542,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
 }
 
 static struct tcp_hpts_entry *
-tcp_hpts_lock(struct tcpcb *tp)
+tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 
 	INP_LOCK_ASSERT(tptoinpcb(tp));
 
-	hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
+	hpts = pace->rp_ent[tp->t_hpts_cpu];
 	HPTS_LOCK(hpts);
 
 	return (hpts);
@@ -589,11 +575,10 @@ tcp_hpts_release(struct tcpcb *tp)
  * and has never received a first packet.
  */
 void
-tcp_hpts_init(struct tcpcb *tp)
+__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
-
 	if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) {
-		tp->t_hpts_cpu = hpts_random_cpu();
+		tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);
 		MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
 	}
 }
@@ -605,14 +590,14 @@ tcp_hpts_init(struct tcpcb *tp)
  * INP lock and then get the hpts lock.
  */
 void
-tcp_hpts_remove(struct tcpcb *tp)
+__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 	struct hptsh *hptsh;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
-	hpts = tcp_hpts_lock(tp);
+	hpts = tcp_hpts_lock(pace, tp);
 	if (tp->t_in_hpts == IHPTS_ONQUEUE) {
 		hptsh = &hpts->p_hptss[tp->t_hpts_slot];
 		tp->t_hpts_request = 0;
@@ -847,10 +832,11 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
 
 void
 #ifdef INVARIANTS
-__tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, int32_t line,
-	struct hpts_diag *diag)
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+	int32_t line, struct hpts_diag *diag)
 #else
-tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+	struct hpts_diag *diag)
 #endif
 {
 	struct tcp_hpts_entry *hpts;
@@ -868,7 +854,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
 	 * current run (if up) or where it was when it stopped if it is
 	 * sleeping.
 	 */
-	hpts = tcp_hpts_lock(tp);
+	hpts = tcp_hpts_lock(pace, tp);
 	microuptime(&tv);
 	if (diag) {
 		memset(diag, 0, sizeof(struct hpts_diag));
@@ -903,7 +889,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
 			 * timeout is not 1.
 			 */
 			hpts->p_direct_wake = 1;
-			tcp_wakehpts(hpts);
+			tcp_hpts_wake(hpts);
 		}
 		HPTS_UNLOCK(hpts);
 
@@ -991,7 +977,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
 	 */
 	if (need_wakeup) {
 		hpts->p_direct_wake = 1;
-		tcp_wakehpts(hpts);
+		tcp_hpts_wake(hpts);
 		if (diag) {
 			diag->need_new_to = 0;
 			diag->co_ret = 0xffff0000;
@@ -999,7 +985,6 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
 	} else if (need_new_to) {
 		int32_t co_ret;
 		struct timeval tv;
-		sbintime_t sb;
 
 		tv.tv_sec = 0;
 		tv.tv_usec = 0;
@@ -1007,11 +992,8 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
 			tv.tv_sec++;
 			need_new_to -= HPTS_USEC_IN_SEC;
 		}
-		tv.tv_usec = need_new_to;
-		sb = tvtosbt(tv);
-		co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
-					      hpts_timeout_swi, hpts, hpts->p_cpu,
-					      (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+		tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */
+		co_ret = tcp_hpts_sleep(hpts, &tv);
 		if (diag) {
 			diag->need_new_to = need_new_to;
 			diag->co_ret = co_ret;
@@ -1021,7 +1003,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
 }
 
 static uint16_t
-hpts_cpuid(struct tcpcb *tp, int *failed)
+hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	u_int cpuid;
@@ -1048,7 +1030,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
 #ifdef RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid == NETISR_CPUID_NONE)
-		return (hpts_random_cpu());
+		return (tcp_hptsi_random_cpu(pace));
 	else
 		return (cpuid);
 #endif
@@ -1059,7 +1041,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
 	 */
 	if (inp->inp_flowtype == M_HASHTYPE_NONE) {
 		counter_u64_add(cpu_uses_random, 1);
-		return (hpts_random_cpu());
+		return (tcp_hptsi_random_cpu(pace));
 	}
 	/*
 	 * Hash to a thread based on the flowid.  If we are using numa,
@@ -1074,7 +1056,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
 #ifdef NUMA
 	} else {
 		/* Hash into the cpu's that use that domain */
-		di = &hpts_domains[inp->inp_numa_domain];
+		di = &pace->domains[inp->inp_numa_domain];
 		cpuid = di->cpu[inp->inp_flowid % di->count];
 	}
 #endif
@@ -1112,9 +1094,10 @@ tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run)
 	return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT));
 }
 
-static int32_t
+int32_t
 tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
 {
+	struct tcp_hptsi *pace;
 	struct tcpcb *tp;
 	struct timeval tv;
 	int32_t slots_to_run, i, error;
@@ -1132,12 +1115,17 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
 
 	HPTS_MTX_ASSERT(hpts);
 	NET_EPOCH_ASSERT();
+
+	pace = hpts->p_hptsi;
+	MPASS(pace != NULL);
+
 	/* record previous info for any logging */
 	hpts->saved_curslot = hpts->p_cur_slot;
 	hpts->saved_prev_slot = hpts->p_prev_slot;
 
-	cts_last_run = tcp_pace.cts_last_ran[hpts->p_num];
-	tcp_pace.cts_last_ran[hpts->p_num] = cts = tcp_get_usecs(&tv);
+	microuptime(&tv);
+	cts_last_run = pace->cts_last_ran[hpts->p_cpu];
+	pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv);
 
 	orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);
 	if ((hpts->p_on_queue_cnt == 0) ||
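
Between runs, the pacer decides whether wall-clock progress moved it onto a new wheel slot. With the 102400-slot, 10-usec-per-slot wheel described in the comment this commit removes, a microsecond timestamp maps to a slot by integer division, and tcp_hpts_different_slots() (shown earlier in this file) simply compares those quotients. A standalone sketch with worked values (hedged: cts_to_wheel() is not shown in this excerpt, so its body here is a plausible reconstruction from the constants):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define HPTS_USECS_PER_SLOT	10	/* 10 usec per slot, per the old comment */
    #define NUM_OF_HPTSI_SLOTS	102400	/* 102400 slots -> 1.024 s horizon */

    /* Plausible shape of cts_to_wheel(): usec timestamp -> wheel slot. */
    static uint32_t
    cts_to_wheel(uint32_t cts)
    {
    	return ((cts / HPTS_USECS_PER_SLOT) % NUM_OF_HPTSI_SLOTS);
    }

    /* Mirrors tcp_hpts_different_slots() from the hunk above. */
    static bool
    different_slots(uint32_t cts, uint32_t cts_last_run)
    {
    	return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT));
    }

    int
    main(void)
    {
    	assert(!different_slots(1000005, 1000009));	/* same 10 usec slot */
    	assert(different_slots(1000005, 1000015));	/* adjacent slots */
    	assert(cts_to_wheel(1024000) == 0);		/* wraps after 1.024 s */
    	return (0);
    }
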
@@ -1383,7 +1371,7 @@ again:
 				 * gets added to the hpts (not this one)
 				 * :-)
 				 */
-				tcp_set_hpts(tp);
+				__tcp_set_hpts(pace, tp);
 			}
 			CURVNET_SET(inp->inp_vnet);
 			/* Lets do any logging that we might want to */
@@ -1445,8 +1433,9 @@ no_one:
 	 * more slots (if we did not hit eno-bufs).
 	 */
 	hpts->p_prev_slot = hpts->p_cur_slot;
+	microuptime(&tv);
 	cts_last_run = cts;
-	cts = tcp_get_usecs(&tv);
+	cts = tcp_tv_to_usec(&tv);
 	if (!from_callout || (loop_cnt > max_pacer_loops)) {
 		/*
 		 * Something is serious slow we have
@@ -1478,7 +1467,7 @@ no_one:
 		goto again;
 	}
 no_run:
-	tcp_pace.cts_last_ran[hpts->p_num] = cts;
+	pace->cts_last_ran[hpts->p_cpu] = cts;
 	/*
 	 * Set flag to tell that we are done for
 	 * any slot input that happens during
@@ -1512,7 +1501,8 @@ no_run:
 		 cts_last_run, cts, loop_cnt, wrap_loop_cnt));
 
 	if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) {
-		cts = tcp_get_usecs(&tv);
+		microuptime(&tv);
+		cts = tcp_tv_to_usec(&tv);
 		hpts->p_cur_slot = cts_to_wheel(cts);
 		counter_u64_add(hpts_loops, 1);
 		goto again;
@@ -1528,16 +1518,16 @@ no_run:
 }
 
 void
-tcp_set_hpts(struct tcpcb *tp)
+__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 	int failed;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
-	hpts = tcp_hpts_lock(tp);
+	hpts = tcp_hpts_lock(pace, tp);
 	if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
-		tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
+		tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);
 		if (failed == 0)
 			tp->t_flags2 |= TF2_HPTS_CPU_SET;
 	}
@@ -1545,33 +1535,35 @@ tcp_set_hpts(struct tcpcb *tp)
 }
 
 static struct tcp_hpts_entry *
-tcp_choose_hpts_to_run(void)
+tcp_choose_hpts_to_run(struct tcp_hptsi *pace)
 {
+	struct timeval tv;
 	int i, oldest_idx, start, end;
 	uint32_t cts, time_since_ran, calc;
 
-	cts = tcp_get_usecs(NULL);
+	microuptime(&tv);
+	cts = tcp_tv_to_usec(&tv);
 	time_since_ran = 0;
 	/* Default is all one group */
 	start = 0;
-	end = tcp_pace.rp_num_hptss;
+	end = pace->rp_num_hptss;
 	/*
 	 * If we have more than one L3 group figure out which one
 	 * this CPU is in.
 	 */
-	if (tcp_pace.grp_cnt > 1) {
-		for (i = 0; i < tcp_pace.grp_cnt; i++) {
-			if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) {
-				start = tcp_pace.grps[i]->cg_first;
-				end = (tcp_pace.grps[i]->cg_last + 1);
+	if (pace->grp_cnt > 1) {
+		for (i = 0; i < pace->grp_cnt; i++) {
+			if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) {
+				start = pace->grps[i]->cg_first;
+				end = (pace->grps[i]->cg_last + 1);
 				break;
 			}
 		}
 	}
 	oldest_idx = -1;
 	for (i = start; i < end; i++) {
-		if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i]))
-			calc = cts - tcp_pace.cts_last_ran[i];
+		if (TSTMP_GT(cts, pace->cts_last_ran[i]))
+			calc = cts - pace->cts_last_ran[i];
 		else
 			calc = 0;
 		if (calc > time_since_ran) {
@@ -1580,9 +1572,9 @@ tcp_choose_hpts_to_run(void)
 		}
 	}
 	if (oldest_idx >= 0)
-		return(tcp_pace.rp_ent[oldest_idx]);
+		return(pace->rp_ent[oldest_idx]);
 	else
-		return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+		return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);
 }
 
 static void
@@ -1592,7 +1584,7 @@ __tcp_run_hpts(void)
 	struct tcp_hpts_entry *hpts;
 	int slots_ran;
 
-	hpts = tcp_choose_hpts_to_run();
+	hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);
 
 	if (hpts->p_hpts_active) {
 		/* Already active */
@@ -1613,7 +1605,6 @@ __tcp_run_hpts(void)
 	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
 		if (slots_ran > slots_indicate_less_sleep) {
 			struct timeval tv;
-			sbintime_t sb;
 
 			hpts->p_mysleep.tv_usec /= 2;
 			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
@@ -1637,12 +1628,7 @@ __tcp_run_hpts(void)
 			 * the dynamic value and set the on_min_sleep
 			 * flag so we will not be awoken.
 			 */
-			sb = tvtosbt(tv);
-			/* Store off to make visible the actual sleep time */
-			hpts->sleeping = tv.tv_usec;
-			callout_reset_sbt_on(&hpts->co, sb, 0,
-					     hpts_timeout_swi, hpts, hpts->p_cpu,
-					     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+			(void)tcp_hpts_sleep(hpts, &tv);
 		} else if (slots_ran < slots_indicate_more_sleep) {
 			/* For the further sleep, don't reschedule  hpts */
 			hpts->p_mysleep.tv_usec *= 2;
@@ -1660,17 +1646,22 @@ out_with_mtx:
 static void
 tcp_hpts_thread(void *ctx)
 {
+#ifdef TCP_HPTS_KTEST
+	struct tcp_hptsi *pace;
+#endif
 	struct tcp_hpts_entry *hpts;
 	struct epoch_tracker et;
 	struct timeval tv;
-	sbintime_t sb;
 	int slots_ran;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
+#ifdef TCP_HPTS_KTEST
+	pace = hpts->p_hptsi;
+#endif
 	HPTS_LOCK(hpts);
 	if (hpts->p_direct_wake) {
 		/* Signaled by input or output with low occupancy count. */
-		callout_stop(&hpts->co);
+		_callout_stop_safe(&hpts->co, 0);
 		counter_u64_add(hpts_direct_awakening, 1);
 	} else {
 		/* Timed out, the normal case. */
@@ -1799,18 +1790,11 @@ tcp_hpts_thread(void *ctx)
 	hpts->p_hpts_active = 0;
 back_to_sleep:
 	hpts->p_direct_wake = 0;
-	sb = tvtosbt(tv);
-	/* Store off to make visible the actual sleep time */
-	hpts->sleeping = tv.tv_usec;
-	callout_reset_sbt_on(&hpts->co, sb, 0,
-			     hpts_timeout_swi, hpts, hpts->p_cpu,
-			     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+	(void)tcp_hpts_sleep(hpts, &tv);
 	NET_EPOCH_EXIT(et);
 	HPTS_UNLOCK(hpts);
 }
 
-#undef	timersub
-
 static int32_t
 hpts_count_level(struct cpu_group *cg)
 {
@@ -1847,57 +1831,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g
 	}
 }
 
-static void
-tcp_hpts_mod_load(void)
+/*
+ * Initialize a tcp_hptsi structure. This performs the core initialization
+ * without starting threads.
+ */
+struct tcp_hptsi*
+tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)
 {
+	struct tcp_hptsi *pace;
 	struct cpu_group *cpu_top;
-	int32_t error __diagused;
-	int32_t i, j, bound = 0, created = 0;
+	uint32_t i, j, cts;
+	int32_t count;
 	size_t sz, asz;
 	struct timeval tv;
-	sbintime_t sb;
 	struct tcp_hpts_entry *hpts;
-	struct pcpu *pc;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
-	int count, domain;
 
+	KASSERT(funcs != NULL, ("funcs is NULL"));
+
+	/* Allocate the main structure */
+	pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO);
+	if (pace == NULL)
+		return (NULL);
+
+	memset(pace, 0, sizeof(*pace));
+	pace->funcs = funcs;
+
+	/* Setup CPU topology information */
 #ifdef SMP
 	cpu_top = smp_topo();
 #else
 	cpu_top = NULL;
 #endif
-	tcp_pace.rp_num_hptss = ncpus;
-	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
-	hpts_loops = counter_u64_alloc(M_WAITOK);
-	back_tosleep = counter_u64_alloc(M_WAITOK);
-	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
-	wheel_wrap = counter_u64_alloc(M_WAITOK);
-	hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
-	hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
-	hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
-	hpts_direct_call = counter_u64_alloc(M_WAITOK);
-	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
-	cpu_uses_random = counter_u64_alloc(M_WAITOK);
+	pace->rp_num_hptss = ncpus;
 
-	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
-	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
-	sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
-	tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
-	tcp_pace.grp_cnt = 0;
+	/* Allocate hpts entry array */
+	sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+	pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+
+	/* Allocate timestamp tracking array */
+	sz = (sizeof(uint32_t) * pace->rp_num_hptss);
+	pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
+
+	/* Setup CPU groups */
 	if (cpu_top == NULL) {
-		tcp_pace.grp_cnt = 1;
+		pace->grp_cnt = 1;
 	} else {
 		/* Find out how many cache level 3 domains we have */
 		count = 0;
-		tcp_pace.grp_cnt = hpts_count_level(cpu_top);
-		if (tcp_pace.grp_cnt == 0) {
-			tcp_pace.grp_cnt = 1;
+		pace->grp_cnt = hpts_count_level(cpu_top);
+		if (pace->grp_cnt == 0) {
+			pace->grp_cnt = 1;
 		}
-		sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
-		tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
+		sz = (pace->grp_cnt * sizeof(struct cpu_group *));
+		pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);
 		/* Now populate the groups */
-		if (tcp_pace.grp_cnt == 1) {
+		if (pace->grp_cnt == 1) {
 			/*
 			 * All we need is the top level all cpu's are in
 			 * the same cache so when we use grp[0]->cg_mask
@@ -1905,188 +1895,290 @@ tcp_hpts_mod_load(void)
 			 * all cpu's in it. The level here is probably
 			 * zero which is ok.
 			 */
-			tcp_pace.grps[0] = cpu_top;
+			pace->grps[0] = cpu_top;
 		} else {
 			/*
 			 * Here we must find all the level three cache domains
 			 * and setup our pointers to them.
 			 */
 			count = 0;
-			hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
+			hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);
 		}
 	}
+
+	/* Cache the current time for initializing the hpts entries */
+	microuptime(&tv);
+	cts = tcp_tv_to_usec(&tv);
+
+	/* Initialize each hpts entry */
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
-	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
-		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+	for (i = 0; i < pace->rp_num_hptss; i++) {
+		pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
 		    M_TCPHPTS, M_WAITOK | M_ZERO);
-		tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
-		hpts = tcp_pace.rp_ent[i];
-		/*
-		 * Init all the hpts structures that are not specifically
-		 * zero'd by the allocations. Also lets attach them to the
-		 * appropriate sysctl block as well.
-		 */
-		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
-		    "hpts", MTX_DEF | MTX_DUPOK);
-		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
-			TAILQ_INIT(&hpts->p_hptss[j].head);
-			hpts->p_hptss[j].count = 0;
-			hpts->p_hptss[j].gencnt = 0;
-		}
-		sysctl_ctx_init(&hpts->hpts_ctx);
-		sprintf(unit, "%d", i);
-		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
-		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
-		    OID_AUTO,
-		    unit,
-		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
-		    "");
-		SYSCTL_ADD_INT(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
-		    &hpts->p_on_queue_cnt, 0,
-		    "Count TCB's awaiting output processing");
-		SYSCTL_ADD_U16(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "active", CTLFLAG_RD,
-		    &hpts->p_hpts_active, 0,
-		    "Is the hpts active");
-		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "curslot", CTLFLAG_RD,
-		    &hpts->p_cur_slot, 0,
-		    "What the current running pacers goal");
-		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "runslot", CTLFLAG_RD,
-		    &hpts->p_runningslot, 0,
-		    "What the running pacers current slot is");
-		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "lastran", CTLFLAG_RD,
-		    &tcp_pace.cts_last_ran[i], 0,
-		    "The last usec timestamp that this hpts ran");
-		SYSCTL_ADD_LONG(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
-		    &hpts->p_mysleep.tv_usec,
-		    "What the running pacers is using for p_mysleep.tv_usec");
-		SYSCTL_ADD_U64(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "now_sleeping", CTLFLAG_RD,
-		    &hpts->sleeping, 0,
-		    "What the running pacers is actually sleeping for");
-		SYSCTL_ADD_U64(&hpts->hpts_ctx,
-		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "syscall_cnt", CTLFLAG_RD,
-		    &hpts->syscall_cnt, 0,
-		    "How many times we had syscalls on this hpts");
+		pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS,
+		    M_WAITOK | M_ZERO);
+		hpts = pace->rp_ent[i];
 
+		/* Basic initialization */
 		hpts->p_hpts_sleep_time = hpts_sleep_max;
-		hpts->p_num = i;
-		tcp_pace.cts_last_ran[i] = tcp_get_u64_usecs(&tv);
-		hpts->p_cur_slot = cts_to_wheel(tcp_pace.cts_last_ran[i]);
+		hpts->p_cpu = i;
+		pace->cts_last_ran[i] = cts;
+		hpts->p_cur_slot = cts_to_wheel(cts);
 		hpts->p_prev_slot = hpts->p_cur_slot;
-		hpts->p_cpu = 0xffff;
 		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
+		hpts->p_hptsi = pace;
+		mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts",
+		    MTX_DEF | MTX_DUPOK);
+		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+			TAILQ_INIT(&hpts->p_hptss[j].head);
+		}
+
+		/* Setup SYSCTL if requested */
+		if (enable_sysctl) {
+			sysctl_ctx_init(&hpts->hpts_ctx);
+			sprintf(unit, "%d", i);
+			hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+			    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+			    OID_AUTO,
+			    unit,
+			    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+			    "");
+			SYSCTL_ADD_INT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "out_qcnt", CTLFLAG_RD,
+			    &hpts->p_on_queue_cnt, 0,
+			    "Count TCB's awaiting output processing");
+			SYSCTL_ADD_U16(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "active", CTLFLAG_RD,
+			    &hpts->p_hpts_active, 0,
+			    "Is the hpts active");
+			SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "curslot", CTLFLAG_RD,
+			    &hpts->p_cur_slot, 0,
+			    "What the current running pacers goal");
+			SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "runslot", CTLFLAG_RD,
+			    &hpts->p_runningslot, 0,
+			    "What the running pacers current slot is");
+			SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "lastran", CTLFLAG_RD,
+			    &pace->cts_last_ran[i], 0,
+			    "The last usec timestamp that this hpts ran");
+			SYSCTL_ADD_LONG(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+			    &hpts->p_mysleep.tv_usec,
+			    "What the running pacers is using for p_mysleep.tv_usec");
+			SYSCTL_ADD_U64(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "now_sleeping", CTLFLAG_RD,
+			    &hpts->sleeping, 0,
+			    "What the running pacers is actually sleeping for");
+			SYSCTL_ADD_U64(&hpts->hpts_ctx,
+			    SYSCTL_CHILDREN(hpts->hpts_root),
+			    OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+			    &hpts->syscall_cnt, 0,
+			    "How many times we had syscalls on this hpts");
+		}
 	}
-	/* Don't try to bind to NUMA domains if we don't have any */
-	if (vm_ndomains == 1 && tcp_bind_threads == 2)
-		tcp_bind_threads = 0;
 
-	/*
-	 * Now lets start ithreads to handle the hptss.
-	 */
-	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
-		hpts = tcp_pace.rp_ent[i];
-		hpts->p_cpu = i;
+	return (pace);
+}
+
+/*
+ * Create threads for a tcp_hptsi structure and start timers for the current
+ * (minimum) sleep interval.
+ */
+void
+tcp_hptsi_start(struct tcp_hptsi *pace)
+{
+	struct tcp_hpts_entry *hpts;
+	struct pcpu *pc;
+	struct timeval tv;
+	uint32_t i, j;
+	int count, domain;
+	int error __diagused;
+
+	KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL"));
+
+	/* Start threads for each hpts entry */
+	for (i = 0; i < pace->rp_num_hptss; i++) {
+		hpts = pace->rp_ent[i];
+
+		KASSERT(hpts->ie_cookie == NULL,
+		    ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i));
 
 		error = swi_add(&hpts->ie, "hpts",
*** 2014 LINES SKIPPED ***
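
The truncated remainder of the patch carries the new tcp_hpts_test.c suite and, presumably, the teardown counterparts to the functions shown here. What is already visible is the shape that makes those tests possible: tcp_hptsi_create() builds a fully wired instance with no moving parts, and tcp_hptsi_start() is a separate step that arms the threads and timers. A minimal standalone sketch of that two-phase pattern (illustrative only; pacer_create and pacer_start are invented names, not the kernel API):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct pacer {
    	int entries;	/* stands in for rp_num_hptss and friends */
    	bool started;	/* are threads/timers running? */
    };

    /* Phase 1: allocate and wire everything; no threads yet (cf. tcp_hptsi_create). */
    static struct pacer *
    pacer_create(int entries)
    {
    	struct pacer *p = calloc(1, sizeof(*p));

    	if (p != NULL)
    		p->entries = entries;
    	return (p);
    }

    /* Phase 2: only now start the moving parts (cf. tcp_hptsi_start). */
    static void
    pacer_start(struct pacer *p)
    {
    	p->started = true;	/* in the kernel: swi_add() plus the first callout */
    }

    int
    main(void)
    {
    	struct pacer *p = pacer_create(4);

    	if (p == NULL)
    		return (1);
    	/* A test can inspect or mutate state here before anything runs. */
    	printf("entries=%d started=%d\n", p->entries, p->started);
    	pacer_start(p);
    	printf("started=%d\n", p->started);
    	free(p);
    	return (0);
    }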