git: 446ccdd08e2a - main - tcp: use single locked callout per tcpcb for the TCP timers

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Wed, 07 Dec 2022 17:53:20 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=446ccdd08e2a9f704f6348cd7f679e59183b99b3

commit 446ccdd08e2a9f704f6348cd7f679e59183b99b3
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-12-07 17:00:48 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-12-07 17:00:48 +0000

    tcp: use single locked callout per tcpcb for the TCP timers
    
    Use only one callout structure per tcpcb that is responsible for handling
    all five TCP timeouts.  Use locked version of callout, of course. The
    callout function tcp_timer_enter() chooses soonest timer and executes it
    with lock held.  Unless the timer reports that the tcpcb has been freed,
    the callout is rescheduled for next soonest timer, if there is any.
    
    With single callout per tcpcb on connection teardown we should be able
    to fully stop the callout and immediately free it, avoiding use of
    callout_async_drain().  There is one gotcha here: callout_stop() can
    actually touch our memory when a rare race condition happens.  See
    comment above tcp_timer_stop().  Synchronous stop of the callout makes
    tcp_discardcb() the single entry point for tcpcb destructor, merging the
    tcp_freecb() to the end of the function.
    
    While here, also remove lots of lingering checks in the beginning of
    TCP timer functions.  With a locked callout they are unnecessary.
    
    While here, clean unused parts of timer KPI for the pluggable TCP stacks.
    
    While here, remove TCPDEBUG from tcp_timer.c, as this allows for more
    simplification of TCP timers.  The TCPDEBUG is scheduled for removal.
    
    Move the DTrace probes in the timers to the beginning of each function,
    where the tcpcb is guaranteed to exist.
    
    Discussed with:         rrs, tuexen, rscheff    (the TCP part of the diff)
    Reviewed by:            hselasky, kib, mav      (the callout part)
    Differential revision:  https://reviews.freebsd.org/D37321
---
 sys/netinet/tcp_stacks/bbr.c  |  31 +--
 sys/netinet/tcp_stacks/rack.c |  44 +---
 sys/netinet/tcp_subr.c        |  97 ++------
 sys/netinet/tcp_timer.c       | 548 ++++++++++++++++--------------------------
 sys/netinet/tcp_timer.h       |  19 --
 sys/netinet/tcp_usrreq.c      |   6 +-
 sys/netinet/tcp_var.h         |  30 +--
 sys/sys/proc.h                |   2 +-
 8 files changed, 250 insertions(+), 527 deletions(-)

diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index edba270c151b..5a1e3de4c416 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -5285,37 +5285,13 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
 	}
 }
 
-static void
-bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type)
+static int
+bbr_stopall(struct tcpcb *tp)
 {
 	struct tcp_bbr *bbr;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	bbr->rc_all_timers_stopped = 1;
-	return;
-}
-
-/*
- * stop all timers always returning 0.
- */
-static int
-bbr_stopall(struct tcpcb *tp)
-{
-	return (0);
-}
-
-static void
-bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
-{
-	return;
-}
-
-/*
- * return true if a bbr timer (rack or tlp) is active.
- */
-static int
-bbr_timer_active(struct tcpcb *tp, uint32_t timer_type)
-{
 	return (0);
 }
 
@@ -14168,9 +14144,6 @@ struct tcp_function_block __tcp_bbr = {
 	.tfb_tcp_fb_init = bbr_init,
 	.tfb_tcp_fb_fini = bbr_fini,
 	.tfb_tcp_timer_stop_all = bbr_stopall,
-	.tfb_tcp_timer_activate = bbr_timer_activate,
-	.tfb_tcp_timer_active = bbr_timer_active,
-	.tfb_tcp_timer_stop = bbr_timer_stop,
 	.tfb_tcp_rexmit_tmr = bbr_remxt_tmr,
 	.tfb_tcp_handoff_ok = bbr_handoff_ok,
 	.tfb_tcp_mtu_chg = bbr_mtu_chg,
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index a93fb18398fe..d8170d05dd07 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -489,10 +489,6 @@ static void rack_remxt_tmr(struct tcpcb *tp);
 static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt);
 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
 static int32_t rack_stopall(struct tcpcb *tp);
-static void
-rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
-    uint32_t delta);
-static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
 static uint32_t
@@ -5910,9 +5906,6 @@ rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	 */
 	struct rack_sendmap *rsm;
 
-	if (tp->tt_flags & TT_STOPPED) {
-		return (1);
-	}
 	counter_u64_add(rack_to_tot, 1);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
@@ -6123,9 +6116,6 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t
 	uint32_t out, avail;
 	int collapsed_win = 0;
 
-	if (tp->tt_flags & TT_STOPPED) {
-		return (1);
-	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		/* Its not time yet */
 		return (0);
@@ -6312,9 +6302,7 @@ out:
 static int
 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
-	if (tp->tt_flags & TT_STOPPED) {
-		return (1);
-	}
+
 	rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
 	tp->t_flags &= ~TF_DELACK;
 	tp->t_flags |= TF_ACKNOW;
@@ -6337,9 +6325,6 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	struct tcptemp *t_template;
 	int32_t retval = 1;
 
-	if (tp->tt_flags & TT_STOPPED) {
-		return (1);
-	}
 	if (rack->rc_in_persist == 0)
 		return (0);
 	if (ctf_progress_timeout_check(tp, false)) {
@@ -6425,9 +6410,6 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	struct tcptemp *t_template;
 	struct inpcb *inp = tptoinpcb(tp);
 
-	if (tp->tt_flags & TT_STOPPED) {
-		return (1);
-	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
 	rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
 	/*
@@ -6654,9 +6636,6 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	int32_t retval = 0;
 	bool isipv6;
 
-	if (tp->tt_flags & TT_STOPPED) {
-		return (1);
-	}
 	if ((tp->t_flags & TF_GPUTINPROG) &&
 	    (tp->t_rxtshift)) {
 		/*
@@ -7060,12 +7039,6 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin
 		rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
 }
 
-static void
-rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
-{
-	return;
-}
-
 static int
 rack_stopall(struct tcpcb *tp)
 {
@@ -7075,18 +7048,6 @@ rack_stopall(struct tcpcb *tp)
 	return (0);
 }
 
-static void
-rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
-{
-	return;
-}
-
-static int
-rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
-{
-	return (0);
-}
-
 static void
 rack_stop_all_timers(struct tcpcb *tp)
 {
@@ -20307,9 +20268,6 @@ static struct tcp_function_block __tcp_rack = {
 	.tfb_tcp_fb_init = rack_init,
 	.tfb_tcp_fb_fini = rack_fini,
 	.tfb_tcp_timer_stop_all = rack_stopall,
-	.tfb_tcp_timer_activate = rack_timer_activate,
-	.tfb_tcp_timer_active = rack_timer_active,
-	.tfb_tcp_timer_stop = rack_timer_stop,
 	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
 	.tfb_tcp_handoff_ok = rack_handoff_ok,
 	.tfb_tcp_mtu_chg = rack_mtu_change,
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index a132d9cf7e96..658e72ffa804 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1194,22 +1194,6 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
 		*num_names = 0;
 		return (EINVAL);
 	}
-	if (blk->tfb_tcp_timer_stop_all ||
-	    blk->tfb_tcp_timer_activate ||
-	    blk->tfb_tcp_timer_active ||
-	    blk->tfb_tcp_timer_stop) {
-		/*
-		 * If you define one timer function you
-		 * must have them all.
-		 */
-		if ((blk->tfb_tcp_timer_stop_all == NULL) ||
-		    (blk->tfb_tcp_timer_activate == NULL) ||
-		    (blk->tfb_tcp_timer_active == NULL) ||
-		    (blk->tfb_tcp_timer_stop == NULL)) {
-			*num_names = 0;
-			return (EINVAL);
-		}
-	}
 
 	if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
 		*num_names = 0;
@@ -2227,12 +2211,9 @@ tcp_newtcpcb(struct inpcb *inp)
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
-	/* Set up our timeouts. */
-	callout_init(&tp->tt_rexmt, 1);
-	callout_init(&tp->tt_persist, 1);
-	callout_init(&tp->tt_keep, 1);
-	callout_init(&tp->tt_2msl, 1);
-	callout_init(&tp->tt_delack, 1);
+	callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
+	for (int i = 0; i < TT_N; i++)
+		tp->t_timers[i] = SBT_MAX;
 
 	switch (V_tcp_do_rfc1323) {
 		case 0:
@@ -2301,13 +2282,6 @@ tcp_newtcpcb(struct inpcb *inp)
 	if (V_tcp_do_lrd)
 		tp->t_flags |= TF_LRD;
 
-	/*
-	 * XXXGL: this self-reference might be pointless.  It will go away
-	 * when the TCP timers are properly locked and could never fire after
-	 * tcp_discardcb().
-	*/
-	in_pcbref(inp);
-
 	return (tp);
 }
 
@@ -2341,32 +2315,15 @@ void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
+	struct socket *so = tptosocket(tp);
+#ifdef INET6
+	bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
+#endif
 
 	INP_WLOCK_ASSERT(inp);
 
-	/*
-	 * Make sure that all of our timers are stopped before we delete the
-	 * PCB.
-	 *
-	 * If stopping a timer fails, we schedule a discard function in same
-	 * callout, and the last discard function called will take care of
-	 * deleting the tcpcb.
-	 */
-	tp->tt_draincnt = 0;
-	tcp_timer_stop(tp, TT_REXMT);
-	tcp_timer_stop(tp, TT_PERSIST);
-	tcp_timer_stop(tp, TT_KEEP);
-	tcp_timer_stop(tp, TT_2MSL);
-	tcp_timer_stop(tp, TT_DELACK);
+	tcp_timer_stop(tp);
 	if (tp->t_fb->tfb_tcp_timer_stop_all) {
-		/*
-		 * Call the stop-all function of the methods,
-		 * this function should call the tcp_timer_stop()
-		 * method with each of the function specific timeouts.
-		 * That stop will be called via the tfb_tcp_timer_stop()
-		 * which should use the async drain function of the
-		 * callout system (see tcp_var.h).
-		 */
 		tp->t_fb->tfb_tcp_timer_stop_all(tp);
 	}
 
@@ -2402,23 +2359,7 @@ tcp_discardcb(struct tcpcb *tp)
 #endif
 
 	CC_ALGO(tp) = NULL;
-	if (tp->tt_draincnt == 0)
-		tcp_freecb(tp);
-}
 
-bool
-tcp_freecb(struct tcpcb *tp)
-{
-	struct inpcb *inp = tptoinpcb(tp);
-	struct socket *so = tptosocket(tp);
-#ifdef INET6
-	bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
-#endif
-
-	INP_WLOCK_ASSERT(inp);
-	MPASS(tp->tt_draincnt == 0);
-
-	/* We own the last reference on tcpcb, let's free it. */
 #ifdef TCP_BLACKBOX
 	tcp_log_tcpcbfini(tp);
 #endif
@@ -2489,8 +2430,6 @@ tcp_freecb(struct tcpcb *tp)
 	}
 
 	refcount_release(&tp->t_fb->tfb_refcnt);
-
-	return (in_pcbrele_wlocked(inp));
 }
 
 /*
@@ -3940,17 +3879,17 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
 		     (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
 
 	now = getsbinuptime();
-#define	COPYTIMER(ttt)	do {					\
-	if (callout_active(&tp->ttt))				\
-		xt->ttt = (tp->ttt.c_time - now) / SBT_1MS;	\
-	else							\
-		xt->ttt = 0;					\
+#define	COPYTIMER(which,where)	do {					\
+	if (tp->t_timers[which] != SBT_MAX)				\
+		xt->where = (tp->t_timers[which] - now) / SBT_1MS;	\
+	else								\
+		xt->where = 0;						\
 } while (0)
-	COPYTIMER(tt_delack);
-	COPYTIMER(tt_rexmt);
-	COPYTIMER(tt_persist);
-	COPYTIMER(tt_keep);
-	COPYTIMER(tt_2msl);
+	COPYTIMER(TT_DELACK, tt_delack);
+	COPYTIMER(TT_REXMT, tt_rexmt);
+	COPYTIMER(TT_PERSIST, tt_persist);
+	COPYTIMER(TT_KEEP, tt_keep);
+	COPYTIMER(TT_2MSL, tt_2msl);
 #undef COPYTIMER
 	xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
 
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index d67a062eab5b..8c94218ec9d9 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -243,104 +243,86 @@ int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
 
 /*
  * TCP timer processing.
+ *
+ * Each connection has 5 timers associated with it, which can be scheduled
+ * simultaneously.  They all are serviced by one callout tcp_timer_enter().
+ * This function executes the next timer via tcp_timersw[] vector.  Each
+ * timer is supposed to return 'true' unless the connection was destroyed.
+ * In the former case tcp_timer_enter() will schedule callout for next timer.
  */
 
-void
-tcp_timer_delack(void *xtp)
-{
-	struct epoch_tracker et;
-	struct tcpcb *tp = xtp;
-	struct inpcb *inp = tptoinpcb(tp);
-
-	INP_WLOCK(inp);
-	CURVNET_SET(inp->inp_vnet);
-
-	if (callout_pending(&tp->tt_delack) ||
-	    !callout_active(&tp->tt_delack)) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	callout_deactivate(&tp->tt_delack);
-	if ((inp->inp_flags & INP_DROPPED) != 0) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	tp->t_flags |= TF_ACKNOW;
-	TCPSTAT_INC(tcps_delack);
-	NET_EPOCH_ENTER(et);
-	(void) tcp_output_unlock(tp);
-	NET_EPOCH_EXIT(et);
-	CURVNET_RESTORE();
-}
+typedef bool tcp_timer_t(struct tcpcb *);
+static tcp_timer_t tcp_timer_delack;
+static tcp_timer_t tcp_timer_2msl;
+static tcp_timer_t tcp_timer_keep;
+static tcp_timer_t tcp_timer_persist;
+static tcp_timer_t tcp_timer_rexmt;
+
+static tcp_timer_t * const tcp_timersw[TT_N] = {
+	[TT_DELACK] = tcp_timer_delack,
+	[TT_REXMT] = tcp_timer_rexmt,
+	[TT_PERSIST] = tcp_timer_persist,
+	[TT_KEEP] = tcp_timer_keep,
+	[TT_2MSL] = tcp_timer_2msl,
+};
 
 /*
- * Call tcp_close() from a callout context.
+ * tcp_output_locked() is a timer specific variation of call to tcp_output(),
+ * see tcp_var.h for the rest.  It handles drop request from advanced stacks,
+ * but keeps tcpcb locked unless tcp_drop() destroyed it.
+ * Returns true if tcpcb is valid and locked.
  */
-static void
-tcp_timer_close(struct tcpcb *tp)
+static inline bool
+tcp_output_locked(struct tcpcb *tp)
 {
-	struct epoch_tracker et;
-	struct inpcb *inp = tptoinpcb(tp);
+	int rv;
 
-	INP_WLOCK_ASSERT(inp);
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
-	NET_EPOCH_ENTER(et);
-	tp = tcp_close(tp);
-	NET_EPOCH_EXIT(et);
-	if (tp != NULL)
-		INP_WUNLOCK(inp);
+	if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) {
+		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
+		    ("TCP stack %s requested tcp_drop(%p)",
+		    tp->t_fb->tfb_tcp_block_name, tp));
+		tp = tcp_drop(tp, rv);
+	}
+
+	return (tp != NULL);
 }
 
-/*
- * Call tcp_drop() from a callout context.
- */
-static void
-tcp_timer_drop(struct tcpcb *tp)
+static bool
+tcp_timer_delack(struct tcpcb *tp)
 {
 	struct epoch_tracker et;
+#if defined(INVARIANTS) || defined(VIMAGE)
 	struct inpcb *inp = tptoinpcb(tp);
+#endif
+	bool rv;
 
 	INP_WLOCK_ASSERT(inp);
 
+	CURVNET_SET(inp->inp_vnet);
+	tp->t_flags |= TF_ACKNOW;
+	TCPSTAT_INC(tcps_delack);
 	NET_EPOCH_ENTER(et);
-	tp = tcp_drop(tp, ETIMEDOUT);
+	rv = tcp_output_locked(tp);
 	NET_EPOCH_EXIT(et);
-	if (tp != NULL)
-		INP_WUNLOCK(inp);
+	CURVNET_RESTORE();
+
+	return (rv);
 }
 
-void
-tcp_timer_2msl(void *xtp)
+static bool
+tcp_timer_2msl(struct tcpcb *tp)
 {
-	struct tcpcb *tp = xtp;
 	struct inpcb *inp = tptoinpcb(tp);
-#ifdef TCPDEBUG
-	int ostate;
+	bool close = false;
 
-	ostate = tp->t_state;
-#endif
+	INP_WLOCK_ASSERT(inp);
 
-	INP_WLOCK(inp);
+	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	CURVNET_SET(inp->inp_vnet);
-
 	tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
 	tcp_free_sackholes(tp);
-	if (callout_pending(&tp->tt_2msl) ||
-	    !callout_active(&tp->tt_2msl)) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	callout_deactivate(&tp->tt_2msl);
-	if (inp->inp_flags & INP_DROPPED) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	KASSERT((tp->tt_flags & TT_STOPPED) == 0,
-		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	/*
 	 * 2 MSL timeout in shutdown went off.  If we're closed but
 	 * still waiting for peer to close and connection has been idle
@@ -354,69 +336,41 @@ tcp_timer_2msl(void *xtp)
 	 * XXXGL: check if inp_socket shall always be !NULL here?
 	 */
 	if (tp->t_state == TCPS_TIME_WAIT) {
-		tcp_timer_close(tp);
-		CURVNET_RESTORE();
-		return;
+		close = true;
 	} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
 	    tcp_fast_finwait2_recycle && inp->inp_socket &&
 	    (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		TCPSTAT_INC(tcps_finwait2_drops);
-		tcp_timer_close(tp);
-		CURVNET_RESTORE();
-		return;
+		close = true;
 	} else {
-		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
-			callout_reset(&tp->tt_2msl,
-				      TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
-		} else {
-			tcp_timer_close(tp);
-			CURVNET_RESTORE();
-			return;
-		}
+		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
+			tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp));
+		else
+			close = true;
 	}
+	if (close) {
+		struct epoch_tracker et;
 
-#ifdef TCPDEBUG
-	if (tptosocket(tp)->so_options & SO_DEBUG)
-		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
-			  PRU_SLOWTIMO);
-#endif
-	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-
-	INP_WUNLOCK(inp);
+		NET_EPOCH_ENTER(et);
+		tp = tcp_close(tp);
+		NET_EPOCH_EXIT(et);
+	}
 	CURVNET_RESTORE();
+
+	return (tp != NULL);
 }
 
-void
-tcp_timer_keep(void *xtp)
+static bool
+tcp_timer_keep(struct tcpcb *tp)
 {
 	struct epoch_tracker et;
-	struct tcpcb *tp = xtp;
 	struct inpcb *inp = tptoinpcb(tp);
 	struct tcptemp *t_template;
-#ifdef TCPDEBUG
-	int ostate;
 
-	ostate = tp->t_state;
-#endif
+	INP_WLOCK_ASSERT(inp);
 
-	INP_WLOCK(inp);
+	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	CURVNET_SET(inp->inp_vnet);
-
-	if (callout_pending(&tp->tt_keep) ||
-	    !callout_active(&tp->tt_keep)) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	callout_deactivate(&tp->tt_keep);
-	if (inp->inp_flags & INP_DROPPED) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	KASSERT((tp->tt_flags & TT_STOPPED) == 0,
-		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
-
 	/*
 	 * Because we don't regularly reset the keepalive callout in
 	 * the ESTABLISHED state, it may be that we don't actually need
@@ -428,11 +382,10 @@ tcp_timer_keep(void *xtp)
 
 		idletime = ticks - tp->t_rcvtime;
 		if (idletime < TP_KEEPIDLE(tp)) {
-			callout_reset(&tp->tt_keep,
-			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
-			INP_WUNLOCK(inp);
+			tcp_timer_activate(tp, TT_KEEP,
+			    TP_KEEPIDLE(tp) - idletime);
 			CURVNET_RESTORE();
-			return;
+			return (true);
 		}
 	}
 
@@ -470,38 +423,22 @@ tcp_timer_keep(void *xtp)
 			NET_EPOCH_EXIT(et);
 			free(t_template, M_TEMP);
 		}
-		callout_reset(&tp->tt_keep, TP_KEEPINTVL(tp),
-			      tcp_timer_keep, tp);
+		tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp));
 	} else
-		callout_reset(&tp->tt_keep, TP_KEEPIDLE(tp),
-			      tcp_timer_keep, tp);
+		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
 
-#ifdef TCPDEBUG
-	if (inp->inp_socket->so_options & SO_DEBUG)
-		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
-			  PRU_SLOWTIMO);
-#endif
-	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
-	return;
+	return (true);
 
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
 	NET_EPOCH_ENTER(et);
 	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
 	tp = tcp_drop(tp, ETIMEDOUT);
-
-#ifdef TCPDEBUG
-	if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG))
-		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
-			  PRU_SLOWTIMO);
-#endif
-	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	NET_EPOCH_EXIT(et);
-	if (tp != NULL)
-		INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
+
+	return (tp != NULL);
 }
 
 /*
@@ -529,37 +466,19 @@ tcp_maxunacktime_check(struct tcpcb *tp)
 	return true;
 }
 
-void
-tcp_timer_persist(void *xtp)
+static bool
+tcp_timer_persist(struct tcpcb *tp)
 {
 	struct epoch_tracker et;
-	struct tcpcb *tp = xtp;
+#if defined(INVARIANTS) || defined(VIMAGE)
 	struct inpcb *inp = tptoinpcb(tp);
-	bool progdrop;
-	int outrv;
-#ifdef TCPDEBUG
-	int ostate;
-
-	ostate = tp->t_state;
 #endif
+	bool progdrop, rv;
 
-	INP_WLOCK(inp);
-	CURVNET_SET(inp->inp_vnet);
+	INP_WLOCK_ASSERT(inp);
 
-	if (callout_pending(&tp->tt_persist) ||
-	    !callout_active(&tp->tt_persist)) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	callout_deactivate(&tp->tt_persist);
-	if (inp->inp_flags & INP_DROPPED) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	KASSERT((tp->tt_flags & TT_STOPPED) == 0,
-		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
+	CURVNET_SET(inp->inp_vnet);
 	/*
 	 * Persistence timer into zero window.
 	 * Force a byte to be output, if possible.
@@ -581,9 +500,7 @@ tcp_timer_persist(void *xtp)
 		if (!progdrop)
 			TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
-		tcp_timer_drop(tp);
-		CURVNET_RESTORE();
-		return;
+		goto dropit;
 	}
 	/*
 	 * If the user has closed the socket then drop a persisting
@@ -593,57 +510,39 @@ tcp_timer_persist(void *xtp)
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
-		tcp_timer_drop(tp);
-		CURVNET_RESTORE();
-		return;
+		goto dropit;
 	}
 	tcp_setpersist(tp);
 	tp->t_flags |= TF_FORCEDATA;
 	NET_EPOCH_ENTER(et);
-	outrv = tcp_output_nodrop(tp);
-	tp->t_flags &= ~TF_FORCEDATA;
+	if ((rv = tcp_output_locked(tp)))
+		tp->t_flags &= ~TF_FORCEDATA;
+	NET_EPOCH_EXIT(et);
+	CURVNET_RESTORE();
 
-#ifdef TCPDEBUG
-	if (tp != NULL && tptosocket(tp)->so_options & SO_DEBUG)
-		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
-#endif
-	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	(void) tcp_unlock_or_drop(tp, outrv);
+	return (rv);
+
+dropit:
+	NET_EPOCH_ENTER(et);
+	tp = tcp_drop(tp, ETIMEDOUT);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
+
+	return (tp != NULL);
 }
 
-void
-tcp_timer_rexmt(void * xtp)
+static bool
+tcp_timer_rexmt(struct tcpcb *tp)
 {
 	struct epoch_tracker et;
-	struct tcpcb *tp = xtp;
 	struct inpcb *inp = tptoinpcb(tp);
-	int rexmt, outrv;
-	bool isipv6;
-#ifdef TCPDEBUG
-	int ostate;
+	int rexmt;
+	bool isipv6, rv;
 
-	ostate = tp->t_state;
-#endif
+	INP_WLOCK_ASSERT(inp);
 
-	INP_WLOCK(inp);
+	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	CURVNET_SET(inp->inp_vnet);
-
-	if (callout_pending(&tp->tt_rexmt) ||
-	    !callout_active(&tp->tt_rexmt)) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	callout_deactivate(&tp->tt_rexmt);
-	if (inp->inp_flags & INP_DROPPED) {
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-		return;
-	}
-	KASSERT((tp->tt_flags & TT_STOPPED) == 0,
-		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	tcp_free_sackholes(tp);
 	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
 	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
@@ -664,9 +563,12 @@ tcp_timer_rexmt(void * xtp)
 			TCPSTAT_INC(tcps_timeoutdrop);
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
-		tcp_timer_drop(tp);
+		NET_EPOCH_ENTER(et);
+		tp = tcp_drop(tp, ETIMEDOUT);
+		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
-		return;
+
+		return (tp != NULL);
 	}
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
@@ -883,159 +785,131 @@ tcp_timer_rexmt(void * xtp)
 
 	cc_cong_signal(tp, NULL, CC_RTO);
 	NET_EPOCH_ENTER(et);
-	outrv = tcp_output_nodrop(tp);
-#ifdef TCPDEBUG
-	if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG))
-		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
-			  PRU_SLOWTIMO);
-#endif
-	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	(void) tcp_unlock_or_drop(tp, outrv);
+	rv = tcp_output_locked(tp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
+
+	return (rv);
+}
+
+static inline tt_which
+tcp_timer_next(struct tcpcb *tp, sbintime_t *precision)
+{
+	tt_which i, rv;
+	sbintime_t after, before;
+
+	for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) {
+		if (tp->t_timers[i] < after) {
+			after = tp->t_timers[i];
+			rv = i;
+		}
+		before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]);
+	}
+	if (precision != NULL)
+		*precision = before - after;
+
+	return (rv);
+}
+
+static void
+tcp_timer_enter(void *xtp)
+{
+	struct tcpcb *tp = xtp;
+	struct inpcb *inp = tptoinpcb(tp);
+	sbintime_t precision;
+	tt_which which;
+
+	INP_WLOCK_ASSERT(inp);
+	MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0);
+
+	curthread->td_pflags |= TDP_INTCPCALLOUT;
+
+	which = tcp_timer_next(tp, NULL);
+	MPASS(which < TT_N);
+	tp->t_timers[which] = SBT_MAX;
+	tp->t_precisions[which] = 0;
+
+	if (tcp_timersw[which](tp)) {
+		if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
+			callout_reset_sbt_on(&tp->t_callout,
+			    tp->t_timers[which], precision, tcp_timer_enter,
+			    tp, inp_to_cpuid(inp), C_ABSOLUTE);
+		}
+		INP_WUNLOCK(inp);
+	}
+
+	curthread->td_pflags &= ~TDP_INTCPCALLOUT;
 }
 
+/*
+ * Activate or stop (delta == 0) a TCP timer.
+ */
 void
-tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta)
 {
-	struct callout *t_callout;
-	callout_func_t *f_callout;
 	struct inpcb *inp = tptoinpcb(tp);
-	int cpu = inp_to_cpuid(inp);
+	sbintime_t precision;
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return;
 #endif
 
-	if (tp->tt_flags & TT_STOPPED)
-		return;
+	INP_WLOCK_ASSERT(inp);
 
-	switch (timer_type) {
-		case TT_DELACK:
-			t_callout = &tp->tt_delack;
-			f_callout = tcp_timer_delack;
-			break;
-		case TT_REXMT:
-			t_callout = &tp->tt_rexmt;
-			f_callout = tcp_timer_rexmt;
-			break;
-		case TT_PERSIST:
-			t_callout = &tp->tt_persist;
-			f_callout = tcp_timer_persist;
-			break;
-		case TT_KEEP:
-			t_callout = &tp->tt_keep;
-			f_callout = tcp_timer_keep;
-			break;
-		case TT_2MSL:
-			t_callout = &tp->tt_2msl;
-			f_callout = tcp_timer_2msl;
-			break;
-		default:
-			if (tp->t_fb->tfb_tcp_timer_activate) {
-				tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
-				return;
-			}
-			panic("tp %p bad timer_type %#x", tp, timer_type);
-		}
-	if (delta == 0) {
-		callout_stop(t_callout);
-	} else {
-		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
-	}
-}
+	if (delta > 0)
+		callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
+		    &tp->t_timers[which], &tp->t_precisions[which]);
+	else
+		tp->t_timers[which] = SBT_MAX;
 
-int
-tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
-{
-	struct callout *t_callout;
-
-	switch (timer_type) {
-		case TT_DELACK:
-			t_callout = &tp->tt_delack;
-			break;
-		case TT_REXMT:
-			t_callout = &tp->tt_rexmt;
-			break;
-		case TT_PERSIST:
-			t_callout = &tp->tt_persist;
-			break;
-		case TT_KEEP:
-			t_callout = &tp->tt_keep;
-			break;
-		case TT_2MSL:
-			t_callout = &tp->tt_2msl;
-			break;
-		default:
*** 245 LINES SKIPPED ***