git: 0d7445193abc - main - tcp: remove tcptw, the compressed timewait state structure

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Fri, 07 Oct 2022 02:35:38 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=0d7445193abc7c68703fd9b6de39f3f6cf6b55c9

commit 0d7445193abc7c68703fd9b6de39f3f6cf6b55c9
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-10-07 02:22:23 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-10-07 02:22:23 +0000

    tcp: remove tcptw, the compressed timewait state structure
    
    The memory savings the tcptw brought back in 2003 (see 340c35de6a2) no
    longer justify the complexity required to maintain it.  For a longer
    explanation, please see the email [1].
    
    Surprisingly, through almost 20 years the TCP stack functionality of
    handling the TIME_WAIT state with a normal tcpcb did not bitrot.  The
    existing tcp_input() properly handles a tcpcb in TCPS_TIME_WAIT state,
    which is confirmed by the packetdrill tcp-testsuite [2].
    
    This change just removes tcptw and leaves INP_TIMEWAIT.  The flag will
    be removed in a separate commit.  This makes it easier to review and
    possibly debug the changes.
    
    [1] https://lists.freebsd.org/archives/freebsd-net/2022-January/001206.html
    [2] https://github.com/freebsd-net/tcp-testsuite
    
    Differential revision:  https://reviews.freebsd.org/D36398
---
 sys/netinet/in_pcb.c          |  22 +-
 sys/netinet/tcp_input.c       |  36 ++-
 sys/netinet/tcp_stacks/bbr.c  |   2 -
 sys/netinet/tcp_stacks/rack.c |   3 +-
 sys/netinet/tcp_subr.c        | 130 +++------
 sys/netinet/tcp_timer.c       |  43 +--
 sys/netinet/tcp_timer.h       |   2 -
 sys/netinet/tcp_timewait.c    | 638 ++++--------------------------------------
 sys/netinet/tcp_var.h         |  25 +-
 sys/netinet/toecore.c         |   4 +-
 sys/netinet6/in6_pcb.c        |  30 +-
 11 files changed, 126 insertions(+), 809 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 7033ad05cab7..9dd6d3d019ca 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1031,7 +1031,6 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
-			struct tcptw *tw;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
@@ -1070,24 +1069,9 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
 			}
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
-			if (t && (t->inp_flags & INP_TIMEWAIT)) {
-				/*
-				 * XXXRW: If an incpb has had its timewait
-				 * state recycled, we treat the address as
-				 * being in use (for now).  This is better
-				 * than a panic, but not desirable.
-				 */
-				tw = intotw(t);
-				if (tw == NULL ||
-				    ((reuseport & tw->tw_so_options) == 0 &&
-					(reuseport_lb &
-				            tw->tw_so_options) == 0)) {
-					return (EADDRINUSE);
-				}
-			} else if (t &&
-				   ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
-				   (reuseport & inp_so_options(t)) == 0 &&
-				   (reuseport_lb & inp_so_options(t)) == 0) {
+			if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+			    (reuseport & inp_so_options(t)) == 0 &&
+			    (reuseport_lb & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 7cd9494e5e4f..c1e1f58e315c 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -999,29 +999,29 @@ findpcb:
 			goto dropunlock;
 	}
 
-	/*
-	 * A previous connection in TIMEWAIT state is supposed to catch stray
-	 * or duplicate segments arriving late.  If this segment was a
-	 * legitimate new connection attempt, the old INPCB gets removed and
-	 * we can try again to find a listening socket.
-	 */
-	if (inp->inp_flags & INP_TIMEWAIT) {
+	tp = intotcpcb(inp);
+	switch (tp->t_state) {
+	case TCPS_TIME_WAIT:
+		/*
+		 * A previous connection in TIMEWAIT state is supposed to catch
+		 * stray or duplicate segments arriving late.  If this segment
+		 * was a legitimate new connection attempt, the old INPCB gets
+		 * removed and we can try again to find a listening socket.
+		 */
 		tcp_dooptions(&to, optp, optlen,
 		    (thflags & TH_SYN) ? TO_SYN : 0);
 		/*
-		 * NB: tcp_twcheck unlocks the INP and frees the mbuf.
+		 * tcp_twcheck unlocks the inp always, and frees the m if fails.
 		 */
 		if (tcp_twcheck(inp, &to, th, m, tlen))
 			goto findpcb;
 		return (IPPROTO_DONE);
-	}
-	/*
-	 * The TCPCB may no longer exist if the connection is winding
-	 * down or it is in the CLOSED state.  Either way we drop the
-	 * segment and send an appropriate response.
-	 */
-	tp = intotcpcb(inp);
-	if (tp == NULL || tp->t_state == TCPS_CLOSED) {
+	case TCPS_CLOSED:
+		/*
+		 * The TCPCB may no longer exist if the connection is winding
+		 * down or it is in the CLOSED state.  Either way we drop the
+		 * segment and send an appropriate response.
+		 */
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
@@ -3030,10 +3030,6 @@ process_ACK:
 				 * Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN
 				 * we'll hang forever.
-				 *
-				 * XXXjl:
-				 * we should release the tp also, and use a
-				 * compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 5c5c1a2f9986..31b5c2cc78dc 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -11359,8 +11359,6 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
-	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
-	    __func__));
 
 	tp->t_rcvtime = ticks;
 	/*
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 813cf0a57c3f..740ec73a17df 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -14154,8 +14154,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
-	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
-	    __func__));
+
 	if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 	    (tp->t_flags & TF_GPUTINPROG)) {
 		/*
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index a1df6677d226..c50f416351c3 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1485,7 +1485,6 @@ tcp_vnet_init(void *arg __unused)
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
 
-	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
 
@@ -1647,7 +1646,6 @@ tcp_destroy(void *unused __unused)
 	}
 	tcp_hc_destroy();
 	syncache_destroy();
-	tcp_tw_destroy();
 	in_pcbinfo_destroy(&V_tcbinfo);
 	/* tcp_discardcb() clears the sack_holes up. */
 	uma_zdestroy(V_sack_hole_zone);
@@ -2678,33 +2676,17 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
 		return (error);
 
 	while ((inp = inp_next(&inpi)) != NULL) {
-		if (inp->inp_gencnt <= xig.xig_gen) {
-			int crerr;
-
-			/*
-			 * XXX: This use of cr_cansee(), introduced with
-			 * TCP state changes, is not quite right, but for
-			 * now, better than nothing.
-			 */
-			if (inp->inp_flags & INP_TIMEWAIT) {
-				if (intotw(inp) != NULL)
-					crerr = cr_cansee(req->td->td_ucred,
-					    intotw(inp)->tw_cred);
-				else
-					crerr = EINVAL;	/* Skip this inp. */
+		if (inp->inp_gencnt <= xig.xig_gen &&
+		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
+			struct xtcpcb xt;
+
+			tcp_inptoxtp(inp, &xt);
+			error = SYSCTL_OUT(req, &xt, sizeof xt);
+			if (error) {
+				INP_RUNLOCK(inp);
+				break;
 			} else
-				crerr = cr_canseeinpcb(req->td->td_ucred, inp);
-			if (crerr == 0) {
-				struct xtcpcb xt;
-
-				tcp_inptoxtp(inp, &xt);
-				error = SYSCTL_OUT(req, &xt, sizeof xt);
-				if (error) {
-					INP_RUNLOCK(inp);
-					break;
-				} else
-					continue;
-			}
+				continue;
 		}
 	}
 
@@ -3639,7 +3621,6 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
-	struct tcptw *tw;
 #ifdef INET
 	struct sockaddr_in *fin = NULL, *lin = NULL;
 #endif
@@ -3721,19 +3702,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS)
 #endif
 	}
 	if (inp != NULL) {
-		if (inp->inp_flags & INP_TIMEWAIT) {
-			/*
-			 * XXXRW: There currently exists a state where an
-			 * inpcb is present, but its timewait state has been
-			 * discarded.  For now, don't allow dropping of this
-			 * type of inpcb.
-			 */
-			tw = intotw(inp);
-			if (tw != NULL)
-				tcp_twclose(tw, 0);
-			else
-				INP_WUNLOCK(inp);
-		} else if ((inp->inp_flags & INP_DROPPED) == 0 &&
+		if ((inp->inp_flags & INP_DROPPED) == 0 &&
 		    !SOLISTENING(inp->inp_socket)) {
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
@@ -4027,56 +3996,49 @@ void
 tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
 {
 	struct tcpcb *tp = intotcpcb(inp);
-	struct tcptw *tw = intotw(inp);
 	sbintime_t now;
 
 	bzero(xt, sizeof(*xt));
-	if (inp->inp_flags & INP_TIMEWAIT) {
-		xt->t_state = TCPS_TIME_WAIT;
-		xt->xt_encaps_port = tw->t_port;
-	} else {
-		xt->t_state = tp->t_state;
-		xt->t_logstate = tp->t_logstate;
-		xt->t_flags = tp->t_flags;
-		xt->t_sndzerowin = tp->t_sndzerowin;
-		xt->t_sndrexmitpack = tp->t_sndrexmitpack;
-		xt->t_rcvoopack = tp->t_rcvoopack;
-		xt->t_rcv_wnd = tp->rcv_wnd;
-		xt->t_snd_wnd = tp->snd_wnd;
-		xt->t_snd_cwnd = tp->snd_cwnd;
-		xt->t_snd_ssthresh = tp->snd_ssthresh;
-		xt->t_dsack_bytes = tp->t_dsack_bytes;
-		xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
-		xt->t_dsack_pack = tp->t_dsack_pack;
-		xt->t_maxseg = tp->t_maxseg;
-		xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 +
-			     (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
-
-		now = getsbinuptime();
-#define	COPYTIMER(ttt)	do {						\
-		if (callout_active(&tp->t_timers->ttt))			\
-			xt->ttt = (tp->t_timers->ttt.c_time - now) /	\
-			    SBT_1MS;					\
-		else							\
-			xt->ttt = 0;					\
+	xt->t_state = tp->t_state;
+	xt->t_logstate = tp->t_logstate;
+	xt->t_flags = tp->t_flags;
+	xt->t_sndzerowin = tp->t_sndzerowin;
+	xt->t_sndrexmitpack = tp->t_sndrexmitpack;
+	xt->t_rcvoopack = tp->t_rcvoopack;
+	xt->t_rcv_wnd = tp->rcv_wnd;
+	xt->t_snd_wnd = tp->snd_wnd;
+	xt->t_snd_cwnd = tp->snd_cwnd;
+	xt->t_snd_ssthresh = tp->snd_ssthresh;
+	xt->t_dsack_bytes = tp->t_dsack_bytes;
+	xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
+	xt->t_dsack_pack = tp->t_dsack_pack;
+	xt->t_maxseg = tp->t_maxseg;
+	xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 +
+		     (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
+
+	now = getsbinuptime();
+#define	COPYTIMER(ttt)	do {					\
+	if (callout_active(&tp->t_timers->ttt))			\
+		xt->ttt = (tp->t_timers->ttt.c_time - now) /	\
+		    SBT_1MS;					\
+	else							\
+		xt->ttt = 0;					\
 } while (0)
-		COPYTIMER(tt_delack);
-		COPYTIMER(tt_rexmt);
-		COPYTIMER(tt_persist);
-		COPYTIMER(tt_keep);
-		COPYTIMER(tt_2msl);
+	COPYTIMER(tt_delack);
+	COPYTIMER(tt_rexmt);
+	COPYTIMER(tt_persist);
+	COPYTIMER(tt_keep);
+	COPYTIMER(tt_2msl);
 #undef COPYTIMER
-		xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
+	xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
 
-		xt->xt_encaps_port = tp->t_port;
-		bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
-		    TCP_FUNCTION_NAME_LEN_MAX);
-		bcopy(CC_ALGO(tp)->name, xt->xt_cc,
-		    TCP_CA_NAME_MAX);
+	xt->xt_encaps_port = tp->t_port;
+	bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
+	    TCP_FUNCTION_NAME_LEN_MAX);
+	bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX);
 #ifdef TCP_BLACKBOX
-		(void)tcp_log_get_id(tp, xt->xt_logid);
+	(void)tcp_log_get_id(tp, xt->xt_logid);
 #endif
-	}
 
 	xt->xt_len = sizeof(struct xtcpcb);
 	in_pcbtoxinpcb(inp, &xt->xt_inp);
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index a3837a7db90c..f4915da6e77c 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -236,41 +236,6 @@ inp_to_cpuid(struct inpcb *inp)
 	}
 }
 
-/*
- * Legacy TCP global callout routine called every 500 ms.
- * Used to cleanup timewait states, which lack their own callouts.
- */
-static struct callout tcpslow_callout;
-static void
-tcp_slowtimo(void *arg __unused)
-{
-	struct epoch_tracker et;
-	VNET_ITERATOR_DECL(vnet_iter);
-
-	NET_EPOCH_ENTER(et);
-	VNET_LIST_RLOCK_NOSLEEP();
-	VNET_FOREACH(vnet_iter) {
-		CURVNET_SET(vnet_iter);
-		(void) tcp_tw_2msl_scan(0);
-		CURVNET_RESTORE();
-	}
-	VNET_LIST_RUNLOCK_NOSLEEP();
-	NET_EPOCH_EXIT(et);
-
-	callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
-	    tcp_slowtimo, NULL, 0);
-}
-
-static void
-tcp_slowtimo_init(void *arg __unused)
-{
-
-        callout_init(&tcpslow_callout, 1);
-	callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
-	    tcp_slowtimo, NULL, 0);
-}
-SYSINIT(tcp_timer, SI_SUB_VNET_DONE, SI_ORDER_ANY, tcp_slowtimo_init, NULL);
-
 int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
 
@@ -387,8 +352,12 @@ tcp_timer_2msl(void *xtp)
 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
 	 * Ignore fact that there were recent incoming segments.
 	 */
-	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
-	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
+	if (tp->t_state == TCPS_TIME_WAIT) {
+		tcp_timer_close(tp);
+		CURVNET_RESTORE();
+		return;
+	} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
+	    tcp_fast_finwait2_recycle && tp->t_inpcb->inp_socket &&
 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		TCPSTAT_INC(tcps_finwait2_drops);
 		tcp_timer_close(tp);
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index bb47ab72a63d..8742098855ab 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -229,8 +229,6 @@ VNET_DECLARE(int, tcp_msl);
 
 void	tcp_timer_init(void);
 void	tcp_timer_2msl(void *xtp);
-struct tcptw *
-	tcp_tw_2msl_scan(int reuse);	/* XXX temporary? */
 void	tcp_timer_keep(void *xtp);
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 5cac0af946a9..6b29335c30f9 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -96,142 +96,26 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
-VNET_DEFINE_STATIC(uma_zone_t, tcptw_zone);
-#define	V_tcptw_zone		VNET(tcptw_zone)
-static int	maxtcptw;
-
-/*
- * The timed wait queue contains references to each of the TCP sessions
- * currently in the TIME_WAIT state.  The queue pointers, including the
- * queue pointers in each tcptw structure, are protected using the global
- * timewait lock, which must be held over queue iteration and modification.
- *
- * Rules on tcptw usage:
- *  - a inpcb is always freed _after_ its tcptw
- *  - a tcptw relies on its inpcb reference counting for memory stability
- *  - a tcptw is dereferenceable only while its inpcb is locked
- */
-VNET_DEFINE_STATIC(TAILQ_HEAD(, tcptw), twq_2msl);
-#define	V_twq_2msl		VNET(twq_2msl)
-
-/* Global timewait lock */
-VNET_DEFINE_STATIC(struct rwlock, tw_lock);
-#define	V_tw_lock		VNET(tw_lock)
-
-#define	TW_LOCK_INIT(tw, d)	rw_init_flags(&(tw), (d), 0)
-#define	TW_LOCK_DESTROY(tw)	rw_destroy(&(tw))
-#define	TW_RLOCK(tw)		rw_rlock(&(tw))
-#define	TW_WLOCK(tw)		rw_wlock(&(tw))
-#define	TW_RUNLOCK(tw)		rw_runlock(&(tw))
-#define	TW_WUNLOCK(tw)		rw_wunlock(&(tw))
-#define	TW_LOCK_ASSERT(tw)	rw_assert(&(tw), RA_LOCKED)
-#define	TW_RLOCK_ASSERT(tw)	rw_assert(&(tw), RA_RLOCKED)
-#define	TW_WLOCK_ASSERT(tw)	rw_assert(&(tw), RA_WLOCKED)
-#define	TW_UNLOCK_ASSERT(tw)	rw_assert(&(tw), RA_UNLOCKED)
-
-static void	tcp_tw_2msl_reset(struct tcptw *, int);
-static void	tcp_tw_2msl_stop(struct tcptw *, int);
-static int	tcp_twrespond(struct tcptw *, int);
-
-static int
-tcptw_auto_size(void)
-{
-	int halfrange;
-
-	/*
-	 * Max out at half the ephemeral port range so that TIME_WAIT
-	 * sockets don't tie up too many ephemeral ports.
-	 */
-	if (V_ipport_lastauto > V_ipport_firstauto)
-		halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2;
-	else
-		halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2;
-	/* Protect against goofy port ranges smaller than 32. */
-	return (imin(imax(halfrange, 32), maxsockets / 5));
-}
-
-static int
-sysctl_maxtcptw(SYSCTL_HANDLER_ARGS)
-{
-	int error, new;
-
-	if (maxtcptw == 0)
-		new = tcptw_auto_size();
-	else
-		new = maxtcptw;
-	error = sysctl_handle_int(oidp, &new, 0, req);
-	if (error == 0 && req->newptr)
-		if (new >= 32) {
-			maxtcptw = new;
-			uma_zone_set_max(V_tcptw_zone, maxtcptw);
-		}
-	return (error);
-}
-
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw,
-    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
-    &maxtcptw, 0, sysctl_maxtcptw, "IU",
-    "Maximum number of compressed TCP TIME_WAIT entries");
-
 VNET_DEFINE_STATIC(bool, nolocaltimewait) = true;
 #define	V_nolocaltimewait	VNET(nolocaltimewait)
-SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW,
-    &VNET_NAME(nolocaltimewait), true,
-    "Do not create compressed TCP TIME_WAIT entries for local connections");
-
-void
-tcp_tw_zone_change(void)
-{
-
-	if (maxtcptw == 0)
-		uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
-}
-
-void
-tcp_tw_init(void)
-{
-
-	V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-	TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
-	if (maxtcptw == 0)
-		uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
-	else
-		uma_zone_set_max(V_tcptw_zone, maxtcptw);
-	TAILQ_INIT(&V_twq_2msl);
-	TW_LOCK_INIT(V_tw_lock, "tcptw");
-}
-
-#ifdef VIMAGE
-void
-tcp_tw_destroy(void)
-{
-	struct tcptw *tw;
-	struct epoch_tracker et;
-
-	NET_EPOCH_ENTER(et);
-	while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
-		tcp_twclose(tw, 0);
-	NET_EPOCH_EXIT(et);
-
-	TW_LOCK_DESTROY(V_tw_lock);
-	uma_zdestroy(V_tcptw_zone);
-}
-#endif
+SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), true,
+    "Do not create TCP TIME_WAIT state for local connections");
 
 /*
  * Move a TCP connection into TIME_WAIT state.
- *    tcbinfo is locked.
  *    inp is locked, and is unlocked before returning.
+ *
+ * This function used to free tcpcb and allocate a compressed TCP time-wait
+ * structure tcptw.  This served well for 20 years but is no longer relevant
+ * on modern machines in the modern internet.  However, the function remains
+ * so that TCP stacks require less modification and we don't burn the bridge
+ * to go back to using compressed time-wait.
  */
 void
 tcp_twstart(struct tcpcb *tp)
 {
-	struct tcptw twlocal, *tw;
 	struct inpcb *inp = tp->t_inpcb;
-	struct socket *so;
-	uint32_t recwin;
-	bool acknow, local;
 #ifdef INET6
 	bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
@@ -243,144 +127,44 @@ tcp_twstart(struct tcpcb *tp)
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("tcp_twstart: "
 	    "(inp->inp_flags & INP_DROPPED) != 0"));
 
-	if (V_nolocaltimewait) {
+	tcp_state_change(tp, TCPS_TIME_WAIT);
+	soisdisconnected(inp->inp_socket);
+
+	if (tp->t_flags & TF_ACKNOW)
+		tcp_output(tp);
+
+	if (V_nolocaltimewait && (
 #ifdef INET6
-		if (isipv6)
-			local = in6_localaddr(&inp->in6p_faddr);
-		else
+	    isipv6 ? in6_localaddr(&inp->in6p_faddr) :
 #endif
 #ifdef INET
-			local = in_localip(inp->inp_faddr);
+	    in_localip(inp->inp_faddr)
 #else
-			local = false;
+	    false
 #endif
-	} else
-		local = false;
-
-	/*
-	 * For use only by DTrace.  We do not reference the state
-	 * after this point so modifying it in place is not a problem.
-	 */
-	tcp_state_change(tp, TCPS_TIME_WAIT);
-
-	if (local)
-		tw = &twlocal;
-	else
-		tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
-	if (tw == NULL) {
-		/*
-		 * Reached limit on total number of TIMEWAIT connections
-		 * allowed. Remove a connection from TIMEWAIT queue in LRU
-		 * fashion to make room for this connection.
-		 * If that fails, use on stack tw at least to be able to
-		 * run through tcp_twrespond() and standard tcpcb discard
-		 * routine.
-		 *
-		 * XXX:  Check if it possible to always have enough room
-		 * in advance based on guarantees provided by uma_zalloc().
-		 */
-		tw = tcp_tw_2msl_scan(1);
-		if (tw == NULL) {
-			tw = &twlocal;
-			local = true;
-		}
-	}
-	/*
-	 * For !local case the tcptw will hold a reference on its inpcb
-	 * until tcp_twclose is called.
-	 */
-	tw->tw_inpcb = inp;
-
-	/*
-	 * Recover last window size sent.
-	 */
-	so = inp->inp_socket;
-	recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
-	    (long)TCP_MAXWIN << tp->rcv_scale);
-	if (recwin < (so->so_rcv.sb_hiwat / 4) &&
-	    recwin < tp->t_maxseg)
-		recwin = 0;
-	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
-	    recwin < (tp->rcv_adv - tp->rcv_nxt))
-		recwin = (tp->rcv_adv - tp->rcv_nxt);
-	tw->last_win = (u_short)(recwin >> tp->rcv_scale);
-
-	/*
-	 * Set t_recent if timestamps are used on the connection.
-	 */
-	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
-	    (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
-		tw->t_recent = tp->ts_recent;
-		tw->ts_offset = tp->ts_offset;
-	} else {
-		tw->t_recent = 0;
-		tw->ts_offset = 0;
-	}
-
-	tw->snd_nxt = tp->snd_nxt;
-	tw->t_port = tp->t_port;
-	tw->rcv_nxt = tp->rcv_nxt;
-	tw->tw_time = 0;
-	tw->tw_flags = tp->t_flags;
-
-/* XXX
- * If this code will
- * be used for fin-wait-2 state also, then we may need
- * a ts_recent from the last segment.
- */
-	acknow = tp->t_flags & TF_ACKNOW;
-
-	/*
-	 * First, discard tcpcb state, which includes stopping its timers and
-	 * freeing it.  tcp_discardcb() used to also release the inpcb, but
-	 * that work is now done in the caller.
-	 *
-	 * Note: soisdisconnected() call used to be made in tcp_discardcb(),
-	 * and might not be needed here any longer.
-	 */
-#ifdef TCPHPTS
-	tcp_hpts_remove(inp);
-#endif
-	tcp_discardcb(tp);
-	soisdisconnected(so);
-	tw->tw_so_options = so->so_options;
-	inp->inp_flags |= INP_TIMEWAIT;
-	if (acknow)
-		tcp_twrespond(tw, TH_ACK);
-	if (local)
-		in_pcbdrop(inp);
-	else {
-		in_pcbref(inp);	/* Reference from tw */
-		tw->tw_cred = crhold(so->so_cred);
-		inp->inp_ppcb = tw;
-		TCPSTATES_INC(TCPS_TIME_WAIT);
-		tcp_tw_2msl_reset(tw, 0);
+	    )) {
+		if ((tp = tcp_close(tp)) != NULL)
+			INP_WUNLOCK(inp);
+		return;
 	}
 
-	/*
-	 * If the inpcb owns the sole reference to the socket, then we can
-	 * detach and free the socket as it is not needed in time wait.
-	 */
-	if (inp->inp_flags & INP_SOCKREF) {
-		inp->inp_flags &= ~INP_SOCKREF;
-		INP_WUNLOCK(inp);
-		sorele(so);
-	} else
-		INP_WUNLOCK(inp);
+	tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl);
+	INP_WUNLOCK(inp);
 }
 
 /*
- * Returns 1 if the TIME_WAIT state was killed and we should start over,
- * looking for a pcb in the listen state.  Returns 0 otherwise.
+ * Returns true if the TIME_WAIT state was killed and we should start over,
+ * looking for a pcb in the listen state.  Otherwise returns false and frees
+ * the mbuf.
  *
  * For pure SYN-segments the PCB shall be read-locked and the tcpopt pointer
  * may be NULL.  For the rest write-lock and valid tcpopt.
  */
-int
+bool
 tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
     struct mbuf *m, int tlen)
 {
-	struct tcptw *tw;
+	struct tcpcb *tp = intotcpcb(inp);
 	char *s;
 	int thflags;
 	tcp_seq seq;
@@ -388,16 +172,6 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 	NET_EPOCH_ASSERT();
 	INP_LOCK_ASSERT(inp);
 
-	/*
-	 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
-	 * still present.  This is undesirable, but temporarily necessary
-	 * until we work out how to handle inpcb's who's timewait state has
-	 * been removed.
-	 */
-	tw = intotw(inp);
-	if (tw == NULL)
-		goto drop;
-
 	thflags = tcp_get_flags(th);
 #ifdef INVARIANTS
 	if ((thflags & (TH_SYN | TH_ACK)) == TH_SYN)
@@ -459,36 +233,37 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 	 * Allow UDP port number changes in this case.
 	 */
 	if (((thflags & (TH_SYN | TH_ACK)) == TH_SYN) &&
-	    SEQ_GT(th->th_seq, tw->rcv_nxt)) {
+	    SEQ_GT(th->th_seq, tp->rcv_nxt)) {
 		/*
 		 * In case we can't upgrade our lock just pretend we have
 		 * lost this packet.
 		 */
 		if (INP_TRY_UPGRADE(inp) == 0)
 			goto drop;
-		tcp_twclose(tw, 0);
+		if ((tp = tcp_close(tp)) != NULL)
+			INP_WUNLOCK(inp);
 		TCPSTAT_INC(tcps_tw_recycles);
-		return (1);
+		return (true);
 	}
 
 	/*
 	 * Send RST if UDP port numbers don't match
 	 */
-	if (tw->t_port != m->m_pkthdr.tcp_tun_port) {
+	if (tp->t_port != m->m_pkthdr.tcp_tun_port) {
 		if (tcp_get_flags(th) & TH_ACK) {
-			tcp_respond(NULL, mtod(m, void *), th, m,
+			tcp_respond(tp, mtod(m, void *), th, m,
 			    (tcp_seq)0, th->th_ack, TH_RST);
 		} else {
 			if (tcp_get_flags(th) & TH_SYN)
 				tlen++;
 			if (tcp_get_flags(th) & TH_FIN)
 				tlen++;
-			tcp_respond(NULL, mtod(m, void *), th, m,
+			tcp_respond(tp, mtod(m, void *), th, m,
 			    th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK);
 		}
 		INP_UNLOCK(inp);
 		TCPSTAT_INC(tcps_tw_resets);
-		return (0);
+		return (false);
 	}
 
 	/*
@@ -505,7 +280,7 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 	 * the segment, unless the missing timestamps are tolerated.
 	 * See section 3.2 of RFC 7323.
 	 */
-	if (((to->to_flags & TOF_TS) == 0) && (tw->t_recent != 0) &&
+	if (((to->to_flags & TOF_TS) == 0) && (tp->ts_recent != 0) &&
 	    (V_tcp_tolerate_missing_ts == 0)) {
 		goto drop;
 	}
@@ -515,344 +290,25 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
 	 */
 	if (thflags & TH_FIN) {
 		seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0);
-		if (seq + 1 == tw->rcv_nxt)
-			tcp_tw_2msl_reset(tw, 1);
+		if (seq + 1 == tp->rcv_nxt)
+			tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl);
 	}
 
 	/*
 	 * Acknowledge the segment if it has data or is not a duplicate ACK.
 	 */
 	if (thflags != TH_ACK || tlen != 0 ||
-	    th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) {
+	    th->th_seq != tp->rcv_nxt || th->th_ack != tp->snd_nxt) {
 		TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
-		tcp_twrespond(tw, TH_ACK);
+		tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+		    tp->snd_nxt, TH_ACK);
+		INP_UNLOCK(inp);
 		TCPSTAT_INC(tcps_tw_responds);
-		goto dropnoprobe;
+		return (false);
 	}
 drop:
 	TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
-dropnoprobe:
 	INP_UNLOCK(inp);
 	m_freem(m);
-	return (0);
-}
-
-void
-tcp_twclose(struct tcptw *tw, int reuse)
-{
-	struct socket *so;
-	struct inpcb *inp;
-
-	/*
-	 * At this point, we are in one of two situations:
-	 *
-	 * (1) We have no socket, just an inpcb<->twtcp pair.  We can free
-	 *     all state.
-	 *
-	 * (2) We have a socket -- if we own a reference, release it and
-	 *     notify the socket layer.
-	 */
-	inp = tw->tw_inpcb;
-	KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
-	KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
-	NET_EPOCH_ASSERT();
-	INP_WLOCK_ASSERT(inp);
-
-	tcp_tw_2msl_stop(tw, reuse);
-	inp->inp_ppcb = NULL;
-	in_pcbdrop(inp);
-
-	so = inp->inp_socket;
-	if (so != NULL) {
-		/*
-		 * If there's a socket, handle two cases: first, we own a
-		 * strong reference, which we will now release, or we don't
-		 * in which case another reference exists (XXXRW: think
-		 * about this more), and we don't need to take action.
-		 */
-		if (inp->inp_flags & INP_SOCKREF) {
-			inp->inp_flags &= ~INP_SOCKREF;
-			INP_WUNLOCK(inp);
-			sorele(so);
-		} else {
-			/*
-			 * If we don't own the only reference, the socket and
-			 * inpcb need to be left around to be handled by
-			 * tcp_usr_detach() later.
-			 */
-			INP_WUNLOCK(inp);
-		}
-	} else {
-		/*
-		 * The socket has been already cleaned-up for us, only free the
-		 * inpcb.
-		 */
-		in_pcbfree(inp);
-	}
-	TCPSTAT_INC(tcps_closed);
-}
-
-static int
-tcp_twrespond(struct tcptw *tw, int flags)
-{
-	struct inpcb *inp = tw->tw_inpcb;
-#if defined(INET6) || defined(INET)
-	struct tcphdr *th = NULL;
-#endif
-	struct mbuf *m;
-#ifdef INET
-	struct ip *ip = NULL;
-#endif
-	u_int hdrlen, optlen, ulen;
-	int error = 0;			/* Keep compiler happy */
-	struct tcpopt to;
-#ifdef INET6
-	struct ip6_hdr *ip6 = NULL;
-	int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
-#endif
-	struct udphdr *udp = NULL;
-	hdrlen = 0;                     /* Keep compiler happy */
-
-	INP_WLOCK_ASSERT(inp);
-
-	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL)
-		return (ENOBUFS);
-	m->m_data += max_linkhdr;
-
-#ifdef MAC
-	mac_inpcb_create_mbuf(inp, m);
-#endif
-
-#ifdef INET6
-	if (isipv6) {
-		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
-		ip6 = mtod(m, struct ip6_hdr *);
-		if (tw->t_port) {
-			udp = (struct udphdr *)(ip6 + 1);
-			hdrlen += sizeof(struct udphdr);
-			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
-			udp->uh_dport = tw->t_port;
-			ulen = (hdrlen - sizeof(struct ip6_hdr));
-			th = (struct tcphdr *)(udp + 1);
-		} else
-			th = (struct tcphdr *)(ip6 + 1);
-		tcpip_fillheaders(inp, tw->t_port, ip6, th);
-	}
-#endif
-#if defined(INET6) && defined(INET)
-	else
-#endif
-#ifdef INET
-	{
-		hdrlen = sizeof(struct tcpiphdr);
-		ip = mtod(m, struct ip *);
-		if (tw->t_port) {
-			udp = (struct udphdr *)(ip + 1);
-			hdrlen += sizeof(struct udphdr);
-			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
-			udp->uh_dport = tw->t_port;
-			ulen = (hdrlen - sizeof(struct ip));
-			th = (struct tcphdr *)(udp + 1);
-		} else
-			th = (struct tcphdr *)(ip + 1);
-		tcpip_fillheaders(inp, tw->t_port, ip, th);
-	}
-#endif
-	to.to_flags = 0;
-
-	/*
-	 * Send a timestamp and echo-reply if both our side and our peer
-	 * have sent timestamps in our SYN's and this is not a RST.
-	 */
-	if (tw->t_recent && flags == TH_ACK) {
-		to.to_flags |= TOF_TS;
-		to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
*** 317 LINES SKIPPED ***