git: f64dc2ab5be3 - main - tcp: TCP output method can request tcp_drop

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Sun, 26 Dec 2021 16:49:33 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=f64dc2ab5be38e5366271ef85ea90d8cb1c7841a

commit f64dc2ab5be38e5366271ef85ea90d8cb1c7841a
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2021-12-26 16:48:19 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2021-12-26 16:48:19 +0000

    tcp: TCP output method can request tcp_drop
    
    The advanced TCP stacks (bbr, rack) may decide to drop a TCP connection
    when they do output on it.  The default stack never does this, thus
    existing framework expects tcp_output() always to return locked and
    valid tcpcb.
    
    Provide KPI extension to satisfy demands of advanced stacks.  If the
    output method returns negative error code, it means that caller must
    call tcp_drop().
    
    In tcp_var() provide three inline methods to call tcp_output():
    - tcp_output() is a drop-in replacement for the default stack, so that
      default stack can continue using it internally without modifications.
      For advanced stacks it would perform tcp_drop() and unlock and report
      that with negative error code.
    - tcp_output_unlock() handles the negative code and always converts
      it to positive and always unlocks.
    - tcp_output_nodrop() just calls the method and leaves the responsibility
      to drop on the caller.
    
    Sweep over the advanced stacks and use new KPI instead of using HPTS
    delayed drop queue for that.
    
    Reviewed by:            rrs, tuexen
    Differential revision:  https://reviews.freebsd.org/D33370
---
 sys/dev/cxgbe/tom/t4_cpl_io.c |   6 +++
 sys/netinet/tcp_hpts.c        |   2 +
 sys/netinet/tcp_stacks/bbr.c  |  52 ++++++++++-----------
 sys/netinet/tcp_stacks/rack.c |  49 +++++++++++---------
 sys/netinet/tcp_subr.c        |  19 ++++----
 sys/netinet/tcp_timer.c       |  18 ++++----
 sys/netinet/tcp_usrreq.c      |  47 ++++++++++---------
 sys/netinet/tcp_var.h         | 102 ++++++++++++++++++++++++++++++++++++++++--
 sys/netinet/toecore.c         |   3 +-
 9 files changed, 210 insertions(+), 88 deletions(-)

diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index ae22b3d5c252..56bb7910b5c1 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -2300,6 +2300,12 @@ sendanother:
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tcp_output(tp);
+		if (error < 0) {
+			INP_UNLOCK_ASSERT(inp);
+			SOCK_IO_SEND_UNLOCK(so);
+			error = -error;
+			goto out;
+		}
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 745ebe9f3aaa..40ae9a5cc3ad 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -1550,6 +1550,8 @@ again:
 			}
 			inp->inp_hpts_calls = 1;
 			error = tcp_output(tp);
+			if (error < 0)
+				goto skip_pacing;
 			inp->inp_hpts_calls = 0;
 			if (ninp && ninp->inp_ppcb) {
 				/*
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index f79933d54f24..ada278d629d0 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -4580,8 +4580,7 @@ bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	}
 	if (ctf_progress_timeout_check(tp, true)) {
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
-		return (1);
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/* Did we somehow get into persists? */
 	if (bbr->rc_in_persist) {
@@ -4773,8 +4772,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	 */
 	if (ctf_progress_timeout_check(tp, true)) {
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
-		goto out;
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not time out if the
@@ -4787,8 +4785,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
-		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
-		goto out;
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) &&
 	    tp->snd_una == tp->snd_max) {
@@ -4804,8 +4801,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
-		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
-		goto out;
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	t_template = tcpip_maketemplate(bbr->rc_inp);
 	if (t_template) {
@@ -4877,8 +4873,7 @@ bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 dropit:
 	KMOD_TCPSTAT_INC(tcps_keepdrops);
 	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
-	tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
-	return (1);
+	return (-ETIMEDOUT);	/* tcp_drop() */
 }
 
 /*
@@ -4998,10 +4993,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	 * and retransmit one segment.
 	 */
 	if (ctf_progress_timeout_check(tp, true)) {
-		retval = 1;
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
-		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
-		goto out;
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	bbr_remxt_tmr(tp);
 	if ((bbr->r_ctl.rc_resend == NULL) ||
@@ -5017,11 +5010,11 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		KMOD_TCPSTAT_INC(tcps_timeoutdrop);
-		retval = 1;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
-		tcp_set_inp_to_drop(bbr->rc_inp,
-		    (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
-		goto out;
+		/* XXXGL: previously t_softerror was casted to uint16_t */
+		MPASS(tp->t_softerror >= 0);
+		retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
+		return (retval);	/* tcp_drop() */
 	}
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
@@ -5194,7 +5187,7 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	tp->snd_recover = tp->snd_max;
 	tp->t_flags |= TF_ACKNOW;
 	tp->t_rtttime = 0;
-out:
+
 	return (retval);
 }
 
@@ -11637,7 +11630,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 			if (bbr->r_wanted_output != 0) {
 				bbr->rc_output_starts_timer = 0;
 				did_out = 1;
-				(void)tcp_output(tp);
+				if (tcp_output(tp) < 0)
+					return (1);
 			} else
 				bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0);
 		}
@@ -11676,7 +11670,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 							/* We are late */
 							bbr->r_ctl.rc_last_delay_val = 0;
 							BBR_STAT_INC(bbr_force_output);
-							(void)tcp_output(tp);
+							if (tcp_output(tp) < 0)
+								return (1);
 						}
 					}
 				}
@@ -12163,9 +12158,16 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
 	hpts_calling = inp->inp_hpts_calls;
 	inp->inp_hpts_calls = 0;
 	if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
-		if (bbr_process_timers(tp, bbr, cts, hpts_calling)) {
+		int retval;
+
+		retval = bbr_process_timers(tp, bbr, cts, hpts_calling);
+		if (retval != 0) {
 			counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1);
-			return (0);
+			/*
+			 * If timers want tcp_drop(), then pass error out,
+			 * otherwise suppress it.
+			 */
+			return (retval < 0 ? retval : 0);
 		}
 	}
 	bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
@@ -13234,10 +13236,9 @@ send:
 					 * is the only thing to do.
 					 */
 					BBR_STAT_INC(bbr_offset_drop);
-					tcp_set_inp_to_drop(inp, EFAULT);
 					SOCKBUF_UNLOCK(sb);
 					(void)m_free(m);
-					return (0);
+					return (-EFAULT); /* tcp_drop() */
 				}
 				len = rsm->r_end - rsm->r_start;
 			}
@@ -13891,7 +13892,7 @@ nomore:
 			bbr->oerror_cnt++;
 		if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) {
 			/* drop the session */
-			tcp_set_inp_to_drop(inp, ENETDOWN);
+			return (-ENETDOWN);
 		}
 		switch (error) {
 		case ENOBUFS:
@@ -14238,6 +14239,7 @@ struct tcp_function_block __tcp_bbr = {
 	.tfb_tcp_handoff_ok = bbr_handoff_ok,
 	.tfb_tcp_mtu_chg = bbr_mtu_chg,
 	.tfb_pru_options = bbr_pru_options,
+	.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
 };
 
 /*
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index cd198e11babe..204533b3b778 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -6319,8 +6319,7 @@ rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t
 	}
 	if (ctf_progress_timeout_check(tp, true)) {
 		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
-		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
-		return (1);
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/*
 	 * A TLP timer has expired. We have been idle for 2 rtts. So we now
@@ -6538,9 +6537,8 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	if (ctf_progress_timeout_check(tp, false)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
-		tcp_set_inp_to_drop(inp, ETIMEDOUT);
 		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
-		return (1);
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	/*
@@ -6558,10 +6556,9 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
-		retval = 1;
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
-		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
+		retval = -ETIMEDOUT;	/* tcp_drop() */
 		goto out;
 	}
 	if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6574,11 +6571,10 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
-		retval = 1;
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
-		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
+		retval = -ETIMEDOUT;	/* tcp_drop() */
 		goto out;
 	}
 	t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6669,8 +6665,7 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 dropit:
 	KMOD_TCPSTAT_INC(tcps_keepdrops);
 	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
-	tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
-	return (1);
+	return (-ETIMEDOUT);	/* tcp_drop() */
 }
 
 /*
@@ -6874,8 +6869,7 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	if (ctf_progress_timeout_check(tp, false)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
-		tcp_set_inp_to_drop(inp, ETIMEDOUT);
-		return (1);
+		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
 	rack->r_ctl.retran_during_recovery = 0;
@@ -6944,10 +6938,10 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 drop_it:
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		KMOD_TCPSTAT_INC(tcps_timeoutdrop);
-		retval = 1;
-		tcp_set_inp_to_drop(rack->rc_inp,
-		    (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
-		goto out;
+		/* XXXGL: previously t_softerror was casted to uint16_t */
+		MPASS(tp->t_softerror >= 0);
+		retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
+		goto out;	/* tcp_drop() */
 	}
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
@@ -14164,7 +14158,12 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 	ctf_calc_rwin(so, tp);
 	if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
 	send_out_a_rst:
-		(void)tcp_output(tp);
+		if (tcp_output(tp) < 0) {
+#ifdef TCP_ACCOUNTING
+			sched_unpin();
+#endif
+			return (1);
+		}
 		did_out = 1;
 	}
 	rack_free_trim(rack);
@@ -14649,8 +14648,9 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (nxt_pkt == 0) {
 			if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
 do_output_now:
+				if (tcp_output(tp) < 0)
+					return (1);
 				did_out = 1;
-				(void)tcp_output(tp);
 			}
 			rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
 			rack_free_trim(rack);
@@ -16877,12 +16877,20 @@ rack_output(struct tcpcb *tp)
 	}
 	/* Do the timers, which may override the pacer */
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
-		if (rack_process_timers(tp, rack, cts, hpts_calling, &doing_tlp)) {
+		int retval;
+
+		retval = rack_process_timers(tp, rack, cts, hpts_calling,
+		    &doing_tlp);
+		if (retval != 0) {
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
 #ifdef TCP_ACCOUNTING
 			sched_unpin();
 #endif
-			return (0);
+			/*
+			 * If timers want tcp_drop(), then pass error out,
+			 * otherwise suppress it.
+			 */
+			return (retval < 0 ? retval : 0);
 		}
 	}
 	if (rack->rc_in_persist) {
@@ -20393,6 +20401,7 @@ static struct tcp_function_block __tcp_rack = {
 	.tfb_tcp_mtu_chg = rack_mtu_change,
 	.tfb_pru_options = rack_pru_options,
 	.tfb_hwtls_change = rack_hw_tls_change,
+	.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
 };
 
 /*
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 15faca076001..dde8616dcec6 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -375,7 +375,7 @@ static void	tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
 static int	tcp_default_handoff_ok(struct tcpcb *tp);
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
-static void tcp_mtudisc(struct inpcb *, int);
+static struct inpcb *tcp_mtudisc(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    void *ip4hdr, const void *ip6hdr);
 
@@ -2398,7 +2398,8 @@ tcp_drop(struct tcpcb *tp, int errno)
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tcp_state_change(tp, TCPS_CLOSED);
-		(void) tcp_output(tp);
+		/* Don't use tcp_output() here due to possible recursion. */
+		(void)tcp_output_nodrop(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
@@ -3026,7 +3027,7 @@ tcp_ctlinput_with_port(int cmd, struct sockaddr *sa, void *vip, uint16_t port)
 						inc.inc_fibnum =
 						    inp->inp_inc.inc_fibnum;
 						tcp_hc_updatemtu(&inc, mtu);
-						tcp_mtudisc(inp, mtu);
+						inp = tcp_mtudisc(inp, mtu);
 					}
 				} else
 					inp = (*notify)(inp,
@@ -3474,11 +3475,10 @@ static struct inpcb *
 tcp_mtudisc_notify(struct inpcb *inp, int error)
 {
 
-	tcp_mtudisc(inp, -1);
-	return (inp);
+	return (tcp_mtudisc(inp, -1));
 }
 
-static void
+static struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 {
 	struct tcpcb *tp;
@@ -3487,7 +3487,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 	INP_WLOCK_ASSERT(inp);
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
-		return;
+		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
@@ -3517,7 +3517,10 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 		 */
 		tp->t_fb->tfb_tcp_mtu_chg(tp);
 	}
-	tcp_output(tp);
+	if (tcp_output(tp) < 0)
+		return (NULL);
+	else
+		return (inp);
 }
 
 #ifdef INET
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index d41bd9465a08..a3cb16869dc9 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -292,8 +292,7 @@ tcp_timer_delack(void *xtp)
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
 	NET_EPOCH_ENTER(et);
-	(void) tcp_output(tp);
-	INP_WUNLOCK(inp);
+	(void) tcp_output_unlock(tp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
@@ -502,6 +501,7 @@ tcp_timer_persist(void *xtp)
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	struct epoch_tracker et;
+	int outrv;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
@@ -563,8 +563,7 @@ tcp_timer_persist(void *xtp)
 	tcp_setpersist(tp);
 	tp->t_flags |= TF_FORCEDATA;
 	NET_EPOCH_ENTER(et);
-	(void) tcp_output(tp);
-	NET_EPOCH_EXIT(et);
+	outrv = tcp_output_nodrop(tp);
 	tp->t_flags &= ~TF_FORCEDATA;
 
 #ifdef TCPDEBUG
@@ -572,7 +571,8 @@ tcp_timer_persist(void *xtp)
 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	INP_WUNLOCK(inp);
+	(void) tcp_unlock_or_drop(tp, outrv);
+	NET_EPOCH_EXIT(et);
 out:
 	CURVNET_RESTORE();
 }
@@ -582,7 +582,7 @@ tcp_timer_rexmt(void * xtp)
 {
 	struct tcpcb *tp = xtp;
 	CURVNET_SET(tp->t_vnet);
-	int rexmt;
+	int rexmt, outrv;
 	struct inpcb *inp;
 	struct epoch_tracker et;
 	bool isipv6;
@@ -843,15 +843,15 @@ tcp_timer_rexmt(void * xtp)
 
 	cc_cong_signal(tp, NULL, CC_RTO);
 	NET_EPOCH_ENTER(et);
-	(void) tcp_output(tp);
-	NET_EPOCH_EXIT(et);
+	outrv = tcp_output_nodrop(tp);
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	INP_WUNLOCK(inp);
+	(void) tcp_unlock_or_drop(tp, outrv);
+	NET_EPOCH_EXIT(et);
 out:
 	CURVNET_RESTORE();
 }
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 39705c3a9b54..55071a46e470 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -594,6 +594,8 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 #endif
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	error = tcp_output(tp);
+	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()",
+	    tp->t_fb->tfb_tcp_block_name, tp));
 out_in_epoch:
 	NET_EPOCH_EXIT(et);
 out:
@@ -720,6 +722,8 @@ out_in_epoch:
 #endif
 	NET_EPOCH_EXIT(et);
 out:
+	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()",
+	    tp->t_fb->tfb_tcp_block_name, tp));
 	/*
 	 * If the implicit bind in the connect call fails, restore
 	 * the flags we modified.
@@ -896,21 +900,20 @@ tcp_usr_shutdown(struct socket *so)
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("inp == NULL"));
 	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = ECONNRESET;
 		goto out;
 	}
-	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	socantsendmore(so);
 	tcp_usrclosed(tp);
 	if (!(inp->inp_flags & INP_DROPPED))
-		error = tcp_output(tp);
-
+		error = tcp_output_nodrop(tp);
 out:
 	TCPDEBUG2(PRU_SHUTDOWN);
 	TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
-	INP_WUNLOCK(inp);
+	error = tcp_unlock_or_drop(tp, error);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
@@ -925,17 +928,18 @@ tcp_usr_rcvd(struct socket *so, int flags)
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
-	int error = 0;
+	int outrv = 0, error = 0;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
 	INP_WLOCK(inp);
+	NET_EPOCH_ENTER(et);
+	tp = intotcpcb(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = ECONNRESET;
 		goto out;
 	}
-	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	/*
 	 * For passively-created TFO connections, don't attempt a window
@@ -947,18 +951,17 @@ tcp_usr_rcvd(struct socket *so, int flags)
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		goto out;
-	NET_EPOCH_ENTER(et);
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_rcvd(tp);
 	else
 #endif
-	tcp_output(tp);
-	NET_EPOCH_EXIT(et);
+		outrv = tcp_output_nodrop(tp);
 out:
 	TCPDEBUG2(PRU_RCVD);
 	TCP_PROBE2(debug__user, tp, PRU_RCVD);
-	INP_WUNLOCK(inp);
+	(void) tcp_unlock_or_drop(tp, outrv);
+	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
@@ -999,6 +1002,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
 	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
 	vflagsav = inp->inp_vflag;
 	incflagsav = inp->inp_inc.inc_flags;
 	restoreflags = false;
@@ -1018,7 +1022,6 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
 		m_freem(control);	/* empty control, just free it */
 		control = NULL;
 	}
-	tp = intotcpcb(inp);
 	if ((flags & PRUS_OOB) != 0 &&
 	    (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
 		goto out;
@@ -1188,7 +1191,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
 		    !(flags & PRUS_NOTREADY)) {
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags |= TF_MORETOCOME;
-			error = tcp_output(tp);
+			error = tcp_output_nodrop(tp);
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags &= ~TF_MORETOCOME;
 		}
@@ -1255,7 +1258,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
 		tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
 		if ((flags & PRUS_NOTREADY) == 0) {
 			tp->t_flags |= TF_FORCEDATA;
-			error = tcp_output(tp);
+			error = tcp_output_nodrop(tp);
 			tp->t_flags &= ~TF_FORCEDATA;
 		}
 	}
@@ -1285,7 +1288,7 @@ out:
 		  ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
 	TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
-	INP_WUNLOCK(inp);
+	error = tcp_unlock_or_drop(tp, error);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
@@ -1310,12 +1313,13 @@ tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
 	SOCKBUF_LOCK(&so->so_snd);
 	error = sbready(&so->so_snd, m, count);
 	SOCKBUF_UNLOCK(&so->so_snd);
-	if (error == 0) {
-		NET_EPOCH_ENTER(et);
-		error = tcp_output(tp);
-		NET_EPOCH_EXIT(et);
+	if (error) {
+		INP_WUNLOCK(inp);
+		return (error);
 	}
-	INP_WUNLOCK(inp);
+	NET_EPOCH_ENTER(et);
+	error = tcp_output_unlock(tp);
+	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
@@ -2238,7 +2242,7 @@ unlock_and_done:
 					struct epoch_tracker et;
 
 					NET_EPOCH_ENTER(et);
-					error = tcp_output(tp);
+					error = tcp_output_nodrop(tp);
 					NET_EPOCH_EXIT(et);
 				}
 			}
@@ -2767,7 +2771,8 @@ tcp_disconnect(struct tcpcb *tp)
 		sbflush(&so->so_rcv);
 		tcp_usrclosed(tp);
 		if (!(inp->inp_flags & INP_DROPPED))
-			tcp_output(tp);
+			/* Ignore stack's drop request, we already at it. */
+			(void)tcp_output_nodrop(tp);
 	}
 }
 
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index a3e6e8a05107..7792e2162852 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -316,7 +316,8 @@ struct tcptemp {
  * function below.
  */
 /* Flags for tcp functions */
-#define TCP_FUNC_BEING_REMOVED 0x01   	/* Can no longer be referenced */
+#define	TCP_FUNC_BEING_REMOVED	0x01   	/* Can no longer be referenced */
+#define	TCP_FUNC_OUTPUT_CANDROP	0x02   	/* tfb_tcp_output may ask tcp_drop */
 
 /*
  * If defining the optional tcp_timers, in the
@@ -386,12 +387,107 @@ struct tcp_function {
 
 TAILQ_HEAD(tcp_funchead, tcp_function);
 
+struct tcpcb * tcp_drop(struct tcpcb *, int);
+
+#ifdef _NETINET_IN_PCB_H_
+/*
+ * tcp_output()
+ * Handles tcp_drop request from advanced stacks and reports that inpcb is
+ * gone with negative return code.
+ * Drop in replacement for the default stack.
+ */
 static inline int
 tcp_output(struct tcpcb *tp)
 {
+	int rv;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	rv = tp->t_fb->tfb_tcp_output(tp);
+	if (rv < 0) {
+		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
+		    ("TCP stack %s requested tcp_drop(%p)",
+		    tp->t_fb->tfb_tcp_block_name, tp));
+		tp = tcp_drop(tp, -rv);
+		if (tp)
+			INP_WUNLOCK(tp->t_inpcb);
+	}
 
-	return (tp->t_fb->tfb_tcp_output(tp));
+	return (rv);
 }
+
+/*
+ * tcp_output_unlock()
+ * Always returns unlocked, handles drop request from advanced stacks.
+ * Always returns positive error code.
+ */
+static inline int
+tcp_output_unlock(struct tcpcb *tp)
+{
+	int rv;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	rv = tp->t_fb->tfb_tcp_output(tp);
+	if (rv < 0) {
+		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
+		    ("TCP stack %s requested tcp_drop(%p)",
+		    tp->t_fb->tfb_tcp_block_name, tp));
+		rv = -rv;
+		tp = tcp_drop(tp, rv);
+		if (tp)
+			INP_WUNLOCK(tp->t_inpcb);
+	} else
+		INP_WUNLOCK(tp->t_inpcb);
+
+	return (rv);
+}
+
+/*
+ * tcp_output_nodrop()
+ * Always returns locked.  It is caller's responsibility to run tcp_drop()!
+ * Useful in syscall implementations, when we want to perform some logging
+ * and/or tracing with tcpcb before calling tcp_drop().  To be used with
+ * tcp_unlock_or_drop() later.
+ *
+ * XXXGL: maybe don't allow stacks to return a drop request at certain
+ * TCP states? Why would it do in connect(2)? In recv(2)?
+ */
+static inline int
+tcp_output_nodrop(struct tcpcb *tp)
+{
+	int rv;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	rv = tp->t_fb->tfb_tcp_output(tp);
+	KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
+	    ("TCP stack %s requested tcp_drop(%p)",
+	    tp->t_fb->tfb_tcp_block_name, tp));
+	return (rv);
+}
+
+/*
+ * tcp_unlock_or_drop()
+ * Handle return code from tfb_tcp_output() after we have logged/traced,
+ * to be used with tcp_output_nodrop().
+ */
+static inline int
+tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval)
+{
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+        if (tcp_output_retval < 0) {
+                tcp_output_retval = -tcp_output_retval;
+                if (tcp_drop(tp, tcp_output_retval) != NULL)
+                        INP_WUNLOCK(tp->t_inpcb);
+        } else
+		INP_WUNLOCK(tp->t_inpcb);
+
+	return (tcp_output_retval);
+}
+#endif	/* _NETINET_IN_PCB_H_ */
 #endif	/* _KERNEL */
 
 /*
@@ -979,8 +1075,6 @@ void	 tcp_twclose(struct tcptw *, int);
 void	 tcp_ctlinput(int, struct sockaddr *, void *);
 int	 tcp_ctloutput(struct socket *, struct sockopt *);
 void 	 tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *);
-struct tcpcb *
-	 tcp_drop(struct tcpcb *, int);
 void	 tcp_drain(void);
 void	 tcp_init(void);
 void	 tcp_fini(void *);
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index 17900a047559..676eca5462bd 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -532,7 +532,8 @@ toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
 			KASSERT(!(tp->t_flags & TF_TOE),
 			    ("%s: tp %p still offloaded.", __func__, tp));
 			tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
-			(void) tcp_output(tp);
+			if (tcp_output(tp) < 0)
+				INP_WLOCK(inp);	/* re-acquire */
 		} else {
 			tp = tcp_drop(tp, err);
 			if (tp == NULL)