git: 62ce18fc9a8e - main - tcp: Rack rwnd collapse.

From: Randall Stewart <rrs_at_FreeBSD.org>
Date: Tue, 23 Aug 2022 13:18:19 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b

commit 62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2022-08-23 13:17:05 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2022-08-23 13:17:05 +0000

    tcp: Rack rwnd collapse.
    
    Currently when the peer collapses its rwnd, we mark packets to be retransmitted
    and use the must_retran flags like we do when a PMTU collapses to retransmit the
    collapsed packets. However this causes a problem with some middleboxes that
    play with the rwnd to control flow. As soon as the rwnd increases we start resending,
    which may be less than an RTT after the collapse — and in fact the peer may already have
    gotten the packets, which means we gratuitously retransmit packets we should not.
    
    The fix here is to make sure that a RACK retransmit time (the RTT-based reorder
    threshold) has passed before retransmitting the packets. This makes sure that the
    rwnd collapse was real and the packets do in fact need retransmission.
    
    Reviewed by: tuexen
    Sponsored by: Netflix Inc
    Differential Revision: https://reviews.freebsd.org/D35166
---
 sys/netinet/tcp_log_buf.h         |   4 +-
 sys/netinet/tcp_stacks/rack.c     | 403 ++++++++++++++++++++++++++------------
 sys/netinet/tcp_stacks/tcp_rack.h |  19 +-
 3 files changed, 300 insertions(+), 126 deletions(-)

diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index 1290a8ce6b29..c11757099c5d 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -236,7 +236,9 @@ enum tcp_log_events {
 	TCP_LOG_FSB,		/* FSB information 63 */
 	RACK_DSACK_HANDLING,	/* Handling of DSACK in rack for reordering window 64 */
 	TCP_HYSTART,		/* TCP Hystart logging 65 */
-	TCP_LOG_END		/* End (keep at end)                66 */
+	TCP_CHG_QUERY,		/* Change query during fnc_init() 66 */
+	TCP_RACK_LOG_COLLAPSE,	/* Window collapse by peer 67 */
+	TCP_LOG_END		/* End (keep at end)                68 */
 };
 
 enum tcp_log_states {
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 0c91c9c6703f..ea370fe9247c 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -385,6 +385,9 @@ counter_u64_t rack_move_some;
 
 counter_u64_t rack_input_idle_reduces;
 counter_u64_t rack_collapsed_win;
+counter_u64_t rack_collapsed_win_seen;
+counter_u64_t rack_collapsed_win_rxt;
+counter_u64_t rack_collapsed_win_rxt_bytes;
 counter_u64_t rack_try_scwnd;
 counter_u64_t rack_hw_pace_init_fail;
 counter_u64_t rack_hw_pace_lost;
@@ -790,6 +793,9 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 		counter_u64_zero(rack_move_some);
 		counter_u64_zero(rack_try_scwnd);
 		counter_u64_zero(rack_collapsed_win);
+		counter_u64_zero(rack_collapsed_win_rxt);
+		counter_u64_zero(rack_collapsed_win_seen);
+		counter_u64_zero(rack_collapsed_win_rxt_bytes);
 	}
 	rack_clear_counter = 0;
 	return (0);
@@ -1757,12 +1763,31 @@ rack_init_sysctls(void)
 	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
 	    &rack_input_idle_reduces,
 	    "Total number of idle reductions on input");
+	rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
+	    &rack_collapsed_win_seen,
+	    "Total number of collapsed window events seen (where our window shrinks)");
+
 	rack_collapsed_win = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "collapsed_win", CTLFLAG_RD,
 	    &rack_collapsed_win,
-	    "Total number of collapsed windows");
+	    "Total number of collapsed window events where we mark packets");
+	rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
+	    &rack_collapsed_win_rxt,
+	    "Total number of packets that were retransmitted");
+	rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
+	    &rack_collapsed_win_rxt_bytes,
+	    "Total number of bytes that were retransmitted");
 	rack_try_scwnd = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
@@ -2772,6 +2797,9 @@ rack_counter_destroy(void)
 	counter_u64_free(rack_sack_splits);
 	counter_u64_free(rack_input_idle_reduces);
 	counter_u64_free(rack_collapsed_win);
+	counter_u64_free(rack_collapsed_win_rxt);
+	counter_u64_free(rack_collapsed_win_rxt_bytes);
+	counter_u64_free(rack_collapsed_win_seen);
 	counter_u64_free(rack_try_scwnd);
 	counter_u64_free(rack_persists_sends);
 	counter_u64_free(rack_persists_acks);
@@ -5295,7 +5323,9 @@ activate_rxt:
 		goto activate_rxt;
 	}
 	/* Convert from ms to usecs */
-	if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
+	if ((rsm->r_flags & RACK_SACK_PASSED) ||
+	    (rsm->r_flags & RACK_RWND_COLLAPSED) ||
+	    (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & RACK_HAS_FIN)) {
@@ -5757,7 +5787,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
 		 * real pacing. And the tlp or rxt is smaller
 		 * than the pacing calculation. Lets not
 		 * pace that long since we know the calculation
-		 * so far is not accurate. 
+		 * so far is not accurate.
 		 */
 		slot = hpts_timeout;
 	}
@@ -6501,7 +6531,7 @@ rack_remxt_tmr(struct tcpcb *tp)
 		trsm = rsm;
 		if (rsm->r_flags & RACK_ACKED)
 			rsm->r_flags |= RACK_WAS_ACKED;
-		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
+		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
 		rsm->r_flags |= RACK_MUST_RXT;
 	}
 	/* Clear the count (we just un-acked them) */
@@ -8040,6 +8070,13 @@ rack_log_sack_passed(struct tcpcb *tp,
 			 */
 			continue;
 		}
+		if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
+			/*
+			 * If the peer dropped the rwnd on
+			 * these then we don't worry about them.
+			 */
+			continue;
+		}
 		if (nrsm->r_flags & RACK_SACK_PASSED) {
 			/*
 			 * We found one that is already marked
@@ -9797,7 +9834,7 @@ rack_strike_dupack(struct tcp_rack *rack)
 			/* Sendmap entries that are marked to
 			 * be retransmitted do not need dupack's
 			 * struck. We get these marks for a number
-			 * of reasons (rxt timeout with no sack, 
+			 * of reasons (rxt timeout with no sack,
 			 * mtu change, or rwnd collapses). When
 			 * these events occur, we know we must retransmit
 			 * them and mark the sendmap entries. Dupack counting
@@ -10308,47 +10345,83 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	return (0);
 }
 
+
 static void
-rack_collapsed_window(struct tcp_rack *rack)
+rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
+		  int dir, uint32_t flags, struct rack_sendmap *rsm)
+{
+	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+		union tcp_log_stackspecific log;
+		struct timeval tv;
+
+		memset(&log, 0, sizeof(log));
+		log.u_bbr.flex1 = cnt;
+		log.u_bbr.flex2 = split;
+		log.u_bbr.flex3 = out;
+		log.u_bbr.flex4 = line;
+		log.u_bbr.flex5 = rack->r_must_retran;
+		log.u_bbr.flex6 = flags;
+		log.u_bbr.flex7 = rack->rc_has_collapsed;
+		log.u_bbr.flex8 = dir;	/*
+					 * 1 is collapsed, 0 is uncollapsed,
+					 * 2 is log of a rsm being marked, 3 is a split.
+					 */
+		if (rsm == NULL)
+			log.u_bbr.rttProp = 0;
+		else
+			log.u_bbr.rttProp = (uint64_t)rsm;
+		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+		TCP_LOG_EVENTP(rack->rc_tp, NULL,
+		    &rack->rc_inp->inp_socket->so_rcv,
+		    &rack->rc_inp->inp_socket->so_snd,
+		    TCP_RACK_LOG_COLLAPSE, 0,
+		    0, &log, false, &tv);
+	}
+}
+
+static void
+rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
 {
 	/*
-	 * Now we must walk the
-	 * send map and divide the
-	 * ones left stranded. These
-	 * guys can't cause us to abort
-	 * the connection and are really
-	 * "unsent". However if a buggy
-	 * client actually did keep some
-	 * of the data i.e. collapsed the win
-	 * and refused to ack and then opened
-	 * the win and acked that data. We would
-	 * get into an ack war, the simplier
-	 * method then of just pretending we
-	 * did not send those segments something
-	 * won't work.
+	 * Here all we do is mark the collapsed point and set the flag.
+	 * This may happen again and again, but there is no
+	 * sense splitting our map until we know where the
+	 * peer finally lands in the collapse.
 	 */
-	struct rack_sendmap *rsm, *nrsm, fe;
+	rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
+	if ((rack->rc_has_collapsed == 0) ||
+	    (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)))
+		counter_u64_add(rack_collapsed_win_seen, 1);
+	rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
+	rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
+	rack->rc_has_collapsed = 1;
+	rack->r_collapse_point_valid = 1;
+	rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
+}
+
+static void
+rack_un_collapse_window(struct tcp_rack *rack, int line)
+{
+	struct rack_sendmap *nrsm, *rsm, fe;
+	int cnt = 0, split = 0;
 #ifdef INVARIANTS
 	struct rack_sendmap *insret;
 #endif
-	tcp_seq max_seq;
 
-	rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
-	max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
 	memset(&fe, 0, sizeof(fe));
-	fe.r_start = max_seq;
-	/* Find the first seq past or at maxseq */
+	rack->rc_has_collapsed = 0;
+	fe.r_start = rack->r_ctl.last_collapse_point;
 	rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 	if (rsm == NULL) {
-		/* Nothing to do strange */
-		rack->rc_has_collapsed = 0;
+		/* Nothing to do maybe the peer ack'ed it all */
+		rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
 		return;
 	}
-	/*
-	 * Now do we need to split at
-	 * the collapse point?
-	 */
-	if (SEQ_GT(max_seq, rsm->r_start)) {
+	/* Now do we need to split this one? */
+	if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
+		rack_log_collapse(rack, rsm->r_start, rsm->r_end,
+				  rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm);
 		nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
 		if (nrsm == NULL) {
 			/* We can't get a rsm, mark all? */
@@ -10356,7 +10429,8 @@ rack_collapsed_window(struct tcp_rack *rack)
 			goto no_split;
 		}
 		/* Clone it */
-		rack_clone_rsm(rack, nrsm, rsm, max_seq);
+		split = 1;
+		rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
 #ifndef INVARIANTS
 		(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 #else
@@ -10366,7 +10440,8 @@ rack_collapsed_window(struct tcp_rack *rack)
 			      nrsm, insret, rack, rsm);
 		}
 #endif
-		rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
+		rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
+				 rack->r_ctl.last_collapse_point, __LINE__);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
@@ -10378,38 +10453,15 @@ rack_collapsed_window(struct tcp_rack *rack)
 		rsm = nrsm;
 	}
 no_split:
-	counter_u64_add(rack_collapsed_win, 1);
 	RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
 		nrsm->r_flags |= RACK_RWND_COLLAPSED;
+		rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
+		cnt++;
 	}
-	rack->rc_has_collapsed = 1;
-}
-
-static void
-rack_un_collapse_window(struct tcp_rack *rack)
-{
-	struct rack_sendmap *rsm;
-	int cnt = 0;;
-
-	rack->r_ctl.rc_out_at_rto = 0;
-	rack->r_ctl.rc_snd_max_at_rto = rack->rc_tp->snd_una;
-	RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
-		if (rsm->r_flags & RACK_RWND_COLLAPSED) {
-			rsm->r_flags &= ~RACK_RWND_COLLAPSED;
-			rsm->r_flags |= RACK_MUST_RXT;
-			if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
-				rack->r_ctl.rc_snd_max_at_rto = rsm->r_end;
-				rack->r_ctl.rc_out_at_rto += (rsm->r_end - rsm->r_start);
-			}
-			cnt++;
-		}
-		else
-			break;
-	}
-	rack->rc_has_collapsed = 0;
 	if (cnt) {
-		rack->r_must_retran = 1;
+		counter_u64_add(rack_collapsed_win, 1);
 	}
+	rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
 }
 
 static void
@@ -10518,9 +10570,12 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	}
 	if (tp->snd_wnd < ctf_outstanding(tp))
 		/* The peer collapsed the window */
-		rack_collapsed_window(rack);
+		rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
 	else if (rack->rc_has_collapsed)
-		rack_un_collapse_window(rack);
+		rack_un_collapse_window(rack, __LINE__);
+	if ((rack->r_collapse_point_valid) &&
+	    (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
+		rack->r_collapse_point_valid = 0;
 	/* Was persist timer active and now we have window space? */
 	if ((rack->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -11076,10 +11131,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	 */
 	if (tp->snd_wnd < ctf_outstanding(tp)) {
 		/* The peer collapsed the window */
-		rack_collapsed_window(rack);
+		rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
 	} else if (rack->rc_has_collapsed)
-		rack_un_collapse_window(rack);
-
+		rack_un_collapse_window(rack, __LINE__);
+	if ((rack->r_collapse_point_valid) &&
+	    (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
+		rack->r_collapse_point_valid = 0;
 	/*
 	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
 	 */
@@ -13066,13 +13123,6 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin
 		/* Not a valid win update */
 		return;
 	}
-	if (tp->snd_wnd > tp->max_sndwnd)
-		tp->max_sndwnd = tp->snd_wnd;
-	if (tp->snd_wnd < (tp->snd_max - high_seq)) {
-		/* The peer collapsed the window */
-		rack_collapsed_window(rack);
-	} else if (rack->rc_has_collapsed)
-		rack_un_collapse_window(rack);
 	/* Do we exit persists? */
 	if ((rack->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -13609,6 +13659,15 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 #ifdef TCP_ACCOUNTING
 	ts_val = get_cyclecount();
 #endif
+	/* Tend to any collapsed window */
+	if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
+		/* The peer collapsed the window */
+		rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__);
+	} else if (rack->rc_has_collapsed)
+		rack_un_collapse_window(rack, __LINE__);
+	if ((rack->r_collapse_point_valid) &&
+	    (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
+		rack->r_collapse_point_valid = 0;
 	acked_amount = acked = (high_seq - tp->snd_una);
 	if (acked) {
 		/*
@@ -15930,6 +15989,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
+		if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+			rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
+			counter_u64_add(rack_collapsed_win_rxt, 1);
+			counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
+		}
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		if (rack->rack_no_prr)
@@ -16538,6 +16602,58 @@ failed:
 	return (-1);
 }
 
+static struct rack_sendmap *
+rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
+{
+	struct rack_sendmap *rsm = NULL;
+	struct rack_sendmap fe;
+	int thresh;
+
+restart:
+	fe.r_start = rack->r_ctl.last_collapse_point;
+	rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+	if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
+		/* Nothing, strange turn off validity  */
+		rack->r_collapse_point_valid = 0;
+		return (NULL);
+	}
+	/* Can we send it yet? */
+	if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
+		/*
+		 * Receiver window has not grown enough for
+		 * the segment to be put on the wire.
+		 */
+		return (NULL);
+	}
+	if (rsm->r_flags & RACK_ACKED) {
+		/*
+		 * It has been sacked, lets move to the
+		 * next one if possible.
+		 */
+		rack->r_ctl.last_collapse_point = rsm->r_end;
+		/* Are we done? */
+		if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+			    rack->r_ctl.high_collapse_point)) {
+			rack->r_collapse_point_valid = 0;
+			return (NULL);
+		}
+		goto restart;
+	}
+	/* Now has it been long enough ? */
+	thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
+	if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
+		rack_log_collapse(rack, rsm->r_start,
+				  (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+				  thresh, __LINE__, 6, rsm->r_flags, rsm);
+		return (rsm);
+	}
+	/* Not enough time */
+	rack_log_collapse(rack, rsm->r_start,
+			  (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+			  thresh, __LINE__, 7, rsm->r_flags, rsm);
+	return (NULL);
+}
+
 static int
 rack_output(struct tcpcb *tp)
 {
@@ -16598,7 +16714,6 @@ rack_output(struct tcpcb *tp)
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
 #endif
-	uint8_t filled_all = 0;
 	bool hw_tls = false;
 
 	/* setup and take the cache hits here */
@@ -16863,6 +16978,29 @@ again:
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (len >= segsiz)
 			len = segsiz;
+	} else if (rack->r_collapse_point_valid &&
+		   ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
+		/*
+		 * If an RSM is returned then enough time has passed
+		 * for us to retransmit it. Move up the collapse point,
+		 * since this rsm has its chance to retransmit now.
+		 */
+		rack_trace_point(rack, RACK_TP_COLLAPSED_RXT);
+		rack->r_ctl.last_collapse_point = rsm->r_end;
+		/* Are we done? */
+		if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+			    rack->r_ctl.high_collapse_point))
+			rack->r_collapse_point_valid = 0;
+		sack_rxmit = 1;
+		/* We are not doing a TLP */
+		doing_tlp = 0;
+		len = rsm->r_end - rsm->r_start;
+		sb_offset = rsm->r_start - tp->snd_una;
+		sendalot = 0;
+		if ((rack->full_size_rxt == 0) &&
+		    (rack->shape_rxt_to_pacing_min == 0) &&
+		    (len >= segsiz))
+			len = segsiz;
 	} else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
 		/* We have a retransmit that takes precedence */
 		if ((!IN_FASTRECOVERY(tp->t_flags)) &&
@@ -16921,53 +17059,72 @@ again:
 	}
 	if (rack->r_must_retran &&
 	    (doing_tlp == 0) &&
+	    (SEQ_GT(tp->snd_max, tp->snd_una)) &&
 	    (rsm == NULL)) {
 		/*
-		 * Non-Sack and we had a RTO or Sack/non-Sack and a
-		 * MTU change, we need to retransmit until we reach
-		 * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
+		 * There are two different ways that we
+		 * can get into this block:
+		 * a) This is a non-sack connection, we had a time-out
+		 *    and thus r_must_retran was set and everything
+		 *    left outstanding as been marked for retransmit.
+		 * b) The MTU of the path shrank, so that everything
+		 *    was marked to be retransmitted with the smaller
+		 *    mtu and r_must_retran was set.
+		 *
+		 * This means that we expect the sendmap (outstanding)
+		 * to all be marked must. We can use the tmap to
+		 * look at them.
+		 *
 		 */
-		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
-			int sendwin, flight;
-
-			sendwin = min(tp->snd_wnd, tp->snd_cwnd);
-			flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
-			if (flight >= sendwin) {
-				so = inp->inp_socket;
-				sb = &so->so_snd;
-				goto just_return_nolock;
-			}
-			rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
-			if (rsm == NULL) {
-				/* TSNH */
-				rack->r_must_retran = 0;
-				rack->r_ctl.rc_out_at_rto = 0;
-				so = inp->inp_socket;
-				sb = &so->so_snd;
-				goto just_return_nolock;
-			}
-			if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
-				/* It does not have the flag, we are done */
-				rack->r_must_retran = 0;
-				rack->r_ctl.rc_out_at_rto = 0;
-			} else {
-				sack_rxmit = 1;
-				len = rsm->r_end - rsm->r_start;
-				sendalot = 0;
-				sb_offset = rsm->r_start - tp->snd_una;
-				if (len >= segsiz)
-					len = segsiz;
-				/*
-				 * Delay removing the flag RACK_MUST_RXT so
-				 * that the fastpath for retransmit will
-				 * work with this rsm.
-				 */
+		int sendwin, flight;
 
-			}
-		} else {
-			/* We must be done if there is nothing outstanding */
+		sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+		flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
+		if (flight >= sendwin) {
+			/*
+			 * We can't send yet.
+			 */
+			so = inp->inp_socket;
+			sb = &so->so_snd;
+			goto just_return_nolock;
+		}
+		/*
+		 * This is the case a/b mentioned above. All
+		 * outstanding/not-acked should be marked.
+		 * We can use the tmap to find them.
+		 */
+		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+		if (rsm == NULL) {
+			/* TSNH */
+			rack->r_must_retran = 0;
+			rack->r_ctl.rc_out_at_rto = 0;
+			so = inp->inp_socket;
+			sb = &so->so_snd;
+			goto just_return_nolock;
+		}
+		if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
+			/*
+			 * The first one does not have the flag, did we collapse
+			 * further up in our list?
+			 */
 			rack->r_must_retran = 0;
 			rack->r_ctl.rc_out_at_rto = 0;
+			rsm = NULL;
+			sack_rxmit = 0;
+		} else {
+			sack_rxmit = 1;
+			len = rsm->r_end - rsm->r_start;
+			sb_offset = rsm->r_start - tp->snd_una;
+			sendalot = 0;
+			if ((rack->full_size_rxt == 0) &&
+			    (rack->shape_rxt_to_pacing_min == 0) &&
+			    (len >= segsiz))
+				len = segsiz;
+			/*
+			 * Delay removing the flag RACK_MUST_RXT so
+			 * that the fastpath for retransmit will
+			 * work with this rsm.
+			 */
 		}
 	}
 	/*
@@ -18177,7 +18334,7 @@ send:
 				if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
 				((rsm == NULL) ? hw_tls : 0)
 #ifdef NETFLIX_COPY_ARGS
-				, &filled_all
+				, &s_mb, &s_moff
 #endif
 				);
 			if (len <= (tp->t_maxseg - optlen)) {
@@ -18548,15 +18705,17 @@ send:
 		log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
 		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex4 = orig_len;
-		if (filled_all)
-			log.u_bbr.flex5 = 0x80000000;
-		else
-			log.u_bbr.flex5 = 0;
 		/* Save off the early/late values */
 		log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
 		log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
 		log.u_bbr.bw_inuse = rack_get_bw(rack);
-		if (rsm || sack_rxmit) {
+		log.u_bbr.flex8 = 0;
+		if (rsm) {
+			if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+				rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
+				counter_u64_add(rack_collapsed_win_rxt, 1);
+				counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
+			}
 			if (doing_tlp)
 				log.u_bbr.flex8 = 2;
 			else
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index e8560446b798..c747ceac7628 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -275,7 +275,7 @@ struct rack_opts_stats {
  * non-zero, the default is 4 for continuous tracing.
  * You also set in the number of connections you want
  * have get BB logs in net.inet.tcp.<stack>.tp.count.
- * 
+ *
  * Count will decrement every time BB logging is assigned
  * to a connection that hit your tracepoint.
  *
@@ -291,6 +291,7 @@ struct rack_opts_stats {
 #define RACK_TP_HWENOBUF	0x00000001	/* When we are doing hardware pacing and hit enobufs */
 #define RACK_TP_ENOBUF		0x00000002	/* When we hit enobufs with software pacing */
 #define RACK_TP_COLLAPSED_WND	0x00000003	/* When a peer to collapses its rwnd on us */
+#define RACK_TP_COLLAPSED_RXT	0x00000004	/* When we actually retransmit a collapsed window rsm */
 
 #define MIN_GP_WIN 6	/* We need at least 6 MSS in a GP measurement */
 #ifdef _KERNEL
@@ -472,6 +473,8 @@ struct rack_control {
 	uint32_t roundends;		/* acked value above which round ends */
 	uint32_t num_dsack;		/* Count of dsack's seen  (1 per window)*/
 	uint32_t forced_ack_ts;
+ 	uint32_t last_collapse_point;	/* Last point peer collapsed too */
+	uint32_t high_collapse_point;
 	uint32_t rc_lower_rtt_us_cts;	/* Time our GP rtt was last lowered */
 	uint32_t rc_time_probertt_entered;
 	uint32_t rc_time_probertt_starts;
@@ -546,7 +549,15 @@ struct tcp_rack {
 	struct inpcb *rc_inp;	/* The inpcb Lock(a) */
 	uint8_t rc_free_cnt;	/* Number of free entries on the rc_free list
 				 * Lock(a) */
-	uint8_t client_bufferlvl;	/* 0 - 5 normaly, less than or at 2 means its real low */
+	uint8_t client_bufferlvl : 4, /* Expected range [0,5]: 0=unset, 1=low/empty */
+		rack_deferred_inited : 1,
+	        /* ******************************************************************** */
+	        /* Note for details of next two fields see rack_init_retransmit_rate()  */
+	        /* ******************************************************************** */
+		full_size_rxt: 1,
+		shape_rxt_to_pacing_min : 1,
+	        /* ******************************************************************** */
+		spare : 1;
 	uint8_t no_prr_addback : 1,
 		gp_ready : 1,
 		defer_options: 1,
@@ -647,7 +658,9 @@ struct tcp_rack {
 		r_late : 1,
 		r_wanted_output: 1,
 		r_rr_config : 2,
-		rc_avail_bit : 3;
+		r_persist_lt_bw_off : 1,
+ 		r_collapse_point_valid : 1,
+		rc_avail_bit : 2;
 	uint16_t rc_init_win : 8,
 		rc_gp_rtt_set : 1,
 		rc_gp_dyn_mul : 1,