git: 141a53cd58cd - main - tcp: Rack might retransmit forever.

From: Randall Stewart <rrs_at_FreeBSD.org>
Date: Fri, 29 Oct 2021 21:39:17 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=141a53cd58cdf0681f800bb98ed5718f32ea7909

commit 141a53cd58cdf0681f800bb98ed5718f32ea7909
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2021-10-29 21:37:49 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2021-10-29 21:37:49 +0000

    tcp: Rack might retransmit forever.
    
    If we get a Sacked peer with an MTU change we can retransmit forever if the
    last bytes are sacked and the client goes away (think power off). Then we
    never see the end condition and continually retransmit.
    
    Reviewed by: Michael Tuexen
    Sponsored by: Netflix Inc.
    Differential Revision: https://reviews.freebsd.org/D32671
---
 sys/netinet/tcp_stacks/rack.c     | 66 ++++++++++++++++++++++++++++++---------
 sys/netinet/tcp_stacks/tcp_rack.h | 38 +++++++++++-----------
 2 files changed, 71 insertions(+), 33 deletions(-)

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 04252511ad18..616e079df60c 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -2422,7 +2422,8 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l
 			log.u_bbr.pkt_epoch = rsm->r_start;
 			log.u_bbr.lost = rsm->r_end;
 			log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
-			log.u_bbr.pacing_gain = rsm->r_flags;
+			/* We loose any upper of the 24 bits */
+			log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
 		} else {
 			/* Its a SYN */
 			log.u_bbr.pkt_epoch = rack->rc_tp->iss;
@@ -6670,6 +6671,7 @@ rack_remxt_tmr(struct tcpcb *tp)
 		if (rsm->r_flags & RACK_ACKED)
 			rsm->r_flags |= RACK_WAS_ACKED;
 		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
+		rsm->r_flags |= RACK_MUST_RXT;
 	}
 	/* Clear the count (we just un-acked them) */
 	rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
@@ -7257,7 +7259,6 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag)
 {
 	int32_t idx;
-	uint16_t stripped_flags;
 
 	rsm->r_rtr_cnt++;
 	rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
@@ -7278,7 +7279,6 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
 	 */
 	rsm->r_fas = ctf_flight_size(rack->rc_tp,
 				     rack->r_ctl.rc_sacked);
-	stripped_flags = rsm->r_flags & ~(RACK_SENT_SP|RACK_SENT_FP);
 	if (rsm->r_flags & RACK_ACKED) {
 		/* Problably MTU discovery messing with us */
 		rsm->r_flags &= ~RACK_ACKED;
@@ -16112,12 +16112,25 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 	rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
 	if (rack->r_must_retran) {
 		rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
-		if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
+		if ((SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) ||
+		    ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
 			/*
-			 * We have retransmitted all we need.
+			 * We have retransmitted all we need. If 
+			 * RACK_MUST_RXT is not set then we need to
+			 * not retransmit this guy.
 			 */
 			rack->r_must_retran = 0;
 			rack->r_ctl.rc_out_at_rto = 0;
+			if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
+				/* Not one we should rxt */
+				goto failed;
+			} else {
+				/* Clear the flag */
+				rsm->r_flags &= ~RACK_MUST_RXT;
+			}
+		} else {
+			/* Remove  the flag */
+			rsm->r_flags &= ~RACK_MUST_RXT;
 		}
 	}
 #ifdef TCP_ACCOUNTING
@@ -17004,8 +17017,8 @@ again:
 	if (rack->r_must_retran &&
 	    (rsm == NULL)) {
 		/*
-		 * Non-Sack and we had a RTO or MTU change, we
-		 * need to retransmit until we reach
+		 * Non-Sack and we had a RTO or Sack/non-Sack and a 
+		 * MTU change, we need to retransmit until we reach
 		 * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
 		 */
 		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
@@ -17028,12 +17041,24 @@ again:
 				sb = &so->so_snd;
 				goto just_return_nolock;
 			}
-			sack_rxmit = 1;
-			len = rsm->r_end - rsm->r_start;
-			sendalot = 0;
-			sb_offset = rsm->r_start - tp->snd_una;
-			if (len >= segsiz)
-				len = segsiz;
+			if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
+				/* It does not have the flag, we are done */
+				rack->r_must_retran = 0;
+				rack->r_ctl.rc_out_at_rto = 0;
+			} else {
+				sack_rxmit = 1;
+				len = rsm->r_end - rsm->r_start;
+				sendalot = 0;
+				sb_offset = rsm->r_start - tp->snd_una;
+				if (len >= segsiz)
+					len = segsiz;
+				/* 
+				 * Delay removing the flag RACK_MUST_RXT so
+				 * that the fastpath for retransmit will
+				 * work with this rsm.
+				 */
+
+			}
 		} else {
 			/* We must be done if there is nothing outstanding */
 			rack->r_must_retran = 0;
@@ -17080,6 +17105,15 @@ again:
 		if (ret == 0)
 			return (0);
 	}
+	if (rsm && (rsm->r_flags & RACK_MUST_RXT)) {
+		/* 
+		 * Clear the flag in prep for the send
+		 * note that if we can't get an mbuf
+		 * and fail, we won't retransmit this
+		 * rsm but that should be ok (its rare).
+		 */
+		rsm->r_flags &= ~RACK_MUST_RXT;
+	}
 	so = inp->inp_socket;
 	sb = &so->so_snd;
 	if (do_a_prefetch == 0) {
@@ -19313,6 +19347,7 @@ rack_mtu_change(struct tcpcb *tp)
 	 * The MSS may have changed
 	 */
 	struct tcp_rack *rack;
+	struct rack_sendmap *rsm;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
@@ -19329,7 +19364,10 @@ rack_mtu_change(struct tcpcb *tp)
 						rack->r_ctl.rc_sacked);
 		rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
 		rack->r_must_retran = 1;
-
+		/* Mark all inflight to needing to be rxt'd */
+		TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
+			rsm->r_flags |= RACK_MUST_RXT;
+		}
 	}
 	sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	/* We don't use snd_nxt to retransmit */
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index 0ada2116dc6f..0893237e92f9 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -28,22 +28,23 @@
 #ifndef _NETINET_TCP_RACK_H_
 #define _NETINET_TCP_RACK_H_
 
-#define RACK_ACKED	    0x0001/* The remote endpoint acked this */
-#define RACK_TO_REXT	    0x0002/* A timeout occured on this sendmap entry */
-#define RACK_DEFERRED	    0x0004/* We can't use this for RTT calc - not used */
-#define RACK_OVERMAX	    0x0008/* We have more retran's then we can fit */
-#define RACK_SACK_PASSED    0x0010/* A sack was done above this block */
-#define RACK_WAS_SACKPASS   0x0020/* We retransmitted due to SACK pass */
-#define RACK_HAS_FIN	    0x0040/* segment is sent with fin */
-#define RACK_TLP	    0x0080/* segment sent as tail-loss-probe */
-#define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */
-#define RACK_APP_LIMITED    0x0200/* We went app limited after this send */
-#define RACK_WAS_ACKED	    0x0400/* a RTO undid the ack, but it already had a rtt calc done */
-#define RACK_HAS_SYN	    0x0800/* SYN is on this guy */
-#define RACK_SENT_W_DSACK   0x1000/* Sent with a dsack */
-#define RACK_SENT_SP	    0x2000/* sent in slow path */
-#define RACK_SENT_FP        0x4000/* sent in fast path */
-#define RACK_HAD_PUSH	    0x8000/* Push was sent on original send */
+#define RACK_ACKED	    0x000001/* The remote endpoint acked this */
+#define RACK_TO_REXT	    0x000002/* A timeout occured on this sendmap entry */
+#define RACK_DEFERRED	    0x000004/* We can't use this for RTT calc - not used */
+#define RACK_OVERMAX	    0x000008/* We have more retran's then we can fit */
+#define RACK_SACK_PASSED    0x000010/* A sack was done above this block */
+#define RACK_WAS_SACKPASS   0x000020/* We retransmitted due to SACK pass */
+#define RACK_HAS_FIN	    0x000040/* segment is sent with fin */
+#define RACK_TLP	    0x000080/* segment sent as tail-loss-probe */
+#define RACK_RWND_COLLAPSED 0x000100/* The peer collapsed the rwnd on the segment */
+#define RACK_APP_LIMITED    0x000200/* We went app limited after this send */
+#define RACK_WAS_ACKED	    0x000400/* a RTO undid the ack, but it already had a rtt calc done */
+#define RACK_HAS_SYN	    0x000800/* SYN is on this guy */
+#define RACK_SENT_W_DSACK   0x001000/* Sent with a dsack */
+#define RACK_SENT_SP	    0x002000/* sent in slow path */
+#define RACK_SENT_FP        0x004000/* sent in fast path */
+#define RACK_HAD_PUSH	    0x008000/* Push was sent on original send */
+#define RACK_MUST_RXT	    0x010000/* We must retransmit this rsm (non-sack/mtu chg)*/
 #define RACK_NUM_OF_RETRANS 3
 
 #define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */
@@ -55,9 +56,8 @@ struct rack_sendmap {
 	uint32_t r_start;	/* Sequence number of the segment */
 	uint32_t r_end;		/* End seq, this is 1 beyond actually */
 	uint32_t r_rtr_bytes;	/* How many bytes have been retransmitted */
-	uint16_t r_rtr_cnt;	/* Retran count, index this -1 to get time
-				 * sent */
-	uint16_t r_flags;	/* Flags as defined above */
+	uint32_t r_flags : 24,	/* Flags as defined above */
+		 r_rtr_cnt : 8;	/* Retran count, index this -1 to get time */
 	struct mbuf *m;
 	uint32_t soff;
 	uint32_t orig_m_len;