git: 15a7c88058d4 - stable/13 - update the SACK loss recovery to RFC6675, with the following new features: - improved pipe calculation which does not degrade under heavy loss - engaging in Loss Recovery earlier under adverse conditions - Rescue Retransmission in case some of the trailing packets of a request got lost

Tue Mar 2 17:13:48 UTC 2021

The branch stable/13 has been updated by rscheff:

URL: https://cgit.FreeBSD.org/src/commit/?id=15a7c88058d419e3347673ab891ae77ba28ae1bd

commit 15a7c88058d419e3347673ab891ae77ba28ae1bd
Author:     Richard Scheffenegger <rscheff at FreeBSD.org>
AuthorDate: 2021-02-16 11:18:43 +0000
Commit:     Richard Scheffenegger <rscheff at FreeBSD.org>
CommitDate: 2021-03-02 15:59:58 +0000

    update the SACK loss recovery to RFC6675, with the following new features:
    - improved pipe calculation which does not degrade under heavy loss
    - engaging in Loss Recovery earlier under adverse conditions
    - Rescue Retransmission in case some of the trailing packets of a request got lost
    
    All above changes are toggled with the sysctl "rfc6675_pipe" (disabled by default).
    
    Reviewers:      #transport, tuexen, lstewart, slavash, jtl, hselasky, kib, rgrimes, chengc_netapp.com, thj, #manpages, kbowling, #netapp, rscheff
    Reviewed By:    #transport
    MFC after:      2 weeks
    Sponsored by:   NetApp, Inc.
    Differential Revision:  https://reviews.freebsd.org/D18985
    
    (cherry picked from commit 3c40e1d52cd86168779cf99dbabe58df465d7e3f)
---
 share/man/man4/tcp.4    | 10 +++++++++-
 sys/netinet/tcp_input.c | 34 +++++++++++++++++++++++++++++-----
 sys/netinet/tcp_sack.c  | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index 431bcd95513b..16cf02184516 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd February 11, 2021
+.Dd February 13, 2021
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -560,6 +560,14 @@ high losses leading to RTO, but reduces PRR effectiveness in more common setting
 .It Va rfc6675_pipe
 Calculate the bytes in flight using the algorithm described in RFC 6675, and
 is also an improvement when Proportional Rate Reduction is enabled.
+Also enables two other mechanisms from RFC6675.
+Rescue Retransmission helps timely loss recovery, when the trailing segments
+of a transmission are lost, while no additional data is ready to be sent.
+In case a partial ACK without a SACK block is received during SACK loss
+recovery, the trailing segment is immediately resent, rather than waiting
+for a Retransmission timeout.
+SACK loss recovery is also engaged, once two segments plus one byte are
+SACKed - even if no traditional duplicate ACKs were seen.
 .It Va rfc3042
 Enable the Limited Transmit algorithm as described in RFC 3042.
 It helps avoid timeouts on lossy links and also when the congestion window
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index f16c62ae6a1f..1a4a4619c4bf 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1507,6 +1507,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	struct mbuf *mfree;
 	struct tcpopt to;
 	int tfo_syn;
+	u_int maxseg;
 
 #ifdef TCPDEBUG
 	/*
@@ -2512,8 +2513,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 #endif
 
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
-			u_int maxseg;
-
 			maxseg = tcp_maxseg(tp);
 			if (tlen == 0 &&
 			    (tiwin == tp->snd_wnd ||
@@ -2648,7 +2647,21 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 						tp->snd_cwnd += maxseg;
 					(void) tp->t_fb->tfb_tcp_output(tp);
 					goto drop;
-				} else if (tp->t_dupacks == tcprexmtthresh) {
+				} else if (tp->t_dupacks == tcprexmtthresh ||
+					    (tp->t_flags & TF_SACK_PERMIT &&
+					     V_tcp_do_rfc6675_pipe &&
+					     tp->sackhint.sacked_bytes >
+					     (tcprexmtthresh - 1) * maxseg)) {
+enter_recovery:
+					/*
+					 * Above is the RFC6675 trigger condition of
+					 * more than (dupthresh-1)*maxseg sacked data.
+					 * If the count of holes in the
+					 * scoreboard is >= dupthresh, we could
+					 * also enter loss recovery, but don't
+					 * have that value readily available.
+					 */
+					tp->t_dupacks = tcprexmtthresh;
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
@@ -2693,6 +2706,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 						tp->snd_recover = tp->snd_nxt;
 						tp->snd_cwnd = maxseg;
 						(void) tp->t_fb->tfb_tcp_output(tp);
+						if (SEQ_GT(th->th_ack, tp->snd_una))
+							goto resume_partialack;
 						goto drop;
 					}
 					tp->snd_nxt = th->th_ack;
@@ -2779,10 +2794,19 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 			 */
 			if ((tp->t_flags & TF_SACK_PERMIT) &&
 			    (to.to_flags & TOF_SACK) &&
-			    sack_changed)
+			    sack_changed) {
 				tp->t_dupacks++;
+				/* limit overhead by setting maxseg last */
+				if (!IN_FASTRECOVERY(tp->t_flags) &&
+				    (tp->sackhint.sacked_bytes >
+				    ((tcprexmtthresh - 1) *
+				    (maxseg = tcp_maxseg(tp))))) {
+					goto enter_recovery;
+				}
+			}
 		}
 
+resume_partialack:
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("%s: th_ack <= snd_una", __func__));
 
@@ -2793,7 +2817,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				if (tp->t_flags & TF_SACK_PERMIT)
-					if (V_tcp_do_prr)
+					if (V_tcp_do_prr && to.to_flags & TOF_SACK)
 						tcp_prr_partialack(tp, th);
 					else
 						tcp_sack_partialack(tp, th);
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 2cae6a560c48..28cd5c93f106 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -750,6 +750,16 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 		else
 			sblkp--;
 	}
+	if (!(to->to_flags & TOF_SACK))
+		/*
+		 * If this ACK did not contain any
+		 * SACK blocks, any only moved the
+		 * left edge right, it is a pure
+		 * cumulative ACK. Do not count
+		 * DupAck for this. Also required
+		 * for RFC6675 rescue retransmission.
+		 */
+		sack_changed = 0;
 	tp->sackhint.delivered_data = delivered_data;
 	tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
 	KASSERT((delivered_data >= 0), ("delivered_data < 0"));
@@ -800,6 +810,31 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;
 	tp->t_flags |= TF_ACKNOW;
+	/*
+	 * RFC6675 rescue retransmission
+	 * Add a hole between th_ack (snd_una is not yet set) and snd_max,
+	 * if this was a pure cumulative ACK and no data was send beyond
+	 * recovery point. Since the data in the socket has not been freed
+	 * at this point, we check if the scoreboard is empty, and the ACK
+	 * delivered some new data, indicating a full ACK. Also, if the
+	 * recovery point is still at snd_max, we are probably application
+	 * limited. However, this inference might not always be true. The
+	 * rescue retransmission may rarely be slightly premature
+	 * compared to RFC6675.
+	 * The corresponding ACK+SACK will cause any further outstanding
+	 * segments to be retransmitted. This addresses a corner case, when
+	 * the trailing packets of a window are lost and no further data
+	 * is available for sending.
+	 */
+	if ((V_tcp_do_rfc6675_pipe) &&
+	    SEQ_LT(th->th_ack, tp->snd_recover) &&
+	    (tp->snd_recover == tp->snd_max) &&
+	    TAILQ_EMPTY(&tp->snd_holes) &&
+	    (tp->sackhint.delivered_data > 0)) {
+		struct sackhole *hole;
+		int maxseg = tcp_maxseg(tp);
+		hole = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, tp->snd_max - maxseg), tp->snd_max, NULL);
+	}
 	(void) tp->t_fb->tfb_tcp_output(tp);
 }