git: 43b117f88f30 - main - tcp: make the maximum number of retransmissions tunable per VNET

From: Richard Scheffenegger <rscheff_at_FreeBSD.org>
Date: Tue, 06 Jun 2023 21:02:40 UTC
The branch main has been updated by rscheff:

URL: https://cgit.FreeBSD.org/src/commit/?id=43b117f88f3044d5f08e70b0daf0bb964f9ecb4b

commit 43b117f88f3044d5f08e70b0daf0bb964f9ecb4b
Author:     Richard Scheffenegger <rscheff@FreeBSD.org>
AuthorDate: 2023-06-06 20:56:44 +0000
Commit:     Richard Scheffenegger <rscheff@FreeBSD.org>
CommitDate: 2023-06-06 20:58:54 +0000

    tcp: make the maximum number of retransmissions tunable per VNET
    
    Both Windows (TcpMaxDataRetransmissions) and Linux (tcp_retries2)
    allow restricting the maximum number of consecutive timer-based
    retransmissions. Add that same capability on a per-VNET basis to
    FreeBSD.
    
    Reviewed By:            cc, tuexen, #transport
    Sponsored by:           NetApp, Inc.
    Differential Revision:  https://reviews.freebsd.org/D40424
---
 share/man/man4/tcp.4          |  5 ++++-
 sys/netinet/tcp_output.c      |  2 +-
 sys/netinet/tcp_stacks/bbr.c  |  8 ++++----
 sys/netinet/tcp_stacks/rack.c |  8 ++++----
 sys/netinet/tcp_timer.c       | 30 ++++++++++++++++++++++++++----
 sys/netinet/tcp_var.h         |  2 ++
 6 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index ce27705f7eda..382e39a4355d 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd February 3, 2023
+.Dd June 6, 2023
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -843,6 +843,9 @@ Maximum size of automatic receive buffer.
 Initial
 .Tn TCP
 receive window (buffer size).
+.It Va retries
+Maximum number of consecutive timer based retransmits sent after a data
+segment is lost (default and maximum is 12).
 .It Va rexmit_drop_options
 Drop TCP options from third and later retransmitted SYN segments
 of a connection.
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index abfab1a62176..800480413586 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1766,7 +1766,7 @@ tcp_setpersist(struct tcpcb *tp)
 			tt = maxunacktime;
 	}
 	tcp_timer_activate(tp, TT_PERSIST, tt);
-	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+	if (tp->t_rxtshift < V_tcp_retries)
 		tp->t_rxtshift++;
 }
 
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 5ecb558dadb3..1e8053afc45c 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -4763,7 +4763,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 	 * the idle time (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
-	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
+	if (tp->t_rxtshift >= V_tcp_retries &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
@@ -4796,7 +4796,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 			tp->t_flags &= ~TF_DELACK;
 		free(t_template, M_TEMP);
 	}
-	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+	if (tp->t_rxtshift < V_tcp_retries)
 		tp->t_rxtshift++;
 	bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0);
 out:
@@ -4990,8 +4990,8 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 		 */
 		tp->t_rxtshift++;
 	}
-	if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
-		tp->t_rxtshift = TCP_MAXRXTSHIFT;
+	if (tp->t_rxtshift > V_tcp_retries) {
+		tp->t_rxtshift = V_tcp_retries;
 		KMOD_TCPSTAT_INC(tcps_timeoutdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		/* XXXGL: previously t_softerror was casted to uint16_t */
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index c9b5b937cc46..36fd5daf07dd 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -7445,7 +7445,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 	 * the idle time (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
-	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
+	if (tp->t_rxtshift >= V_tcp_retries &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
@@ -7491,7 +7491,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 			tp->t_flags &= ~TF_DELACK;
 		free(t_template, M_TEMP);
 	}
-	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
+	if (tp->t_rxtshift < V_tcp_retries)
 		tp->t_rxtshift++;
 out:
 	rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
@@ -7783,10 +7783,10 @@ rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 		 */
 		tp->t_rxtshift++;
 	}
-	if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+	if (tp->t_rxtshift > V_tcp_retries) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 drop_it:
-		tp->t_rxtshift = TCP_MAXRXTSHIFT;
+		tp->t_rxtshift = V_tcp_retries;
 		KMOD_TCPSTAT_INC(tcps_timeoutdrop);
 		/* XXXGL: previously t_softerror was casted to uint16_t */
 		MPASS(tp->t_softerror >= 0);
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index d1301c18d54f..6126d85c7565 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -201,6 +201,28 @@ static int	per_cpu_timers = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
     &per_cpu_timers , 0, "run tcp timers on all cpus");
 
+static int
+sysctl_net_inet_tcp_retries(SYSCTL_HANDLER_ARGS)
+{
+	int error, new;
+
+	new = V_tcp_retries;	/* start from the current V_tcp_retries */
+	error = sysctl_handle_int(oidp, &new, 0, req);
+	if (error == 0 && req->newptr) {	/* validate only on a write */
+		if ((new < 1) || (new > TCP_MAXRXTSHIFT))
+			error = EINVAL;	/* outside 1..TCP_MAXRXTSHIFT */
+		else
+			V_tcp_retries = new;
+	}
+	return (error);
+}
+
+VNET_DEFINE(int, tcp_retries) = TCP_MAXRXTSHIFT;
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, retries,
+    CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RW,
+    &VNET_NAME(tcp_retries), 0, sysctl_net_inet_tcp_retries, "I",
+    "maximum number of consecutive timer based retransmissions");
+
 /*
  * Map the given inp to a CPU id.
  *
@@ -492,7 +514,7 @@ tcp_timer_persist(struct tcpcb *tp)
 	 * progress.
 	 */
 	progdrop = tcp_maxunacktime_check(tp);
-	if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
+	if (progdrop || (tp->t_rxtshift >= V_tcp_retries &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
 		if (!progdrop)
@@ -555,10 +577,10 @@ tcp_timer_rexmt(struct tcpcb *tp)
 	 * or we've gone long enough without making progress, then drop
 	 * the session.
 	 */
-	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) {
-		if (tp->t_rxtshift > TCP_MAXRXTSHIFT)
+	if (++tp->t_rxtshift > V_tcp_retries || tcp_maxunacktime_check(tp)) {
+		if (tp->t_rxtshift > V_tcp_retries)
 			TCPSTAT_INC(tcps_timeoutdrop);
-		tp->t_rxtshift = TCP_MAXRXTSHIFT;
+		tp->t_rxtshift = V_tcp_retries;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		NET_EPOCH_ENTER(et);
 		tp = tcp_drop(tp, ETIMEDOUT);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 86345b2aa630..587998331fbf 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1289,6 +1289,7 @@ VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl);
 VNET_DECLARE(int, tcp_perconn_stats_enable);
 #endif /* STATS */
 VNET_DECLARE(int, tcp_recvspace);
+VNET_DECLARE(int, tcp_retries);
 VNET_DECLARE(int, tcp_sack_globalholes);
 VNET_DECLARE(int, tcp_sack_globalmaxholes);
 VNET_DECLARE(int, tcp_sack_maxholes);
@@ -1335,6 +1336,7 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
 #define	V_tcp_perconn_stats_enable	VNET(tcp_perconn_stats_enable)
 #endif /* STATS */
 #define	V_tcp_recvspace			VNET(tcp_recvspace)
+#define	V_tcp_retries			VNET(tcp_retries)
 #define	V_tcp_sack_globalholes		VNET(tcp_sack_globalholes)
 #define	V_tcp_sack_globalmaxholes	VNET(tcp_sack_globalmaxholes)
 #define	V_tcp_sack_maxholes		VNET(tcp_sack_maxholes)