git: 26cbd0028c50 - main - tcp: Rack may still calculate long RTT on persists probes.

From: Randall Stewart <rrs_at_FreeBSD.org>
Date: Thu, 11 Nov 2021 11:37:19 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=26cbd0028c508787e1d88361f345ac707acabce5

commit 26cbd0028c508787e1d88361f345ac707acabce5
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2021-11-11 11:35:51 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2021-11-11 11:35:51 +0000

    tcp: Rack may still calculate long RTT on persists probes.
    
    When a persists probe is lost, we will end up calculating a long
    RTT based on the initial probe and when the response comes from the
    second probe (or third etc). This means we have a minimum of a
confidence level of 3 on an incorrect probe. This commit will change it
    so that we have one of two options
    a) Just not count RTT of probes where we had a loss
    <or>
    b) Count them still but degrade the confidence to 0.
    
    I have set the default in this commit to just not measure them, but I am open
    to having the default be otherwise.
    
    Reviewed by: Michael Tuexen
    Sponsored by: Netflix Inc.
    Differential Revision: https://reviews.freebsd.org/D32897
---
 sys/netinet/tcp_stacks/rack.c     | 163 ++++++++++++++++++++++++++++++++------
 sys/netinet/tcp_stacks/tcp_rack.h |   4 +-
 sys/netinet/tcp_subr.c            |  25 +++++-
 3 files changed, 163 insertions(+), 29 deletions(-)

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index b1927f0a17e4..7769aa1272c0 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -205,6 +205,7 @@ static int32_t rack_hw_up_only = 1;
 static int32_t rack_stats_gets_ms_rtt = 1;
 static int32_t rack_prr_addbackmax = 2;
 static int32_t rack_do_hystart = 0;
+static int32_t rack_apply_rtt_with_reduced_conf = 0;
 
 static int32_t rack_pkt_delay = 1000;
 static int32_t rack_send_a_lot_in_prr = 1;
@@ -343,6 +344,10 @@ counter_u64_t rack_saw_enetunreach;
 counter_u64_t rack_per_timer_hole;
 counter_u64_t rack_large_ackcmp;
 counter_u64_t rack_small_ackcmp;
+counter_u64_t rack_persists_sends;
+counter_u64_t rack_persists_acks;
+counter_u64_t rack_persists_loss;
+counter_u64_t rack_persists_lost_ends;
 #ifdef INVARIANTS
 counter_u64_t rack_adjust_map_bw;
 #endif
@@ -772,6 +777,10 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 		counter_u64_zero(rack_per_timer_hole);
 		counter_u64_zero(rack_large_ackcmp);
 		counter_u64_zero(rack_small_ackcmp);
+		counter_u64_zero(rack_persists_sends);
+		counter_u64_zero(rack_persists_acks);
+		counter_u64_zero(rack_persists_loss);
+		counter_u64_zero(rack_persists_lost_ends);
 #ifdef INVARIANTS
 		counter_u64_zero(rack_adjust_map_bw);
 #endif
@@ -1412,6 +1421,11 @@ rack_init_sysctls(void)
 	    &rack_tcp_accounting, 0,
 	    "Should we turn on TCP accounting for all rack sessions?");
 #endif
+	SYSCTL_ADD_S32(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_misc),
+	    OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
+	    &rack_apply_rtt_with_reduced_conf, 0,
+	    "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
@@ -1774,6 +1788,30 @@ rack_init_sysctls(void)
 	    OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
 	    &rack_large_ackcmp,
 	    "Number of TCP connections with large mbuf's for compressed acks");
+	rack_persists_sends = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "persist_sends", CTLFLAG_RD,
+	    &rack_persists_sends,
+	    "Number of times we sent a persist probe");
+	rack_persists_acks = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "persist_acks", CTLFLAG_RD,
+	    &rack_persists_acks,
+	    "Number of times a persist probe was acked");
+	rack_persists_loss = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "persist_loss", CTLFLAG_RD,
+	    &rack_persists_loss,
+	    "Number of times we detected a lost persist probe (no ack)");
+	rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
+	    &rack_persists_lost_ends,
+	    "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
 	rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
@@ -2938,6 +2976,10 @@ rack_counter_destroy(void)
 	counter_u64_free(rack_per_timer_hole);
 	counter_u64_free(rack_large_ackcmp);
 	counter_u64_free(rack_small_ackcmp);
+	counter_u64_free(rack_persists_sends);
+	counter_u64_free(rack_persists_acks);
+	counter_u64_free(rack_persists_loss);
+	counter_u64_free(rack_persists_lost_ends);
 #ifdef INVARIANTS
 	counter_u64_free(rack_adjust_map_bw);
 #endif
@@ -5623,6 +5665,9 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 		if (rack->r_ctl.rc_went_idle_time == 0)
 			rack->r_ctl.rc_went_idle_time = 1;
 		rack_timer_cancel(tp, rack, cts, __LINE__);
+		rack->r_ctl.persist_lost_ends = 0;
+		rack->probe_not_answered = 0;
+		rack->forced_ack = 0;
 		tp->t_rxtshift = 0;
 		RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 			      rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
@@ -6494,6 +6539,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
 		tcp_set_inp_to_drop(inp, ETIMEDOUT);
+		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
 		return (1);
 	}
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
@@ -6515,6 +6561,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 		retval = 1;
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
 		goto out;
 	}
 	if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6531,6 +6578,7 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
 		goto out;
 	}
 	t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6539,7 +6587,12 @@ rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 		if (rack->forced_ack == 0) {
 			rack->forced_ack = 1;
 			rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+		} else {
+			rack->probe_not_answered = 1;
+			counter_u64_add(rack_persists_loss, 1);
+			rack->r_ctl.persist_lost_ends++;
 		}
+		counter_u64_add(rack_persists_sends, 1);
 		tcp_respond(tp, t_template->tt_ipgen,
 			    &t_template->tt_t, (struct mbuf *)NULL,
 			    tp->rcv_nxt, tp->snd_una - 1, 0);
@@ -6602,6 +6655,8 @@ rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 			if (rack->forced_ack == 0) {
 				rack->forced_ack = 1;
 				rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+			} else {
+				rack->probe_not_answered = 1;
 			}
 			tcp_respond(tp, t_template->tt_ipgen,
 			    &t_template->tt_t, (struct mbuf *)NULL,
@@ -10301,6 +10356,14 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	acked = BYTES_THIS_ACK(tp, th);
+	if (acked) {
+		/* 
+		 * Any time we move the cum-ack forward clear
+		 * keep-alive tied probe-not-answered. The
+		 * persists clears its own on entry.
+		 */
+		rack->probe_not_answered = 0;
+	}
 	KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 	KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
 	/*
@@ -13374,6 +13437,61 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent
 
 }
 
+static void
+rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
+{
+	uint32_t us_rtt;
+	/*
+	 * A persist or keep-alive was forced out, update our
+	 * min rtt time. Note now worry about lost responses.
+	 * When a subsequent keep-alive or persist times out
+	 * and forced_ack is still on, then the last probe
+	 * was not responded to. In such cases we have a
+	 * sysctl that controls the behavior. Either we apply
+	 * the rtt but with reduced confidence (0). Or we just
+	 * plain don't apply the rtt estimate. Having data flow
+	 * will clear the probe_not_answered flag i.e. cum-ack
+	 * move forward <or> exiting and reentering persists.
+	 */
+
+	rack->forced_ack = 0;
+	rack->rc_tp->t_rxtshift = 0;
+	if ((rack->rc_in_persist &&
+	     (tiwin == rack->rc_tp->snd_wnd)) ||
+	    (rack->rc_in_persist == 0)) {
+		/*
+		 * In persists only apply the RTT update if this is
+		 * a response to our window probe. And that
+		 * means the rwnd sent must match the current
+		 * snd_wnd. If it does not, then we got a
+		 * window update ack instead. For keepalive
+		 * we allow the answer no matter what the window.
+		 *
+		 * Note that if the probe_not_answered is set then
+		 * the forced_ack_ts is the oldest one i.e. the first
+		 * probe sent that might have been lost. This assures
+		 * us that if we do calculate an RTT it is longer not
+		 * some short thing.
+		 */
+		if (rack->rc_in_persist)
+			counter_u64_add(rack_persists_acks, 1);
+		us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
+		if (us_rtt == 0)
+			us_rtt = 1;
+		if (rack->probe_not_answered == 0) {
+			rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+			tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
+		} else {
+			/* We have a retransmitted probe here too */
+			if (rack_apply_rtt_with_reduced_conf) {
+				rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+				tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
+			}
+		}
+	}
+}
+
+
 static int
 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
 {
@@ -13483,7 +13601,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 		} else if (SEQ_GT(ae->ack, high_seq)) {
 			/* Case A */
 			ae->ack_val_set = ACK_CUMACK;
-		} else if (tiwin == the_win) {
+		} else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
 			/* Case D */
 			ae->ack_val_set = ACK_DUPACK;
 		} else {
@@ -13596,6 +13714,18 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 			rack_strike_dupack(rack);
 		} else if (ae->ack_val_set == ACK_RWND) {
 			/* Case C */
+			if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
+				ts.tv_sec = ae->timestamp / 1000000000;
+				ts.tv_nsec = ae->timestamp % 1000000000;
+				rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+				rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+			} else {
+				rack->r_ctl.act_rcv_time = *tv;
+			}
+			if (rack->forced_ack) {
+				rack_handle_probe_response(rack, tiwin,
+							   tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+			}
 			win_up_req = 1;
 			win_upd_ack = ae->ack;
 			win_seq = ae->seq;
@@ -13677,6 +13807,11 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 #endif
 	acked_amount = acked = (high_seq - tp->snd_una);
 	if (acked) {
+		/* 
+		 * Clear the probe not answered flag
+		 * since cum-ack moved forward.
+		 */
+		rack->probe_not_answered = 0;
 		if (rack->sack_attack_disable == 0)
 			rack_do_decay(rack);
 		if (acked >= segsiz) {
@@ -14432,31 +14567,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	}
 	rack_clear_rate_sample(rack);
 	if (rack->forced_ack) {
-		uint32_t us_rtt;
-
-		/*
-		 * A persist or keep-alive was forced out, update our
-		 * min rtt time. Note we do not worry about lost
-		 * retransmissions since KEEP-ALIVES and persists
-		 * are usually way long on times of sending (though
-		 * if we were really paranoid or worried we could
-		 * at least use timestamps if available to validate).
-		 */
-		rack->forced_ack = 0;
-		if (tiwin == tp->snd_wnd) {
-			/*
-			 * Only apply the RTT update if this is
-			 * a response to our window probe. And that
-			 * means the rwnd sent must match the current
-			 * snd_wnd. If it does not, then we got a
-			 * window update ack instead.
-			 */
-			us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
-			if (us_rtt == 0)
-				us_rtt = 1;
-			rack_apply_updated_usrtt(rack, us_rtt, us_cts);
-			tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
-		}
+		rack_handle_probe_response(rack, tiwin, us_cts);
 	}
 	/*
 	 * This is the one exception case where we set the rack state
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index 0893237e92f9..4b1f8513055d 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -496,6 +496,7 @@ struct rack_control {
 	uint32_t challenge_ack_cnt;
 	uint32_t rc_min_to;	/* Socket option value Lock(a) */
 	uint32_t rc_pkt_delay;	/* Socket option value Lock(a) */
+	uint32_t persist_lost_ends;
 	struct newreno rc_saved_beta;	/*
 					 * For newreno cc:
 					 * rc_saved_cc are the values we have had
@@ -567,7 +568,8 @@ struct tcp_rack {
 		rc_last_tlp_past_cumack: 1,
 		rc_last_sent_tlp_seq_valid: 1,
 		rc_last_sent_tlp_past_cumack: 1,
-		avail_bytes : 3;
+		probe_not_answered: 1,
+		avail_bytes : 2;
 	uint32_t rc_rack_rtt;	/* RACK-RTT Lock(a) */
 	uint16_t r_mbuf_queue : 1,	/* Do we do mbuf queue for non-paced */
 		 rtt_limit_mul : 4,	/* muliply this by low rtt */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index fab225fbc804..4805d6c80327 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1748,6 +1748,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 	struct mbuf *optm;
 	struct udphdr *uh = NULL;
 	struct tcphdr *nth;
+	struct tcp_log_buffer *lgb;
 	u_char *optp;
 #ifdef INET6
 	struct ip6_hdr *ip6;
@@ -1756,6 +1757,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 	int optlen, tlen, win, ulen;
 	bool incl_opts;
 	uint16_t port;
+	int output_ret;
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 	NET_EPOCH_ASSERT();
@@ -2086,11 +2088,26 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 	TCP_PROBE3(debug__output, tp, th, m);
 	if (flags & TH_RST)
 		TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
+	if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+		union tcp_log_stackspecific log;
+		struct timeval tv;
+
+		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+		log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
+		log.u_bbr.ininput = tp->t_inpcb->inp_in_input;
+		log.u_bbr.flex8 = 4;
+		log.u_bbr.pkts_out = tp->t_maxseg;
+		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+		log.u_bbr.delivered = 0;
+		lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+				     0, &log, false, NULL, NULL, 0, &tv);
+	} else
+		lgb = NULL;
 
 #ifdef INET6
 	if (isipv6) {
 		TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
-		(void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
+		output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
@@ -2099,9 +2116,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 #ifdef INET
 	{
 		TCP_PROBE5(send, NULL, tp, ip, tp, nth);
-		(void)ip_output(m, NULL, NULL, 0, NULL, inp);
+		output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
 	}
 #endif
+	if (lgb) {
+		lgb->tlb_errno = output_ret;
+		lgb = NULL;
+	}
 }
 
 /*