git: f3bba8cd62f2 - main - TCP without LRO doing static pacing does not always pace as expected.

From: Randall Stewart <rrs_at_FreeBSD.org>
Date: Sun, 15 Jun 2025 09:05:00 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=f3bba8cd62f2ddcacba7d9c36a2cc21b1fe11968

commit f3bba8cd62f2ddcacba7d9c36a2cc21b1fe11968
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2025-06-15 09:02:39 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2025-06-15 09:02:39 +0000

    TCP without LRO doing static pacing does not always pace as expected.
    
    cause a surprise extra packet i.e. the burst limit being overstepped and another one where
    the burst limit is under stepped. This patch fixes those errors.
    Reported by: tuexen
    Reviewed by: tuexen
    Differential Revision:  <https://reviews.freebsd.org/D50858>
---
 sys/netinet/tcp_stacks/rack.c | 56 +++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index cb4e22401c12..1d4bc3124058 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -17178,6 +17178,12 @@ rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot,
 		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
 		log.u_bbr.cwnd_gain <<= 1;
 		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
+		log.u_bbr.cwnd_gain <<= 1;
+		log.u_bbr.cwnd_gain |= rack->use_fixed_rate;
+		log.u_bbr.cwnd_gain <<= 1;
+		log.u_bbr.cwnd_gain |= rack->rc_always_pace;
+		log.u_bbr.cwnd_gain <<= 1;
+		log.u_bbr.cwnd_gain |= rack->gp_ready;
 		log.u_bbr.bbr_substate = quality;
 		log.u_bbr.bbr_state = rack->dgp_on;
 		log.u_bbr.bbr_state <<= 1;
@@ -17539,8 +17545,8 @@ rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, str
 						   rack->r_ctl.rc_last_us_rtt,
 						   88, __LINE__, NULL, gain);
 		}
-		if ((bw_est == 0) || (rate_wanted == 0) ||
-		    ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
+		if (((bw_est == 0) || (rate_wanted == 0)  || (rack->gp_ready == 0)) &&
+		    (rack->use_fixed_rate == 0)) {
 			/*
 			 * No way yet to make a b/w estimate or
 			 * our raise is set incorrectly.
@@ -19039,7 +19045,7 @@ rack_sndbuf_autoscale(struct tcp_rack *rack)
 
 static int
 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
-		 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
+		 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line)
 {
 	/*
 	 * Enter to do fast output. We are given that the sched_pin is
@@ -19212,7 +19218,7 @@ again:
 	}
 	if (rack->r_ctl.fsb.rfo_apply_push &&
 	    (len == rack->r_ctl.fsb.left_to_send)) {
-		tcp_set_flags(th, flags | TH_PUSH);
+		flags |= TH_PUSH;
 		add_flag |= RACK_HAD_PUSH;
 	}
 	if ((m->m_next == NULL) || (len <= 0)){
@@ -19391,11 +19397,11 @@ again:
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.flex5 = log.u_bbr.inflight;
 		log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
-		log.u_bbr.delivered = 0;
+		log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send;
 		log.u_bbr.rttProp = 0;
 		log.u_bbr.delRate = rack->r_must_retran;
 		log.u_bbr.delRate <<= 1;
-		log.u_bbr.pkt_epoch = __LINE__;
+		log.u_bbr.pkt_epoch = line;
 		/* For fast output no retrans so just inflight and how many mss we send */
 		log.u_bbr.flex5 = log.u_bbr.inflight;
 		log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
@@ -19468,7 +19474,7 @@ again:
 	tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
 
 	rack->forced_ack = 0;	/* If we send something zap the FA flag */
-	tot_len += len;
+	*tot_len += len;
 	if ((tp->t_flags & TF_GPUTINPROG) == 0)
 		rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
 	tp->snd_max += len;
@@ -19504,6 +19510,7 @@ again:
 	}
 	if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
 	    (max_val > len) &&
+	    (*tot_len < rack->r_ctl.rc_pace_max_segs) &&
 	    (tso == 0)) {
 		max_val -= len;
 		len = segsiz;
@@ -19515,14 +19522,14 @@ again:
 	}
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	counter_u64_add(rack_fto_send, 1);
-	slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__);
-	rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
+	slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
+	rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0);
 #ifdef TCP_ACCOUNTING
 	crtsc = get_cyclecount();
 	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 		tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
 		tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
-		tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
+		tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz);
 	}
 	sched_unpin();
 #endif
@@ -19884,20 +19891,36 @@ rack_output(struct tcpcb *tp)
 	    TCPS_HAVEESTABLISHED(tp->t_state)) {
 		rack_set_state(tp, rack);
 	}
+	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+	minseg = segsiz;
+	if (rack->r_ctl.rc_pace_max_segs == 0)
+		pace_max_seg = rack->rc_user_set_max_segs * segsiz;
+	else
+		pace_max_seg = rack->r_ctl.rc_pace_max_segs;
 	if ((rack->r_fast_output) &&
 	    (doing_tlp == 0) &&
 	    (tp->rcv_numsacks == 0)) {
 		int ret;
 
 		error = 0;
-		ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
-		if (ret >= 0)
+		ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
+		if (ret > 0)
 			return(ret);
 		else if (error) {
 			inp = rack->rc_inp;
 			so = inp->inp_socket;
 			sb = &so->so_snd;
 			goto nomore;
+		} else {
+			/* Return == 0, if there is more we can send tot_len wise fall through and send */
+			if (tot_len_this_send >= pace_max_seg) 
+				return (ret);
+#ifdef TCP_ACCOUNTING
+			/* We need to re-pin since fast_output un-pined */
+			sched_pin();
+			ts_val = get_cyclecount();
+#endif
+			/* Fall back out so we can send any more that may bring us to pace_max_seg */
 		}
 	}
 	inp = rack->rc_inp;
@@ -20004,12 +20027,6 @@ again:
 	ms_cts = tcp_tv_to_mssectick(&tv);
 	tso = 0;
 	mtu = 0;
-	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
-	minseg = segsiz;
-	if (rack->r_ctl.rc_pace_max_segs == 0)
-		pace_max_seg = rack->rc_user_set_max_segs * segsiz;
-	else
-		pace_max_seg = rack->r_ctl.rc_pace_max_segs;
 	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (rack->r_ctl.pcm_max_seg == 0)) {
 		/*
@@ -21593,7 +21610,6 @@ send:
 			flags |= TH_PUSH;
 			add_flag |= RACK_HAD_PUSH;
 		}
-
 		SOCK_SENDBUF_UNLOCK(so);
 	} else {
 		SOCK_SENDBUF_UNLOCK(so);
@@ -22536,7 +22552,7 @@ enobufs:
 					       segsiz, pace_max_seg, hw_tls, flags);
 			if (rack->r_fast_output) {
 				error = 0;
-				ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
+				ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
 				if (ret >= 0)
 					return (ret);
 			        else if (error)