git: 13c0e198ca27 - main - tcp: Fix bugs related to the PUSH bit and rack and an ack war

Randall Stewart rrs at FreeBSD.org
Tue May 25 17:25:47 UTC 2021


The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=13c0e198ca275447f9a60a03f730c38c98f19009

commit 13c0e198ca275447f9a60a03f730c38c98f19009
Author:     Randall Stewart <rrs at FreeBSD.org>
AuthorDate: 2021-05-25 17:23:31 +0000
Commit:     Randall Stewart <rrs at FreeBSD.org>
CommitDate: 2021-05-25 17:23:31 +0000

    tcp: Fix bugs related to the PUSH bit and rack and an ack war
    
    Michael's testing with UDP tunneling found an issue with the PUSH bit, which was only partly fixed
    in the last commit. The problem is that the left edge gets transmitted before the adjustments are
    done to the send_map. This means that right-edge bits (such as PUSH) must only be added if
    the entire RSM is being retransmitted.
    
    Syzkaller also continued to find a crash, for which Michael sent me the reproducer. It turns
    out that the reproducer on the default (freebsd) stack made the stack get into an ack-war with itself.
    After fixing the reference issues in rack, the same ack-war was found in rack (and bbr). Basically
    what happens is we go into the reassembly code and lose the FIN bit. The trick here is we
    should not be going into the reassembly code if tlen == 0 i.e. the peer never sent you anything.
    That then gets the proper action on the FIN bit but then you end up in LAST_ACK with no
    timers running. This is because the usrclosed function gets called and the FINs and such have
    already been exchanged. So when we should be entering FIN_WAIT2 (or even FIN_WAIT1) we get
    stuck in LAST_ACK. Fixing this means tweaking the usrclosed function so that we properly
    recognize the condition and drop into FIN_WAIT2 where a timer will allow at least TP_MAXIDLE
    before closing (to allow time for the peer to retransmit its FIN if the ack is lost). Setting the fast_finwait2
    timer can speed this up in testing.
    
    Reviewed by: mtuexen,rscheff
    Sponsored by: Netflix Inc
    Differential Revision:  https://reviews.freebsd.org/D30451
---
 sys/netinet/tcp_input.c       |  6 +++--
 sys/netinet/tcp_stacks/bbr.c  |  6 +++--
 sys/netinet/tcp_stacks/rack.c | 58 +++++++++++++++++++++++++++++--------------
 sys/netinet/tcp_usrreq.c      | 16 ++++++++++++
 4 files changed, 64 insertions(+), 22 deletions(-)

diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 18ef52959c15..b9836a137608 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -3191,8 +3191,10 @@ dodata:							/* XXX */
 			 * when trimming from the head.
 			 */
 			tcp_seq temp = save_start;
-			thflags = tcp_reass(tp, th, &temp, &tlen, m);
-			tp->t_flags |= TF_ACKNOW;
+			if (tlen) {
+				thflags = tcp_reass(tp, th, &temp, &tlen, m);
+				tp->t_flags |= TF_ACKNOW;
+			}
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 56691def6e1d..b2fc5c1f928e 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -8320,8 +8320,10 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 			 * trimming from the head.
 			 */
 			tcp_seq temp = save_start;
-			thflags = tcp_reass(tp, th, &temp, &tlen, m);
-			tp->t_flags |= TF_ACKNOW;
+			if (tlen) {
+				thflags = tcp_reass(tp, th, &temp, &tlen, m);
+				tp->t_flags |= TF_ACKNOW;
+			}
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 71970e180808..ad2c7d31d110 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -6017,7 +6017,7 @@ rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm
 	struct mbuf *m;
 	uint32_t soff;
 
-	if (src_rsm->orig_m_len != src_rsm->m->m_len) {
+	if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) {
 		/* Fix up the orig_m_len and possibly the mbuf offset */
 		rack_adjust_orig_mlen(src_rsm);
 	}
@@ -8818,21 +8818,23 @@ more:
 	rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 	rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
 	/* Now we need to move our offset forward too */
-	if (rsm->orig_m_len != rsm->m->m_len) {
+	if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) {
 		/* Fix up the orig_m_len and possibly the mbuf offset */
 		rack_adjust_orig_mlen(rsm);
 	}
 	rsm->soff += (th_ack - rsm->r_start);
 	rsm->r_start = th_ack;
 	/* Now do we need to move the mbuf fwd too? */
-	while (rsm->soff >= rsm->m->m_len) {
-		rsm->soff -= rsm->m->m_len;
-		rsm->m = rsm->m->m_next;
-		KASSERT((rsm->m != NULL),
-			(" nrsm:%p hit at soff:%u null m",
-			 rsm, rsm->soff));
-	}
-	rsm->orig_m_len = rsm->m->m_len;
+	if (rsm->m) {
+		while (rsm->soff >= rsm->m->m_len) {
+			rsm->soff -= rsm->m->m_len;
+			rsm->m = rsm->m->m_next;
+			KASSERT((rsm->m != NULL),
+				(" nrsm:%p hit at soff:%u null m",
+				 rsm, rsm->soff));
+		}
+		rsm->orig_m_len = rsm->m->m_len;
+	}
 	if (rack->app_limited_needs_set)
 		rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
 }
@@ -9655,7 +9657,7 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
 		/* Nothing outstanding */
 		return;
 	}
-	while (rsm->m == m) {
+	while (rsm->m && (rsm->m == m)) {
 		/* one to adjust */
 #ifdef INVARIANTS
 		struct mbuf *tm;
@@ -9676,10 +9678,16 @@ rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
 		}
 		rsm->m = tm;
 		rsm->soff = soff;
-		rsm->orig_m_len = rsm->m->m_len;
+		if (tm)
+			rsm->orig_m_len = rsm->m->m_len;
+		else
+			rsm->orig_m_len = 0;
 #else
 		rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
-		rsm->orig_m_len = rsm->m->m_len;
+		if (rsm->m)
+			rsm->orig_m_len = rsm->m->m_len;
+		else
+			rsm->orig_m_len = 0;
 #endif
 		rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
 			      rsm);
@@ -10058,6 +10066,7 @@ rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
 	}
 }
 
+
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
@@ -10226,9 +10235,10 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 			 * trimming from the head.
 			 */
 			tcp_seq temp = save_start;
-
-			thflags = tcp_reass(tp, th, &temp, &tlen, m);
-			tp->t_flags |= TF_ACKNOW;
+			if (tlen) {
+				thflags = tcp_reass(tp, th, &temp, &tlen, m);
+				tp->t_flags |= TF_ACKNOW;
+			}
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
@@ -12190,7 +12200,10 @@ rack_init(struct tcpcb *tp)
 		rsm->r_dupack = 0;
 		if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
 			rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
-			rsm->orig_m_len = rsm->m->m_len;
+			if (rsm->m)
+				rsm->orig_m_len = rsm->m->m_len;
+			else
+				rsm->orig_m_len = 0;
 		} else {
 			/*
 			 * This can happen if we have a stand-alone FIN or
@@ -15074,6 +15087,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 	uint32_t us_cts;
 	uint32_t if_hw_tsomaxsegcount = 0, startseq;
 	uint32_t if_hw_tsomaxsegsize;
+
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 
@@ -15183,7 +15197,15 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 	}
 	th->th_seq = htonl(rsm->r_start);
 	th->th_ack = htonl(tp->rcv_nxt);
-	if(rsm->r_flags & RACK_HAD_PUSH)
+	/*
+	 * The PUSH bit should only be applied
+	 * if the full retransmission is made. If
+	 * we are sending less than this is the
+	 * left hand edge and should not have
+	 * the PUSH bit.
+	 */
+	if ((rsm->r_flags & RACK_HAD_PUSH) &&
+	    (len == (rsm->r_end - rsm->r_start)))
 		flags |= TH_PUSH;
 	th->th_flags = flags;
 	th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 37bedc0125c9..4f418f8809a7 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -2637,6 +2637,22 @@ tcp_usrclosed(struct tcpcb *tp)
 		tcp_state_change(tp, TCPS_LAST_ACK);
 		break;
 	}
+	if ((tp->t_state == TCPS_LAST_ACK) &&
+	    (tp->t_flags & TF_SENTFIN)) {
+		/*
+		 * If we have reached LAST_ACK, and
+		 * we sent a FIN (e.g. via MSG_EOR), then
+		 * we really should move to either FIN_WAIT_1
+		 * or FIN_WAIT_2 depending on snd_max/snd_una.
+		 */
+		if (tp->snd_una == tp->snd_max) {
+			/* The FIN is acked */
+			tcp_state_change(tp, TCPS_FIN_WAIT_2);
+		} else {
+			/* The FIN is still outstanding */
+			tcp_state_change(tp, TCPS_FIN_WAIT_1);
+		}
+	}
 	if (tp->t_state >= TCPS_FIN_WAIT_2) {
 		soisdisconnected(tp->t_inpcb->inp_socket);
 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */


More information about the dev-commits-src-all mailing list