PERFORCE change 166244 for review

Andre Oppermann andre at FreeBSD.org
Sat Jul 18 20:34:33 UTC 2009


http://perforce.freebsd.org/chv.cgi?CH=166244

Change 166244 by andre at andre_t61 on 2009/07/18 20:34:11

	Update and enhance comments.
	More refined loss recovery handling.
	tcp_do_ack() now handles fast recovery entry and exit.
	Add pipe size tracking for SACK based loss recovery.
	Move limited transmit directly into tcp_output().
	Add SACK based recovery to tcp_retransmit().

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_input.c#14 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#14 edit
.. //depot/projects/tcp_new/netinet/tcp_sack.c#10 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#14 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_input.c#14 (text+ko) ====

@@ -1733,6 +1733,8 @@
 	/*
 	 * Update send SACK information and tell us how much more
 	 * data has left the network (relative to last SACK we got).
+	 *  RFC2018: section 5
+	 *  RFC3517: section 4 Update(), section 5 first sentence and (B)
 	 */
 	if ((to.to_flags & TOF_SACK) || !RB_EMPTY(&tp->snd_sackblocks))
 		sacked = tcp_sack_doack(tp, &to, th->th_ack);
@@ -1762,8 +1764,11 @@
 
 	/*
 	 * Update congestion control information.
+	 * NB: The algorithm must not increase cwnd when acked is zero.
 	 */
-	//tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked);
+	if (tp->t_phase < TP_LOSSRECOV)
+		tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked);
+
 	KASSERT(tp->snd_cwnd > tp->snd_mss,
 	    ("%s: cwnd < 1*mss after congestion control function", __func__));
 
@@ -2953,19 +2958,45 @@
 		tp->snd_dupack++;
 	else if (tp->snd_dupack > 0 && (acked > 0 || SEQ_GT(th->th_seq, tp->snd_una)))
 		tp->snd_dupack = 0;
-	
-	if (tp->snd_dupack > 0 && tp->t_phase < TP_LOSSRECOV)
+
+	/* Advance the unacknowledged pointer. */
+	if (acked > 0)
+		tp->snd_una += acked;
+
+	/* Exit loss recovery phase. */
+	if (SEQ_GEQ(tp->snd_una, tp->snd_recover)) {
+		tp->snd_pipe = 0;
+		tp->t_phase = TP_SENDING;
+		tcp_cc_post_fr(tp);
+	}
+
+	/* Enter loss recovery phase. */
+	if (tp->snd_dupack == 3 && tp->t_phase < TP_LOSSRECOV) {
+		tcp_cc_pre_fr(tp);		/* updates ssthresh */
 		tp->t_phase = TP_LOSSRECOV;
+		tp->snd_recover = tp->snd_nxt;
+		tp->snd_rxmit = tp->snd_una;
+		tp->snd_pipe = tcp_sack_pipe(tp);
+		tp->snd_cwnd = tp->snd_ssthresh;
+		tp->snd_cwnd += 3 * tp->snd_mss;
+	}
+
+	/* In loss recovery phase. */
+	if (tp->t_phase == TP_LOSSRECOV) {
+		tp->snd_pipe -= acked;
+		tp->snd_pipe -= sacked;
+		tp->snd_cwnd += tp->snd_mss;
+		if (acked) {
+			tp->snd_rxmit = tp->snd_una;
+			tp->snd_cwnd -= acked;
+			if (acked > tp->snd_mss)
+				tp->snd_cwnd += tp->snd_mss;
+		}
+	}
 
 	KASSERT(SEQ_LT(tp->snd_una, tp->snd_nxt) || tp->snd_dupack == 0,
 	    ("%s: snd_dupack > 0 but snd_una == snd_nxt", __func__));
 
-	/*
-	 * Advance the unacknowledged pointer.
-	 */
-	if (acked > 0)
-		tp->snd_una += acked;
-
 	KASSERT(tp->snd_una == tp->snd_nxt || tcp_timer_active(tp, TT_REXMT),
 	    ("%s: outstanding data but REXMT timer not active", __func__));
 

==== //depot/projects/tcp_new/netinet/tcp_output.c#14 (text+ko) ====

@@ -107,7 +107,7 @@
 		    int optlen, int rwin, int flags);
 static int	tcp_retransmit(struct tcpcb *tp, struct socket *so,
 		    struct tcpopt *to, u_char *opt, int *len,
-		    int optlen, int rwin, int dlen, int flags);
+		    int optlen, int rwin, int dlen, int slen, int flags);
 static int	tcp_send_segments(struct tcpcb *tp, struct tcphdr *ths,
 		    u_char *opt, int off, int *olen, int optlen);
 static u_int	tcp_rcv_wnd(struct tcpcb *tp, struct socket *so);
@@ -146,7 +146,7 @@
 {
 	int flags, error, optlen = 0;
 	tcp_win len;
-	int duna, swnd, cwnd, dlen, inflight, rwin;
+	int duna, swnd, cwnd, dlen, slen, inflight, rwin;
 	int tcp_min_idle = 1;		/* XXXAO */
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
@@ -196,6 +196,7 @@
 	 * swnd = remaining space in send window as advertised by remote end
 	 * cwnd = congestion window, remaing amount of data that can be in flight unacknowledged
 	 * dlen = remaing amount of data in send buffer available for sending
+	 * slen = available amount of data that fits into send window
 	 * len = amount of data we have *and* can send righ now
 	 *
 	 *             <- duna -><-       swnd       ->
@@ -206,23 +207,12 @@
 	 *          snd_una  snd_nxt
 	 *
 	 */
-	duna = SEQ_DELTA(tp->snd_nxt, tp->snd_una);
+	duna = SEQ_DELTA(tp->snd_una, tp->snd_nxt);
 	swnd = imax(0, tp->snd_wnd - duna);
 	cwnd = imax(0, tp->snd_cwnd - duna);
-	dlen = min(so->so_snd.sb_cc - duna, swnd);
-	len = min(dlen, cwnd);
-
-	/*
-	 * XXXAO: todo token bucket, mss sized
-	 * Retransmits should not fall under pacing limit
-	 * and neither ACKs, window updates, etc. if there
-	 * is no data pending.
-	 */
-	if (len > 0 && (tp->t_flags & TF_PACE)) {
-		len = tcp_snd_pace(tp, len);
-		if (len == 0)
-			return (0);		/* next token is pending */
-	}
+	dlen = so->so_snd.sb_cc - duna;
+	slen = min(dlen, swnd);
+	len = min(slen, cwnd);
 
 	/*
 	 * Conservative approximation of data still travelling in the network.
@@ -259,12 +249,17 @@
 		}
 		break;
 	case TP_SENDING:
+		/*
+		 * Limited transmit: transmit new data upon the arrival of the
+		 * first two consecutive duplicate ACKs.
+		 *  RFC3042: section 2
+		 */
+		if (tp->snd_dupack > 0 && dlen > len && cwnd < tp->snd_mss)
+			len = min(slen, tp->snd_mss);	/* up to one mss above cwnd */
 		break;
 	case TP_LOSSRECOV:
 	case TP_REXMT:
-		error = tcp_retransmit(tp, so, &to, &opt[0], &len, optlen, rwin, dlen, flags);
-		if (len == 0)
-			return (0);
+		error = tcp_retransmit(tp, so, &to, &opt[0], &len, optlen, rwin, dlen, slen, flags);
 		break;
 	case TP_PERSIST:
 		/*
@@ -279,8 +274,8 @@
 		}
 		if (swnd == 0 && duna > tp->snd_wnd) {
 			/*
-			 * Window shrank
-			 * after we sent into it.  If window shrank to 0,
+			 * Window shrank after we sent into it.
+			 * If window shrank to 0,
 			 * cancel pending retransmit, pull snd_nxt back
 			 * to (closed) window, and set the persist timer
 			 * if it isn't already going.  If the window didn't
@@ -298,6 +293,18 @@
 	}
 
 	/*
+	 * XXXAO: todo token bucket, mss sized
+	 * Retransmits should not fall under pacing limit
+	 * and neither ACKs, window updates, etc. if there
+	 * is no data pending.
+	 */
+	if (len > 0 && (tp->t_flags & TF_PACE)) {
+		len = tcp_snd_pace(tp, len);
+		if (len == 0)
+			return (0);		/* next token is pending */
+	}
+
+	/*
 	 * Send out a SYN immediatly.
 	 */
 	if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN))
@@ -766,9 +773,9 @@
  */
 static int
 tcp_retransmit(struct tcpcb *tp, struct socket *so, struct tcpopt *to,
-    u_char *opt, int *len, int optlen, int rwin, int dlen, int flags)
+    u_char *opt, int *len, int optlen, int rwin, int dlen, int slen, int flags)
 {
-	int error, off, rlen = 0;
+	int error = 0, off, rlen = 0, rxmit;
 	struct tcphdr ths, *th = &ths;
 
 	/*
@@ -793,62 +800,73 @@
 	 *  5. TCP congestion window validation RFC2861
 	 */
 
-	/*
-	 * Limited transmit: transmit new data upon the arrival of the
-	 * first two consecutive duplicate ACKs.
-	 *  RFC3042: section 2
-	 */
-	if (tp->snd_dupack < tcp_dupthresh && dlen > *len) {
-		*len = min(dlen, tp->snd_mss);	/* up to one mss above cwnd */
-		return (0);
-	}
+	do {
+		/* Calculate amount of data we may inject into the pipe (C). */
+		rxmit = imax(0, tp->snd_cwnd - tp->snd_pipe);
 
-	/*
-	 * Remember the highest byte sent yet
-	 * and set snd_rxmit to snd_una.
-	 */
-	if (tp->snd_dupack == tcp_dupthresh) {
-		tp->snd_recover = tp->snd_nxt;
-		tp->snd_rxmit = tp->snd_una;
-		rlen = tcp_sack_firsthole(tp, &rexmit);
-	} else {
-		rlen = tcp_sack_nextseg(tp, &tp->snd_rexmit, dlen);
-	}
-
-	if (rlen == 0)
-		if (dlen)
-			*len = dlen;	/* XXXAO: pipe! */
+		if (!RB_EMPTY(&tp->snd_sackblocks)) {
+			/*
+			 * Get the amount of consecutive data for retransmit.
+			 * (C.1) modulo (C.3)
+			 */
+			if (tp->snd_rxmit == tp->snd_una)
+				rlen = tcp_sack_firsthole(tp, &rexmit);
+			else
+				rlen = tcp_sack_nextseg(tp, &tp->snd_rexmit, slen);
+			/*
+			 * If we have nothing to retransmit, see if we can
+			 * send some new data.
+			 * (C.3)
+			 */
+			if (rlen == 0)
+				if (slen > 0 && (rxmit >= tp->snd_mss ||
+				    (rxmit >= slen && dlen == slen))
+					*len = min(slen, rxmit);
+				else
+					*len = 0;
+				break;
+			}
+			/*
+			 * Retransmit what we've got.
+			 * (C.1)
+			 */
+			if (rxmit >= rlen || (rlen > rxmit && rxmit > tp->snd_mss))
+				rlen = min(rlen, pipe);
+			else
+				break;
+		} else if (tp->snd_rexmit == tp->snd_una)
+			rlen = min(tp->snd_mss, SEQ_DELTA(tp->snd_una, tp->snd_nxt));
 		else
-			*len = 0;
-		return (0);
-	} else
-		rlen = min(rlen, pipe);	/* XXXAO: pipe! */
+			break;
 
+		/*
+		 * Fill in headers.
+		 */
+		th->th_win = (u_short)rwin;
+		th->th_seq = tp->snd_rxmit;
+		th->th_flags = flags;
+		th->th_ack = tp->rcv_nxt;
 
-	/*
-	 * Fill in headers.
-	 */
-	th->th_win = (u_short)rwin;
-	th->th_seq = tp->snd_rxmit;
-	th->th_flags = flags;
-	th->th_ack = tp->rcv_nxt;
+		/*
+		 * If resending a SYN or FIN, be sure NOT to use a new sequence number.
+		 */
+		if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN))
+ 			th->th_seq--;
+		if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
+		    th->th_seq == tp->snd_nxt)
+			th->th_seq--;
+		
+		SOCKBUF_LOCK(&so->so_snd);
+		off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
+		error = tcp_send_segments(tp, &ths, opt, off, &rlen, optlen);
+		SOCKBUF_UNLOCK(&so->so_snd);
 
-	/*
-	 * If resending a SYN or FIN, be sure NOT to use a new sequence number.
-	 */
-	if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN))
-		th->th_seq--;
-	if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
-	    th->th_seq == tp->snd_nxt)
-		th->th_seq--;
-	
-	SOCKBUF_LOCK(&so->so_snd);
-	off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
-	error = tcp_send_segments(tp, &ths, opt, off, &rlen, optlen);
-	SOCKBUF_UNLOCK(&so->so_snd);
+		/* Start from here the next time. */
+		tp->snd_rxmit += rlen;
+		/* Increase amount of data in the 'pipe' (C.4). */
+		tp->snd_pipe += rlen;
 
-	/* Start from here the next time. */
-	tp->snd_rxmit += rlen;
+	} while (error == 0);
 
 	return (error);
 }

==== //depot/projects/tcp_new/netinet/tcp_sack.c#10 (text+ko) ====

@@ -254,9 +254,10 @@
 				/*
 				 * D-SACK, was a duplicate retransmit.
 				 *  RFC2883: section 5
-				 * XXXAO: Adjust pipe.
+				 * XXXAO: Adjust pipe for data that has left the network.
 				 */
 				if (i == 0 && SEQ_DELTA(sack.tsb_blk.start, sack.tsb_blk.end) <= tp->snd_mss) {
+					tp->snd_pipe -= SEQ_DELTA(sack.tsb_blk.start, sack.tsb_blk.end);
 					//TCPSTAT_INC();
 				}
 				continue;
@@ -391,6 +392,34 @@
 	return (len);
 }
 
+/*
+ * Estimate how much sequence space (bytes, not segments) is in the 'pipe'.
+ * Instead of walking from snd_una up to snd_nxt, visit the SACK blocks from
+ * the highest downwards, summing the gap above each; the walk is cut short
+ * by the limits below.  Only valid when snd_una == snd_rxmit.
+ */
+int
+tcp_sack_pipe(struct tcpcb *tp)
+{
+	int pipe = 0, blocks = 0, sacked = 0;	/* last two only bound the walk */
+	tcp_seq prev;		/* lower edge of the range visited so far */
+	struct tcp_sack_block *tsb;
+
+	prev = tp->snd_nxt;
+
+	RB_FOREACH_REVERSE(tsb, tcp_sackblocks, &tp->snd_sackblocks) {
+		pipe += SEQ_DELTA(tsb->tsb_blk.end, prev);	/* gap above block */
+		sacked += SEQ_DELTA(tsb->tsb_blk.start, tsb->tsb_blk.end);
+		if (sacked > 3 * tp->snd_mss)	/* more than 3 MSS SACKed */
+			break;
+		if (blocks++ > 2)	/* stops on the fourth block visited */
+			break;
+		prev = tsb->tsb_blk.start;
+	}
+
+	return (pipe);	/* NOTE(review): hole below the last block is not added */
+}
+
 #ifdef DDB
 static void
 db_print_sackblocks(struct tcpcb *tp)

==== //depot/projects/tcp_new/netinet/tcp_var.h#14 (text+ko) ====

@@ -173,7 +173,6 @@
 	tcp_seq	snd_una;		/* send unacknowledged */
 	tcp_seq	snd_nxt;		/* send next */
 	tcp_seq	snd_rxmit;		/* from where to retransmit */
-	tcp_seq snd_inflight;		/* estimate of data currently in the network (~SACK) */
 	u_int	snd_maxburst;		/* maximum send burst length */
 
 	tcp_seq	snd_up;			/* send urgent pointer */
@@ -188,6 +187,7 @@
 
 	int	snd_dupack;		/* number of duplicate ACK's reveived */
 	tcp_seq	snd_recover;		/* fast retransmit recover */
+	int	snd_pipe;		/* bytes assumed to be inflight in the pipe */
 	int	snd_abcack;		/* count the ack'ed data for ABC */
 
 	tcp_seq	snd_rtseq;		/* seq# of current RTT measurement */
@@ -647,6 +647,7 @@
 void	 tcp_sack_flush(struct tcpcb *);
 void	 tcp_sack_init(void);
 int	 tcp_sack_nextseg(struct tcpcb *, tcp_seq *);
+int 	 tcp_sack_pipe(struct tcpcb *tp);
 
 int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 u_long	 tcp_seq_subtract(u_long, u_long );


More information about the p4-projects mailing list