PERFORCE change 166244 for review
Andre Oppermann
andre at FreeBSD.org
Sat Jul 18 20:34:33 UTC 2009
http://perforce.freebsd.org/chv.cgi?CH=166244
Change 166244 by andre at andre_t61 on 2009/07/18 20:34:11
Update and enhance comments.
More refined loss recovery handling.
tcp_do_ack() now handles fast recovery entry and exit.
Add pipe size tracking for SACK based loss recovery.
Move limited transmit directly into tcp_output().
Add SACK based recovery to tcp_retransmit().
Affected files ...
.. //depot/projects/tcp_new/netinet/tcp_input.c#14 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#14 edit
.. //depot/projects/tcp_new/netinet/tcp_sack.c#10 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#14 edit
Differences ...
==== //depot/projects/tcp_new/netinet/tcp_input.c#14 (text+ko) ====
@@ -1733,6 +1733,8 @@
/*
* Update send SACK information and tell us how much more
* data has left the network (relative to last SACK we got).
+ * RFC2018: section 5
+ * RFC3517: section 4 Update(), section 5 first sentence and (B)
*/
if ((to.to_flags & TOF_SACK) || !RB_EMPTY(&tp->snd_sackblocks))
sacked = tcp_sack_doack(tp, &to, th->th_ack);
@@ -1762,8 +1764,11 @@
/*
* Update congestion control information.
+ * NB: The algorithm must not increase cwnd when acked is zero.
*/
- //tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked);
+ if (tp->t_phase < TP_LOSSRECOV)
+ tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked);
+
KASSERT(tp->snd_cwnd > tp->snd_mss,
("%s: cwnd < 1*mss after congestion control function", __func__));
@@ -2953,19 +2958,45 @@
tp->snd_dupack++;
else if (tp->snd_dupack > 0 && (acked > 0 || SEQ_GT(th->th_seq, tp->snd_una)))
tp->snd_dupack = 0;
-
- if (tp->snd_dupack > 0 && tp->t_phase < TP_LOSSRECOV)
+
+ /* Advance the unacknowledged pointer. */
+ if (acked > 0)
+ tp->snd_una += acked;
+
+ /* Exit loss recovery phase. */
+ if (SEQ_GEQ(tp->snd_una, tp->snd_recover)) {
+ tp->snd_pipe = 0;
+ tp->t_phase = TP_SENDING;
+ tcp_cc_post_fr(tp);
+ }
+
+ /* Enter loss recovery phase. */
+ if (tp->snd_dupack == 3 && tp->t_phase < TP_LOSSRECOV) {
+ tcp_cc_pre_fr(tp); /* updates ssthresh */
tp->t_phase = TP_LOSSRECOV;
+ tp->snd_recover = tp->snd_nxt;
+ tp->snd_rxmit = tp->snd_una;
+ tp->snd_pipe = tcp_sack_pipe(tp);
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_cwnd += 3 * tp->snd_mss;
+ }
+
+ /* In loss recovery phase. */
+ if (tp->t_phase == TP_LOSSRECOV) {
+ tp->snd_pipe -= acked;
+ tp->snd_pipe -= sacked;
+ tp->snd_cwnd += tp->snd_mss;
+ if (acked) {
+ tp->snd_rxmit = tp->snd_una;
+ tp->snd_cwnd -= acked;
+ if (acked > tp->snd_mss)
+ tp->snd_cwnd += tp->snd_mss;
+ }
+ }
KASSERT(SEQ_LT(tp->snd_una, tp->snd_nxt) || tp->snd_dupack == 0,
("%s: snd_dupack > 0 but snd_una == snd_nxt", __func__));
- /*
- * Advance the unacknowledged pointer.
- */
- if (acked > 0)
- tp->snd_una += acked;
-
KASSERT(tp->snd_una == tp->snd_nxt || tcp_timer_active(tp, TT_REXMT),
("%s: outstanding data but REXMT timer not active", __func__));
==== //depot/projects/tcp_new/netinet/tcp_output.c#14 (text+ko) ====
@@ -107,7 +107,7 @@
int optlen, int rwin, int flags);
static int tcp_retransmit(struct tcpcb *tp, struct socket *so,
struct tcpopt *to, u_char *opt, int *len,
- int optlen, int rwin, int dlen, int flags);
+ int optlen, int rwin, int dlen, int slen, int flags);
static int tcp_send_segments(struct tcpcb *tp, struct tcphdr *ths,
u_char *opt, int off, int *olen, int optlen);
static u_int tcp_rcv_wnd(struct tcpcb *tp, struct socket *so);
@@ -146,7 +146,7 @@
{
int flags, error, optlen = 0;
tcp_win len;
- int duna, swnd, cwnd, dlen, inflight, rwin;
+ int duna, swnd, cwnd, dlen, slen, inflight, rwin;
int tcp_min_idle = 1; /* XXXAO */
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
@@ -196,6 +196,7 @@
* swnd = remaining space in send window as advertised by remote end
* cwnd = congestion window, remaing amount of data that can be in flight unacknowledged
* dlen = remaing amount of data in send buffer available for sending
+ * slen = available amount of data that fits into send window
* len = amount of data we have *and* can send righ now
*
* <- duna -><- swnd ->
@@ -206,23 +207,12 @@
* snd_una snd_nxt
*
*/
- duna = SEQ_DELTA(tp->snd_nxt, tp->snd_una);
+ duna = SEQ_DELTA(tp->snd_una, tp->snd_nxt);
swnd = imax(0, tp->snd_wnd - duna);
cwnd = imax(0, tp->snd_cwnd - duna);
- dlen = min(so->so_snd.sb_cc - duna, swnd);
- len = min(dlen, cwnd);
-
- /*
- * XXXAO: todo token bucket, mss sized
- * Retransmits should not fall under pacing limit
- * and neither ACKs, window updates, etc. if there
- * is no data pending.
- */
- if (len > 0 && (tp->t_flags & TF_PACE)) {
- len = tcp_snd_pace(tp, len);
- if (len == 0)
- return (0); /* next token is pending */
- }
+ dlen = so->so_snd.sb_cc - duna;
+ slen = min(dlen, swnd);
+ len = min(slen, cwnd);
/*
* Conservative approximation of data still travelling in the network.
@@ -259,12 +249,17 @@
}
break;
case TP_SENDING:
+ /*
+ * Limited transmit: transmit new data upon the arrival of the
+ * first two consecutive duplicate ACKs.
+ * RFC3042: section 2
+ */
+ if (tp->snd_dupack > 0 && dlen > len && cwnd < tp->snd_mss)
+ len = min(slen, tp->snd_mss); /* up to one mss above cwnd */
break;
case TP_LOSSRECOV:
case TP_REXMT:
- error = tcp_retransmit(tp, so, &to, &opt[0], &len, optlen, rwin, dlen, flags);
- if (len == 0)
- return (0);
+ error = tcp_retransmit(tp, so, &to, &opt[0], &len, optlen, rwin, dlen, slen, flags);
break;
case TP_PERSIST:
/*
@@ -279,8 +274,8 @@
}
if (swnd == 0 && duna > tp->snd_wnd) {
/*
- * Window shrank
- * after we sent into it. If window shrank to 0,
+ * Window shrank after we sent into it.
+ * If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
@@ -298,6 +293,18 @@
}
/*
+ * XXXAO: todo token bucket, mss sized
+ * Retransmits should not fall under pacing limit
+ * and neither ACKs, window updates, etc. if there
+ * is no data pending.
+ */
+ if (len > 0 && (tp->t_flags & TF_PACE)) {
+ len = tcp_snd_pace(tp, len);
+ if (len == 0)
+ return (0); /* next token is pending */
+ }
+
+ /*
* Send out a SYN immediatly.
*/
if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN))
@@ -766,9 +773,9 @@
*/
static int
tcp_retransmit(struct tcpcb *tp, struct socket *so, struct tcpopt *to,
- u_char *opt, int *len, int optlen, int rwin, int dlen, int flags)
+ u_char *opt, int *len, int optlen, int rwin, int dlen, int slen, int flags)
{
- int error, off, rlen = 0;
+ int error = 0, off, rlen = 0, rxmit;
struct tcphdr ths, *th = &ths;
/*
@@ -793,62 +800,73 @@
* 5. TCP congestion window validation RFC2861
*/
- /*
- * Limited transmit: transmit new data upon the arrival of the
- * first two consecutive duplicate ACKs.
- * RFC3042: section 2
- */
- if (tp->snd_dupack < tcp_dupthresh && dlen > *len) {
- *len = min(dlen, tp->snd_mss); /* up to one mss above cwnd */
- return (0);
- }
+ do {
+ /* Calculate amount of data we may inject into the pipe (C). */
+ rxmit = imax(0, tp->snd_cwnd - tp->snd_pipe);
- /*
- * Remember the highest byte sent yet
- * and set snd_rxmit to snd_una.
- */
- if (tp->snd_dupack == tcp_dupthresh) {
- tp->snd_recover = tp->snd_nxt;
- tp->snd_rxmit = tp->snd_una;
- rlen = tcp_sack_firsthole(tp, &rexmit);
- } else {
- rlen = tcp_sack_nextseg(tp, &tp->snd_rexmit, dlen);
- }
-
- if (rlen == 0)
- if (dlen)
- *len = dlen; /* XXXAO: pipe! */
+ if (!RB_EMPTY(&tp->snd_sackblocks)) {
+ /*
+ * Get the amount of consecutive data for retransmit.
+ * (C.1) modulo (C.3)
+ */
+ if (tp->snd_rxmit == tp->snd_una)
+ rlen = tcp_sack_firsthole(tp, &rexmit);
+ else
+ rlen = tcp_sack_nextseg(tp, &tp->snd_rexmit, slen);
+ /*
+ * If we have nothing to retransmit, see if we can
+ * send some new data.
+ * (C.3)
+ */
+ if (rlen == 0)
+ if (slen > 0 && (rxmit >= tp->snd_mss ||
+ (rxmit >= slen && dlen == slen))
+ *len = min(slen, rxmit);
+ else
+ *len = 0;
+ break;
+ }
+ /*
+ * Retransmit what we've got.
+ * (C.1)
+ */
+ if (rxmit >= rlen || (rlen > rxmit && rxmit > tp->snd_mss))
+ rlen = min(rlen, pipe);
+ else
+ break;
+ } else if (tp->snd_rexmit == tp->snd_una)
+ rlen = min(tp->snd_mss, SEQ_DELTA(tp->snd_una, tp->snd_nxt));
else
- *len = 0;
- return (0);
- } else
- rlen = min(rlen, pipe); /* XXXAO: pipe! */
+ break;
+ /*
+ * Fill in headers.
+ */
+ th->th_win = (u_short)rwin;
+ th->th_seq = tp->snd_rxmit;
+ th->th_flags = flags;
+ th->th_ack = tp->rcv_nxt;
- /*
- * Fill in headers.
- */
- th->th_win = (u_short)rwin;
- th->th_seq = tp->snd_rxmit;
- th->th_flags = flags;
- th->th_ack = tp->rcv_nxt;
+ /*
+ * If resending a SYN or FIN, be sure NOT to use a new sequence number.
+ */
+ if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN))
+ th->th_seq--;
+ if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
+ th->th_seq == tp->snd_nxt)
+ th->th_seq--;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
+ error = tcp_send_segments(tp, &ths, opt, off, &rlen, optlen);
+ SOCKBUF_UNLOCK(&so->so_snd);
- /*
- * If resending a SYN or FIN, be sure NOT to use a new sequence number.
- */
- if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN))
- th->th_seq--;
- if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
- th->th_seq == tp->snd_nxt)
- th->th_seq--;
-
- SOCKBUF_LOCK(&so->so_snd);
- off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
- error = tcp_send_segments(tp, &ths, opt, off, &rlen, optlen);
- SOCKBUF_UNLOCK(&so->so_snd);
+ /* Start from here the next time. */
+ tp->snd_rxmit += rlen;
+ /* Increase amount of data in the 'pipe' (C.4). */
+ tp->snd_pipe += rlen;
- /* Start from here the next time. */
- tp->snd_rxmit += rlen;
+ } while (error == 0);
return (error);
}
==== //depot/projects/tcp_new/netinet/tcp_sack.c#10 (text+ko) ====
@@ -254,9 +254,10 @@
/*
* D-SACK, was a duplicate retransmit.
* RFC2883: section 5
- * XXXAO: Adjust pipe.
+ * XXXAO: Adjust pipe for data that has left the network.
*/
if (i == 0 && SEQ_DELTA(sack.tsb_blk.start, sack.tsb_blk.end) <= tp->snd_mss) {
+ tp->snd_pipe -= SEQ_DELTA(sack.tsb_blk.start, sack.tsb_blk.end);
//TCPSTAT_INC();
}
continue;
@@ -391,6 +392,34 @@
return (len);
}
+/*
+ * Calculate the number of bytes assumed to be in the 'pipe'.
+ * Instead of counting all the bytes from snd_una up to snd_nxt
+ * we start from the highest sackblock and work our way down.
+ * The calculated number is only valid when snd_una == snd_rxmit.
+ */
+int
+tcp_sack_pipe(struct tcpcb *tp)
+{
+ int pipe = 0, blocks = 0, sacked = 0;
+ tcp_seq prev;
+ struct tcp_sack_block *tsb;
+
+ prev = tp->snd_nxt;
+
+ RB_FOREACH_REVERSE(tsb, tcp_sackblocks, &tp->snd_sackblocks) {
+ pipe += SEQ_DELTA(tsb->tsb_blk.end, prev);
+ sacked += SEQ_DELTA(tsb->tsb_blk.start, tsb->tsb_blk.end);
+ if (sacked > 3 * tp->snd_mss)
+ break;
+ if (blocks++ > 2)
+ break;
+ prev = tsb->tsb_blk.start;
+ }
+
+ return (pipe);
+}
+
#ifdef DDB
static void
db_print_sackblocks(struct tcpcb *tp)
==== //depot/projects/tcp_new/netinet/tcp_var.h#14 (text+ko) ====
@@ -173,7 +173,6 @@
tcp_seq snd_una; /* send unacknowledged */
tcp_seq snd_nxt; /* send next */
tcp_seq snd_rxmit; /* from where to retransmit */
- tcp_seq snd_inflight; /* estimate of data currently in the network (~SACK) */
u_int snd_maxburst; /* maximum send burst length */
tcp_seq snd_up; /* send urgent pointer */
@@ -188,6 +187,7 @@
int snd_dupack; /* number of duplicate ACK's reveived */
tcp_seq snd_recover; /* fast retransmit recover */
+ int snd_pipe; /* bytes assumed to be inflight in the pipe */
int snd_abcack; /* count the ack'ed data for ABC */
tcp_seq snd_rtseq; /* seq# of current RTT measurement */
@@ -647,6 +647,7 @@
void tcp_sack_flush(struct tcpcb *);
void tcp_sack_init(void);
int tcp_sack_nextseg(struct tcpcb *, tcp_seq *);
+int tcp_sack_pipe(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);
u_long tcp_seq_subtract(u_long, u_long );
More information about the p4-projects
mailing list