PERFORCE change 165494 for review
Andre Oppermann
andre at FreeBSD.org
Wed Jul 1 08:00:46 UTC 2009
http://perforce.freebsd.org/chv.cgi?CH=165494
Change 165494 by andre at andre_t61 on 2009/07/01 08:00:14
Dump of WIP from my laptop. Still much shuffling. The picture is refining a bit
all the time, like a progressive JPEG.
Affected files ...
.. //depot/projects/tcp_new/netinet/tcp_output.c#9 edit
.. //depot/projects/tcp_new/netinet/tcp_syncache.c#4 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#9 edit
Differences ...
==== //depot/projects/tcp_new/netinet/tcp_output.c#9 (text+ko) ====
@@ -100,6 +100,20 @@
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+static int
+tcp_send(struct tcpcb *tp, struct tcpopt *to, int len, int rwin, int flags);
+static int
+tcp_retransmit(struct tcpcb *tp, int *len);
+static int
+tcp_send_segments(struct tcpcb *tp, struct tcphdr *ths, struct tcpopt *opt,
+ int off, int *olen, int optlen);
+static u_int
+tcp_rcv_wnd(struct tcpcb *tp, struct socket *so);
+static void
+tcp_snd_pace(struct tcpcp *tp);
+static void
+tcp_options(struct tcpcb *tp, struct tcpopt *to, int flags);
+
/*
* Tcp output routine: figure out what should be sent and send it.
*
@@ -157,30 +171,6 @@
flags = tcp_outflags[tp->t_state];
/*
- * Determine our current receive window.
- * This value is used for the window field in the TCP
- * header and to determine whether we have to send a
- * window update.
- *
- * NB: rwin is already downscaled.
- */
- rwin = tcp_rcv_wnd(tp, so);
-
- /*
- * We have been idle for "a while" and no acks are
- * expected to clock out any data we send --
- * slow start to get ack "clock" running again.
- * RFC2581: Restart window.
- *
- * XXXAO: Use a decaying algorithm. It's not useful
- * to have cwnd to drop of a cliff. See RFC2861.
- */
- if (tp->snd_nxt == tp->snd_una &&
- (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) {
- tp->snd_cwnd = tcp_init_cwnd(tp);
- }
-
- /*
* Determine length of data that should be transmitted, if there
* is some data to send, then transmit; otherwise, investigate further.
*
@@ -230,18 +220,74 @@
return (0); /* next token is pending */
}
+ /*
+ * Conservative approximation of data still travelling in the network.
+ */
inflight = duna - tp->snd_sacked;
+ /*
+ * Determine our current receive window.
+ * This value is used for the window field in the TCP
+ * header and to determine whether we have to send a
+ * window update.
+ *
+ * NB: rwin is already downscaled.
+ */
+ rwin = tcp_rcv_wnd(tp, so);
+
+ /*
+ * Act based on the phase we are in.
+ */
switch (tp->t_phase) {
case TP_IDLE:
- case TP_SLOWSTART:
- case TP_CONGAVOID:
+ /*
+ * We have been idle for "a while" and no acks are
+ * expected to clock out any data we send --
+ * slow start to get ack "clock" running again.
+ * RFC2581: Restart window.
+ *
+ * XXXAO: Use a decaying algorithm. It's not useful
+ * to have cwnd to drop of a cliff. See RFC2861.
+ */
+ if (tp->snd_nxt == tp->snd_una &&
+ (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) {
+ tp->snd_cwnd = tcp_init_cwnd(tp);
+ }
+ break;
+ case TP_SENDING:
+ break;
case TP_LOSSRECOV:
+ case TP_REXMT:
tcp_retransmit(tp, &len);
if (len = 0)
return (0);
- case TP_LOSSREXMT:
+ break;
case TP_PERSIST:
+ /*
+ * Persistent mode.
+ * Send out probe byte if there is data available.
+ * RFC793: section 3.7, page 42-44
+ * RFC1122: section 4.2.2.17
+ */
+ if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) {
+ len = 1;
+ goto send;
+ }
+ if (swnd == 0 && duna > tp->snd_wnd) {
+ /*
+ * Window shrank
+ * after we sent into it. If window shrank to 0,
+ * cancel pending retransmit, pull snd_nxt back
+ * to (closed) window, and set the persist timer
+ * if it isn't already going. If the window didn't
+ * close completely, just wait for an ACK.
+ */
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rxtshift = 0;
+ if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_setpersist(tp);
+ }
+ break;
case TP_RETRY:
case TP_URGENT:
break;
@@ -290,7 +336,7 @@
* a duplicate ACK (if the ack value didn't move forward). The
* question whether the other implementations see it the same way.
*/
- if ((tp->t_flags & TF_DUPACK) && tp->snd_dupack > 0) {
+ if ((tp->t_flags & TF_DUPACK) && tp->rcv_trqlen > 0) {
if (!(tp->t_flags & TF_SACK_PERMIT))
len = 0;
goto send;
@@ -318,26 +364,30 @@
* b) silly window syndrome: buffer almost full
*
* Quoting Nagle:
- * <<The concept behind delayed ACKs is to bet, when receiving some data from the net,
- * that the local application will send a reply very soon. So there's no need to
- * send an ACK immediately; the ACK can be piggybacked on the next data going the
- * other way. If that doesn't happen, after a 500ms delay, an ACK is sent anyway.
- * The concept behind the Nagle algorithm is that if the sender is doing very tiny
- * writes (like single bytes, from Telnet), there's no reason to have more than one
- * packet outstanding on the connection. This prevents slow links from choking with
- * huge numbers of outstanding tinygrams.
- * Both are reasonable. But they interact badly in the case where an application does
- * two or more small writes to a socket, then waits for a reply. (X-Windows is notorious
- * for this.) When an application does that, the first write results in an immediate
- * packet send. The second write is held up until the first is acknowledged. But because
- * of the delayed ACK strategy, that acknowledgement is held up for 500ms. This adds
- * 500ms of latency to the transaction, even on a LAN.
- * The real problem is that 500ms unconditional delay. (Why 500ms? That was a reasonable
- * response time for a time-sharing system of the 1980s.) As mentioned above, delaying
- * an ACK is a bet that the local application will reply to the data just received.
- * Some apps, like character echo in Telnet servers, do respond every time. Others,
- * like X-Windows "clients" (really servers, but X is backwards about this), only reply
- * some of the time.>>
+ * <<The concept behind delayed ACKs is to bet, when receiving some
+ * data from the net, that the local application will send a reply
+ * very soon. So there's no need to send an ACK immediately;
+ * the ACK can be piggybacked on the next data going the other way.
+ * If that doesn't happen, after a 500ms delay, an ACK is sent anyway.
+ * The concept behind the Nagle algorithm is that if the sender is
+ * doing very tiny writes (like single bytes, from Telnet), there's
+ * no reason to have more than one packet outstanding on the connection.
+ * This prevents slow links from choking with huge numbers of outstanding
+ * tinygrams. Both are reasonable. But they interact badly in the case
+ * where an application does two or more small writes to a socket, then
+ * waits for a reply. (X-Windows is notorious for this.) When an
+ * application does that, the first write results in an immediate
+ * packet send. The second write is held up until the first is
+ * acknowledged. But because of the delayed ACK strategy, that
+ * acknowledgement is held up for 500ms. This adds 500ms of latency
+ * to the transaction, even on a LAN. The real problem is that 500ms
+ * unconditional delay. (Why 500ms? That was a reasonable response
+ * time for a time-sharing system of the 1980s.) As mentioned above,
+ * delaying an ACK is a bet that the local application will reply to
+ * the data just received. Some apps, like character echo in Telnet
+ * servers, do respond every time. Others, like X-Windows "clients"
+ * (really servers, but X is backwards about this), only reply some
+ * of the time.>>
* http://developers.slashdot.org/comments.pl?sid=174457&threshold=1&commentsort=0&mode=thread&cid=14515105
*
* XXXAO: mss - options!
@@ -372,31 +422,6 @@
}
/*
- * Persistent mode.
- * Send out probe byte if there is data available.
- * RFC793: section 3.7, page 42-44
- * RFC1122: section 4.2.2.17
- */
- if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) {
- len = 1;
- goto send;
- }
- if (swnd == 0 && duna > tp->snd_wnd) {
- /*
- * Window shrank
- * after we sent into it. If window shrank to 0,
- * cancel pending retransmit, pull snd_nxt back
- * to (closed) window, and set the persist timer
- * if it isn't already going. If the window didn't
- * close completely, just wait for an ACK.
- */
- tcp_timer_activate(tp, TT_REXMT, 0);
- tp->t_rxtshift = 0;
- if (!tcp_timer_active(tp, TT_PERSIST))
- tcp_setpersist(tp);
- }
-
- /*
* Send window update?
*
* The receive window informs the remote side about the
@@ -457,7 +482,7 @@
return (tcp_send(tp, &to, flags));
}
-int
+static int
tcp_send(struct tcpcb *tp, struct tcpopt *to, int len, int rwin, int flags)
{
@@ -470,6 +495,7 @@
* Be careful not to send data and/or FIN on SYN segments.
* This measure is needed to prevent interoperability problems
* with not fully conformant TCP implementations.
+ *
* NB: For now we don't send any data with SYN. This will have
* to change if some reincarnation of T/TCP comes up again.
*/
@@ -492,25 +518,18 @@
else if (tp->t_flags & TF_DUPACK)
th->th_win = (u_short)tp->rcv_advwin;
else
- th->th_win = (u_short)(rwin >> tp->rcv_scale);
+ th->th_win = (u_short)rwin;
- SOCKBUF_LOCK(&so->so_snd);
/*
* Fill in fields.
*/
- if (tp->snd_nxt == tp->snd_rxmit) {
- th->th_seq = tp->snd_nxt;
- off = tp->snd_nxt - tp->snd_una;
- } else {
- th->th_seq = tp->snd_rxmit;
- off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
- }
-
+ th->th_seq = tp->snd_nxt;
th->th_flags = flags;
th->th_ack = tp->rcv_nxt;
+ SOCKBUF_LOCK(&so->so_snd);
+ off = tp->snd_nxt - tp->snd_una;
error = tcp_send_segments(tp, &ths, opt, off, &len, optlen);
-
SOCKBUF_UNLOCK(&so->so_snd);
/*
@@ -735,7 +754,7 @@
static int
tcp_retransmit(struct tcpcb *tp, int *len)
{
- struct tcphdr ths;
+ struct tcphdr ths, *th;
/*
* Retransmit over the SACK holes.
@@ -744,6 +763,13 @@
* Retransmit only the stuff that was not SACK'ed.
*/
/*
+ * The moment we receive a duplicate ACK everything freezes.
+ * No more new data is sent except for those allowed by limited
+ * transmit.
+ * The fast recovery algorithms start their work by the third
+ * duplicate ACK.
+ */
+ /*
* We have the following mechanisms:
* 1. Fast recovery: After we get three duplicate ACKs RFC2581
* 2. NewReno RFC3782
@@ -752,6 +778,28 @@
* 5. TCP congestion window validation RFC2861
*/
+ /* Limited transmit */
+ if (tp->snd_dupack < 3)
+ *len = min(len, tp->snd_mss); /* one mss */
+ else
+ *len = 0;
+
+ if (tp->snd_dupack < 3)
+ return;
+
+ /*
+ * XXXAO: Temporary.
+ */
+ tp->snd_rxmit = tp->snd_una;
+
+ /*
+ * Fill in headers.
+ */
+ th->th_win = (u_short)rwin;
+ th->th_seq = tp->snd_rxmit;
+ th->th_flags = flags;
+ th->th_ack = tp->rcv_nxt;
+
/*
* If resending a SYN or FIN, be sure NOT to use a new sequence number.
*/
@@ -761,21 +809,11 @@
th->th_seq == tp->snd_nxt)
th->th_seq--;
- /*
- * The moment we receive a duplicate ACK everything freezes.
- * No more new data is sent except for those allowed by limited
- * transmit.
- * The fast recovery algorithms start their work by the third
- * duplicate ACK.
- */
+ SOCKBUF_LOCK(&so->so_snd);
+ off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
error = tcp_send_segments(tp, &ths, opt, off, olen, optlen);
+ SOCKBUF_UNLOCK(&so->so_snd);
- /* Limited transmit */
- if (tp->snd_dupack < 3)
- *len = min(len, tp->snd_mss); /* one mss */
- else
- *len = 0;
-
return;
}
@@ -879,6 +917,14 @@
("%s: segment too big", __func__));
/*
+ * Do not send small fragments unless we empty the buffer
+ * or this is the only segment.
+ */
+ if (slen < tp->snd_mss - optlen && *olen > 0 &&
+ off + slen == so->so_snd.sb_cc)
+ break;
+
+ /*
* Allocate an mbuf sufficiently large to hold all
* headers for this segment plus space for the link
* headers to remove the need for prepends in the
@@ -987,7 +1033,8 @@
* Set the PUSH bit to indicate that we have reached
* the end of the send buffer.
*/
- if (slen > 0 && off + slen == so->so_snd.sb_cc)
+ if (slen > 0 && !(tp->t_flags & TF_MORETOCOME) &&
+ off + slen == so->so_snd.sb_cc)
th->th_flags |= TH_PUSH;
KASSERT(off + slen <= so->so_snd.sb_cc,
@@ -1178,7 +1225,7 @@
return;
}
-void
+static void
tcp_options(struct tcpcb *tp, struct tcpopt *to, int flags)
{
/*
==== //depot/projects/tcp_new/netinet/tcp_syncache.c#4 (text+ko) ====
@@ -768,6 +768,10 @@
goto abort;
}
}
+
+ /*
+ * Initialize the TCP control block.
+ */
tp = intotcpcb(inp);
tp->t_state = TCPS_SYN_RECEIVED;
tp->iss = sc->sc_iss;
==== //depot/projects/tcp_new/netinet/tcp_var.h#9 (text+ko) ====
@@ -139,13 +139,12 @@
u_int t_phase; /* send phase we are currently in */
#define TP_IDLE 0 /* nothing to send */
-#define TP_SLOWSTART 1 /* slow start */
-#define TP_CONGAVOID 2 /* congestion avoidance */
-#define TP_LOSSRECOV 3 /* loss recovery */
-#define TP_LOSSREXMT 4 /* loss recovery failed, retransmit */
-#define TP_PERSIST 5 /* persistent mode */
-#define TP_RETRY 6 /* retry after ENOMEM or ENOBUF */
-#define TP_URGENT 7 /* urgent mode */
+#define TP_SENDING 1 /* sending data */
+#define TP_LOSSRECOV 2 /* loss recovery */
+#define TP_REXMT 3 /* loss recovery failed, retransmit */
+#define TP_PERSIST 4 /* persistent mode */
+#define TP_RETRY 5 /* retry after ENOMEM or ENOBUF */
+#define TP_URGENT 6 /* urgent mode */
int t_softerror; /* possible error not yet reported */
@@ -173,7 +172,7 @@
u_int snd_delackdelay; /* time to delay an ACK in ticks */
int snd_dupack; /* number of duplicate ACK's reveived */
- tcp_seq snd_fr_recover; /* fast retransmit recover */
+ tcp_seq snd_recover; /* fast retransmit recover */
int snd_abcack; /* count the ack'ed data for ABC */
tcp_seq snd_rtseq; /* seq# of current RTT measurement */
@@ -200,6 +199,7 @@
uint8_t rcv_scale; /* window scaling for recv window */
struct trq_head rcv_trq; /* segment reassembly queue */
int rcv_trqlen; /* segment reassembly queue length in bytes */
+ int rcv_dupack; /* duplicate acks we sent */
tcp_ts tsecr_recent; /* timestamp echo data */
u_long tsecr_age; /* when echo last updated */
@@ -247,7 +247,6 @@
tcp_win snd_ssthresh_prev; /* ssthresh prior to retransmit */
tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */
tcp_win t_badrxtwin; /* window for retransmit recovery */
- int snd_limited; /* segments limited transmitted */
};
/*
More information about the p4-projects
mailing list