PERFORCE change 165494 for review

Andre Oppermann andre at FreeBSD.org
Wed Jul 1 08:00:46 UTC 2009


http://perforce.freebsd.org/chv.cgi?CH=165494

Change 165494 by andre at andre_t61 on 2009/07/01 08:00:14

	Dump of WIP from my laptop. Still much shuffling. The picture is refining a bit all
	the time, like a progressive JPEG.

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_output.c#9 edit
.. //depot/projects/tcp_new/netinet/tcp_syncache.c#4 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#9 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_output.c#9 (text+ko) ====

@@ -100,6 +100,20 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
     &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
 
+static int
+tcp_send(struct tcpcb *tp, struct tcpopt *to, int len, int rwin, int flags);
+static int
+tcp_retransmit(struct tcpcb *tp, int *len);
+static int
+tcp_send_segments(struct tcpcb *tp, struct tcphdr *ths, struct tcpopt *opt,
+    int off, int *olen, int optlen);
+static u_int
+tcp_rcv_wnd(struct tcpcb *tp, struct socket *so);
+static void
+tcp_snd_pace(struct tcpcp *tp);
+static void
+tcp_options(struct tcpcb *tp, struct tcpopt *to, int flags);
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
  *
@@ -157,30 +171,6 @@
 	flags = tcp_outflags[tp->t_state];
 
 	/*
-	 * Determine our current receive window.
-	 * This value is used for the window field in the TCP
-	 * header and to determine whether we have to send a
-	 * window update.
-	 *
-	 * NB: rwin is already downscaled.
-	 */
-	rwin = tcp_rcv_wnd(tp, so);
-
-	/*
-	 * We have been idle for "a while" and no acks are
-	 * expected to clock out any data we send --
-	 * slow start to get ack "clock" running again.
-	 *  RFC2581: Restart window.
-	 *
-	 * XXXAO: Use a decaying algorithm.  It's not useful
-	 * to have cwnd to drop of a cliff.  See RFC2861.
-	 */
-	if (tp->snd_nxt == tp->snd_una &&
-	    (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) {
-		tp->snd_cwnd = tcp_init_cwnd(tp);
-	}
-
-	/*
 	 * Determine length of data that should be transmitted, if there
 	 * is some data to send, then transmit; otherwise, investigate further.
 	 *
@@ -230,18 +220,74 @@
 			return (0);		/* next token is pending */
 	}
 
+	/*
+	 * Conservative approximation of data still travelling in the network.
+	 */
 	inflight = duna - tp->snd_sacked;
 
+	/*
+	 * Determine our current receive window.
+	 * This value is used for the window field in the TCP
+	 * header and to determine whether we have to send a
+	 * window update.
+	 *
+	 * NB: rwin is already downscaled.
+	 */
+	rwin = tcp_rcv_wnd(tp, so);
+
+	/*
+	 * Act based on the phase we are in.
+	 */
 	switch (tp->t_phase) {
 	case TP_IDLE:
-	case TP_SLOWSTART:
-	case TP_CONGAVOID:
+		/*
+		 * We have been idle for "a while" and no acks are
+		 * expected to clock out any data we send --
+		 * slow start to get ack "clock" running again.
+		 *  RFC2581: Restart window.
+		 *
+		 * XXXAO: Use a decaying algorithm.  It's not useful
+		 * to have cwnd to drop of a cliff.  See RFC2861.
+		 */
+		if (tp->snd_nxt == tp->snd_una &&
+		    (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) {
+			tp->snd_cwnd = tcp_init_cwnd(tp);
+		}
+		break;
+	case TP_SENDING:
+		break;
 	case TP_LOSSRECOV:
+	case TP_REXMT:
 		tcp_retransmit(tp, &len);
 		if (len = 0)
 			return (0);
-	case TP_LOSSREXMT:
+		break;
 	case TP_PERSIST:
+		/*
+		 * Persistent mode.
+		 * Send out probe byte if there is data available.
+		 *  RFC793: section 3.7, page 42-44
+		 *  RFC1122: section 4.2.2.17
+		 */
+		if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) {
+			len = 1;
+			goto send;
+		}
+		if (swnd == 0 && duna > tp->snd_wnd) {
+			/*
+			 * Window shrank
+			 * after we sent into it.  If window shrank to 0,
+			 * cancel pending retransmit, pull snd_nxt back
+			 * to (closed) window, and set the persist timer
+			 * if it isn't already going.  If the window didn't
+			 * close completely, just wait for an ACK.
+			 */
+			tcp_timer_activate(tp, TT_REXMT, 0);
+			tp->t_rxtshift = 0;
+			if (!tcp_timer_active(tp, TT_PERSIST))
+				tcp_setpersist(tp);
+		}
+		break;
 	case TP_RETRY:
 	case TP_URGENT:
 		break;
@@ -290,7 +336,7 @@
 	 * a duplicate ACK (if the ack value didn't move forward).  The
 	 * question whether the other implementations see it the same way.
 	 */
-	if ((tp->t_flags & TF_DUPACK) && tp->snd_dupack > 0) {
+	if ((tp->t_flags & TF_DUPACK) && tp->rcv_trqlen > 0) {
 		if (!(tp->t_flags & TF_SACK_PERMIT))
 			len = 0;
 		goto send;
@@ -318,26 +364,30 @@
 	 * b) silly window syndrome: buffer almost full
 	 *
 	 * Quoting Nagle:
-	 * <<The concept behind delayed ACKs is to bet, when receiving some data from the net,
-	 * that the local application will send a reply very soon.  So there's no need to
-	 * send an ACK immediately; the ACK can be piggybacked on the next data going the
-	 * other way. If that doesn't happen, after a 500ms delay, an ACK is sent anyway.
-	 * The concept behind the Nagle algorithm is that if the sender is doing very tiny
-	 * writes (like single bytes, from Telnet), there's no reason to have more than one
-	 * packet outstanding on the connection. This prevents slow links from choking with
-	 * huge numbers of outstanding tinygrams.
-	 * Both are reasonable. But they interact badly in the case where an application does
-	 * two or more small writes to a socket, then waits for a reply. (X-Windows is notorious
-	 * for this.) When an application does that, the first write results in an immediate
-	 * packet send. The second write is held up until the first is acknowledged. But because
-	 * of the delayed ACK strategy, that acknowledgement is held up for 500ms. This adds
-	 * 500ms of latency to the transaction, even on a LAN.
-	 * The real problem is that 500ms unconditional delay. (Why 500ms? That was a reasonable
-	 * response time for a time-sharing system of the 1980s.) As mentioned above, delaying
-	 * an ACK is a bet that the local application will reply to the data just received.
-	 * Some apps, like character echo in Telnet servers, do respond every time. Others,
-	 * like X-Windows "clients" (really servers, but X is backwards about this), only reply
-	 * some of the time.>>
+	 * <<The concept behind delayed ACKs is to bet, when receiving some
+	 * data from the net, that the local application will send a reply
+	 * very soon.  So there's no need to send an ACK immediately;
+	 * the ACK can be piggybacked on the next data going the other way.
+	 * If that doesn't happen, after a 500ms delay, an ACK is sent anyway.
+	 * The concept behind the Nagle algorithm is that if the sender is
+	 * doing very tiny writes (like single bytes, from Telnet), there's
+	 * no reason to have more than one packet outstanding on the connection.
+	 * This prevents slow links from choking with huge numbers of outstanding
+	 * tinygrams.  Both are reasonable.  But they interact badly in the case
+	 * where an application does two or more small writes to a socket, then
+	 * waits for a reply.  (X-Windows is notorious for this.)  When an
+	 * application does that, the first write results in an immediate
+	 * packet send.  The second write is held up until the first is
+	 * acknowledged.  But because of the delayed ACK strategy, that
+	 * acknowledgement is held up for 500ms.  This adds 500ms of latency
+	 * to the transaction, even on a LAN.  The real problem is that 500ms
+	 * unconditional delay.  (Why 500ms? That was a reasonable response
+	 * time for a time-sharing system of the 1980s.)  As mentioned above,
+	 * delaying an ACK is a bet that the local application will reply to
+	 * the data just received.  Some apps, like character echo in Telnet
+	 * servers, do respond every time.  Others, like X-Windows "clients"
+	 * (really servers, but X is backwards about this), only reply some
+	 * of the time.>>
 	 * http://developers.slashdot.org/comments.pl?sid=174457&threshold=1&commentsort=0&mode=thread&cid=14515105
 	 *
 	 * XXXAO: mss - options!
@@ -372,31 +422,6 @@
 	}
 
 	/*
-	 * Persistent mode.
-	 * Send out probe byte if there is data available.
-	 *  RFC793: section 3.7, page 42-44
-	 *  RFC1122: section 4.2.2.17
-	 */
-	if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) {
-		len = 1;
-		goto send;
-	}
-	if (swnd == 0 && duna > tp->snd_wnd) {
-		/*
-		 * Window shrank
-		 * after we sent into it.  If window shrank to 0,
-		 * cancel pending retransmit, pull snd_nxt back
-		 * to (closed) window, and set the persist timer
-		 * if it isn't already going.  If the window didn't
-		 * close completely, just wait for an ACK.
-		 */
-		tcp_timer_activate(tp, TT_REXMT, 0);
-		tp->t_rxtshift = 0;
-		if (!tcp_timer_active(tp, TT_PERSIST))
-			tcp_setpersist(tp);
-	}
-
-	/*
 	 * Send window update?
 	 *
 	 * The receive window informs the remote side about the
@@ -457,7 +482,7 @@
 	return (tcp_send(tp, &to, flags));
 }
 
-int
+static int
 tcp_send(struct tcpcb *tp, struct tcpopt *to, int len, int rwin, int flags)
 {
 
@@ -470,6 +495,7 @@
 	 * Be careful not to send data and/or FIN on SYN segments.
 	 * This measure is needed to prevent interoperability problems
 	 * with not fully conformant TCP implementations.
+	 *
 	 * NB: For now we don't send any data with SYN.  This will have
 	 * to change if some reincarnation of T/TCP comes up again.
 	 */
@@ -492,25 +518,18 @@
 	else if (tp->t_flags & TF_DUPACK)
 		th->th_win = (u_short)tp->rcv_advwin;
 	else
-		th->th_win = (u_short)(rwin >> tp->rcv_scale);
+		th->th_win = (u_short)rwin;
 
-	SOCKBUF_LOCK(&so->so_snd);
 	/*
 	 * Fill in fields.
 	 */
-	if (tp->snd_nxt == tp->snd_rxmit) {
-		th->th_seq = tp->snd_nxt;
-		off = tp->snd_nxt - tp->snd_una;
-	} else {
-		th->th_seq = tp->snd_rxmit;
-		off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
-	}
- 
+	th->th_seq = tp->snd_nxt;
 	th->th_flags = flags;
 	th->th_ack = tp->rcv_nxt;
 
+	SOCKBUF_LOCK(&so->so_snd);
+	off = tp->snd_nxt - tp->snd_una;
 	error = tcp_send_segments(tp, &ths, opt, off, &len, optlen);
-
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
@@ -735,7 +754,7 @@
 static int
 tcp_retransmit(struct tcpcb *tp, int *len)
 {
-	struct tcphdr ths;
+	struct tcphdr ths, *th;
 
 	/*
 	 * Retransmit over the SACK holes.
@@ -744,6 +763,13 @@
 	 * Retransmit only the stuff that was not SACK'ed.
 	 */
 	/*
+	 * The moment we receive a duplicate ACK everything freezes.
+	 * No more new data is sent except for those allowed by limited
+	 * transmit.
+	 * The fast recovery algorithms start their work by the third
+	 * duplicate ACK.
+	 */
+	/*
 	 * We have the following mechanisms:
 	 *  1. Fast recovery: After we get three duplicate ACKs RFC2581
 	 *  2. NewReno RFC3782
@@ -752,6 +778,28 @@
 	 *  5. TCP congestion window validation RFC2861
 	 */
 
+	/* Limited transmit */
+	if (tp->snd_dupack < 3)
+		*len = min(len, tp->snd_mss); /* one mss */
+	else
+		*len = 0;
+
+	if (tp->snd_dupack < 3)
+		return;
+
+	/*
+	 * XXXAO: Temporary.
+	 */
+	tp->snd_rxmit = tp->snd_una;
+
+	/*
+	 * Fill in headers.
+	 */
+	th->th_win = (u_short)rwin;
+	th->th_seq = tp->snd_rxmit;
+	th->th_flags = flags;
+	th->th_ack = tp->rcv_nxt;
+
 	/*
 	 * If resending a SYN or FIN, be sure NOT to use a new sequence number.
 	 */
@@ -761,21 +809,11 @@
 	    th->th_seq == tp->snd_nxt)
 		th->th_seq--;
 	
-	/*
-	 * The moment we receive a duplicate ACK everything freezes.
-	 * No more new data is sent except for those allowed by limited
-	 * transmit.
-	 * The fast recovery algorithms start their work by the third
-	 * duplicate ACK.
-	 */
+	SOCKBUF_LOCK(&so->so_snd);
+	off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
 	error = tcp_send_segments(tp, &ths, opt, off, olen, optlen);
+	SOCKBUF_UNLOCK(&so->so_snd);
 
-	/* Limited transmit */
-	if (tp->snd_dupack < 3)
-		*len = min(len, tp->snd_mss); /* one mss */
-	else
-		*len = 0;
-
 	return;
 }
 
@@ -879,6 +917,14 @@
 		    ("%s: segment too big", __func__));
 
 		/*
+		 * Do not send small fragments unless we empty the buffer
+		 * or this is the only segment.
+		 */
+		if (slen < tp->snd_mss - optlen && *olen > 0 &&
+		    off + slen == so->so_snd.sb_cc)
+			break;
+
+		/*
 		 * Allocate an mbuf sufficiently large to hold all
 		 * headers for this segment plus space for the link
 		 * headers to remove the need for prepends in the
@@ -987,7 +1033,8 @@
 		 * Set the PUSH bit to indicate that we have reached
 		 * the end of the send buffer.
 		 */
-		if (slen > 0 && off + slen == so->so_snd.sb_cc)
+		if (slen > 0 && !(tp->t_flags & TF_MORETOCOME) &&
+		    off + slen == so->so_snd.sb_cc)
 			th->th_flags |= TH_PUSH;
 
 		KASSERT(off + slen <= so->so_snd.sb_cc,
@@ -1178,7 +1225,7 @@
 	return;
 }
 
-void
+static void
 tcp_options(struct tcpcb *tp, struct tcpopt *to, int flags)
 {
 	/*

==== //depot/projects/tcp_new/netinet/tcp_syncache.c#4 (text+ko) ====

@@ -768,6 +768,10 @@
 			goto abort;
 		}
 	}
+
+	/*
+	 * Initialize the TCP control block.
+	 */
 	tp = intotcpcb(inp);
 	tp->t_state = TCPS_SYN_RECEIVED;
 	tp->iss = sc->sc_iss;

==== //depot/projects/tcp_new/netinet/tcp_var.h#9 (text+ko) ====

@@ -139,13 +139,12 @@
 
 	u_int	t_phase;		/* send phase we are currently in */
 #define	TP_IDLE		0		/* nothing to send */
-#define	TP_SLOWSTART	1		/* slow start */
-#define	TP_CONGAVOID	2		/* congestion avoidance */
-#define	TP_LOSSRECOV	3		/* loss recovery */
-#define	TP_LOSSREXMT	4		/* loss recovery failed, retransmit */
-#define	TP_PERSIST	5		/* persistent mode */
-#define	TP_RETRY	6		/* retry after ENOMEM or ENOBUF */
-#define	TP_URGENT	7		/* urgent mode */
+#define	TP_SENDING	1		/* sending data */
+#define	TP_LOSSRECOV	2		/* loss recovery */
+#define	TP_REXMT	3		/* loss recovery failed, retransmit */
+#define	TP_PERSIST	4		/* persistent mode */
+#define	TP_RETRY	5		/* retry after ENOMEM or ENOBUF */
+#define	TP_URGENT	6		/* urgent mode */
 
 	int	t_softerror;		/* possible error not yet reported */
 
@@ -173,7 +172,7 @@
 	u_int	snd_delackdelay;	/* time to delay an ACK in ticks */
 
 	int	snd_dupack;		/* number of duplicate ACK's reveived */
-	tcp_seq	snd_fr_recover;		/* fast retransmit recover */
+	tcp_seq	snd_recover;		/* fast retransmit recover */
 	int	snd_abcack;		/* count the ack'ed data for ABC */
 
 	tcp_seq	snd_rtseq;		/* seq# of current RTT measurement */
@@ -200,6 +199,7 @@
 	uint8_t	rcv_scale;		/* window scaling for recv window */
 	struct	trq_head rcv_trq;	/* segment reassembly queue */
 	int	rcv_trqlen;		/* segment reassembly queue length in bytes */
+	int	rcv_dupack;		/* duplicate acks we sent */
 
 	tcp_ts	tsecr_recent;		/* timestamp echo data */
 	u_long	tsecr_age;		/* when echo last updated */
@@ -247,7 +247,6 @@
 	tcp_win	snd_ssthresh_prev;	/* ssthresh prior to retransmit */
 	tcp_seq	snd_recover_prev;	/* snd_recover prior to retransmit */
 	tcp_win	t_badrxtwin;		/* window for retransmit recovery */
-	int	snd_limited;		/* segments limited transmitted */
 };
 
 /*


More information about the p4-projects mailing list