kern/68110: [PATCH] RFC 3522 for -HEAD

Xin LI delphij at FreeBSD.org.cn
Sat Jun 19 12:20:27 GMT 2004


>Number:         68110
>Category:       kern
>Synopsis:       [PATCH] RFC 3522 for -HEAD
>Confidential:   no
>Severity:       non-critical
>Priority:       high
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Sat Jun 19 12:20:23 GMT 2004
>Closed-Date:
>Last-Modified:
>Originator:     Xin LI
>Release:        FreeBSD 5.2-delphij i386
>Organization:
The FreeBSD Simplified Chinese Project
>Environment:
System: FreeBSD beastie.frontfree.net 5.2-delphij FreeBSD 5.2-delphij #66: Tue Jun 15 11:25:44 CST 2004 root at beastie.frontfree.net:/usr/obj/usr/src/sys/BEASTIE i386


>Description:
	The attached patch brings RFC 3522 (Eifel detection) to FreeBSD.
	The original work was obtained from DragonFlyBSD, which implemented RFC 3522 last August. It would be good for FreeBSD to have an RFC 3522 implementation before 5.3-RELEASE.
>How-To-Repeat:
	N/A
>Fix:
	Apply the attached patchset against HEAD.

	Please be aware that the attached patch will cause an ABI change. I will write an UPDATING entry if this patchset is accepted.

--- rfc3522.diff begins here ---

Index: src/sys/netinet/tcp_input.c
diff -u src/sys/netinet/tcp_input.c:1.241 src/sys/netinet/tcp_input.c:1.241.1000.1
--- src/sys/netinet/tcp_input.c:1.241	Wed Jun 16 17:35:07 2004
+++ src/sys/netinet/tcp_input.c	Sat Jun 19 17:21:09 2004
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2002-2003 Jeffrey Hsu
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -131,6 +132,11 @@
     &tcp_do_rfc3390, 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
+static int tcp_do_eifel_detect = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, eifel, CTLFLAG_RW,
+    &tcp_do_eifel_detect, 0,
+    "Eifel detection algorithm (RFC 3522)");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
 	    "TCP Segment Reassembly Queue");
 
@@ -1130,19 +1136,26 @@
 				++tcpstat.tcps_predack;
 				/*
 				 * "bad retransmit" recovery
+				 *
+				 * If Eifel detection applies, then
+				 * it is deterministic, so use it
+				 * unconditionally over the old heuristic.
+				 * Otherwise, fall back to the old heuristic.
 				 */
-				if (tp->t_rxtshift == 1 &&
+				if (tcp_do_eifel_detect &&
+				    (to.to_flags & TOF_TS) && to.to_tsecr &&
+				    (tp->t_flags & TF_FIRSTACCACK)) {
+					/* Eifel detection applicable. */
+					if (to.to_tsecr < tp->t_rexmtTS) {
+						tcp_revert_congestion_state(tp);
+						++tcpstat.tcps_eifeldetected;
+					}
+				} else if (tp->t_rxtshift == 1 &&
 				    ticks < tp->t_badrxtwin) {
-					++tcpstat.tcps_sndrexmitbad;
-					tp->snd_cwnd = tp->snd_cwnd_prev;
-					tp->snd_ssthresh =
-					    tp->snd_ssthresh_prev;
-					tp->snd_recover = tp->snd_recover_prev;
-					if (tp->t_flags & TF_WASFRECOVERY)
-					    ENTER_FASTRECOVERY(tp);
-					tp->snd_nxt = tp->snd_max;
-					tp->t_badrxtwin = 0;
+					tcp_revert_congestion_state(tp);
+					++tcpstat.tcps_rttdetected;
 				}
+				tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT);
 
 				/*
 				 * Recalculate the transmit timer / rtt.
@@ -1911,6 +1924,11 @@
 						tp->t_dupacks = 0;
 						break;
 					}
+					if (tcp_do_eifel_detect &&
+					    (tp->t_flags & TF_RCVD_TSTMP)) {
+						tcp_save_congestion_state(tp);
+						tp->t_flags |= TF_FASTREXMT;
+					}
 					win = min(tp->snd_wnd, tp->snd_cwnd) /
 					    2 / tp->t_maxseg;
 					if (win < 2)
@@ -2037,15 +2055,17 @@
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
-		if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
-			++tcpstat.tcps_sndrexmitbad;
-			tp->snd_cwnd = tp->snd_cwnd_prev;
-			tp->snd_ssthresh = tp->snd_ssthresh_prev;
-			tp->snd_recover = tp->snd_recover_prev;
-			if (tp->t_flags & TF_WASFRECOVERY)
-				ENTER_FASTRECOVERY(tp);
-			tp->snd_nxt = tp->snd_max;
-			tp->t_badrxtwin = 0;	/* XXX probably not required */ 
+		if (tcp_do_eifel_detect && acked &&
+		    (to.to_flags & TOF_TS) && to.to_tsecr &&
+		    (tp->t_flags & TF_FIRSTACCACK)) {
+			/* Eifel detection applicable. */
+			if (to.to_tsecr < tp->t_rexmtTS) {
+				tcp_revert_congestion_state(tp);
+				++tcpstat.tcps_eifeldetected;
+			}
+		} else if (tp->t_rxtshift == 1 && ticks < tp->t_badrxtwin) {
+			tcp_revert_congestion_state(tp);
+			++tcpstat.tcps_rttdetected;
 		}
 
 		/*
@@ -2090,6 +2110,9 @@
 		if (acked == 0)
 			goto step6;
 
+		/* Stop looking for an acceptable ACK since one was received. */
+		tp->t_flags &= ~(TF_FIRSTACCACK | TF_FASTREXMT);
+
 		/*
 		 * When new data is acked, open the congestion window.
 		 * If the window gives us less than ssthresh packets
Index: src/sys/netinet/tcp_timer.c
diff -u src/sys/netinet/tcp_timer.c:1.64 src/sys/netinet/tcp_timer.c:1.64.1000.1
--- src/sys/netinet/tcp_timer.c:1.64	Thu Apr  8 04:46:14 2004
+++ src/sys/netinet/tcp_timer.c	Sat Jun 19 17:21:09 2004
@@ -467,6 +467,39 @@
 }
 
 void
+tcp_save_congestion_state(struct tcpcb *tp)
+{
+	tp->snd_cwnd_prev = tp->snd_cwnd;
+	tp->snd_ssthresh_prev = tp->snd_ssthresh;
+	tp->snd_recover_prev = tp->snd_recover;
+	if (IN_FASTRECOVERY(tp))
+	    tp->t_flags |= TF_WASFRECOVERY;
+	else
+	    tp->t_flags &= ~TF_WASFRECOVERY;
+	if (tp->t_flags & TF_RCVD_TSTMP) {
+		tp->t_rexmtTS = ticks;
+		tp->t_flags |= TF_FIRSTACCACK;
+	}
+}
+
+void
+tcp_revert_congestion_state(struct tcpcb *tp)
+{
+	tp->snd_cwnd = tp->snd_cwnd_prev;
+	tp->snd_ssthresh = tp->snd_ssthresh_prev;
+	tp->snd_recover = tp->snd_recover_prev;
+	if (tp->t_flags & TF_WASFRECOVERY)
+	    ENTER_FASTRECOVERY(tp);
+	if (tp->t_flags & TF_FASTREXMT)
+	    ++tcpstat.tcps_sndfastrexmitbad;
+	else
+	    ++tcpstat.tcps_sndrtobad;
+	tp->t_badrxtwin = 0;
+	tp->t_rxtshift = 0;
+	tp->snd_nxt = tp->snd_max;
+}
+
+void
 tcp_timer_rexmt(xtp)
 	void *xtp;
 {
@@ -521,14 +554,9 @@
 		 * "On Estimating End-to-End Network Path Properties" by
 		 * Allman and Paxson for more details.
 		 */
-		tp->snd_cwnd_prev = tp->snd_cwnd;
-		tp->snd_ssthresh_prev = tp->snd_ssthresh;
-		tp->snd_recover_prev = tp->snd_recover;
-		if (IN_FASTRECOVERY(tp))
-		  tp->t_flags |= TF_WASFRECOVERY;
-		else
-		  tp->t_flags &= ~TF_WASFRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+		tcp_save_congestion_state(tp);
+		tp->t_flags &= ~TF_FASTREXMT;
 	}
 	tcpstat.tcps_rexmttimeo++;
 	if (tp->t_state == TCPS_SYN_SENT)
Index: src/sys/netinet/tcp_var.h
diff -u src/sys/netinet/tcp_var.h:1.105 src/sys/netinet/tcp_var.h:1.105.1000.3
--- src/sys/netinet/tcp_var.h:1.105	Mon Apr 26 10:56:31 2004
+++ src/sys/netinet/tcp_var.h	Sat Jun 19 18:35:10 2004
@@ -78,29 +78,32 @@
 	struct	inpcb *t_inpcb;		/* back pointer to internet pcb */
 	int	t_state;		/* state of this connection */
 	u_int	t_flags;
-#define	TF_ACKNOW	0x000001	/* ack peer immediately */
-#define	TF_DELACK	0x000002	/* ack, but try to delay it */
-#define	TF_NODELAY	0x000004	/* don't delay packets to coalesce */
-#define	TF_NOOPT	0x000008	/* don't use tcp options */
-#define	TF_SENTFIN	0x000010	/* have sent FIN */
-#define	TF_REQ_SCALE	0x000020	/* have/will request window scaling */
-#define	TF_RCVD_SCALE	0x000040	/* other side has requested scaling */
-#define	TF_REQ_TSTMP	0x000080	/* have/will request timestamps */
-#define	TF_RCVD_TSTMP	0x000100	/* a timestamp was received in SYN */
-#define	TF_SACK_PERMIT	0x000200	/* other side said I could SACK */
-#define	TF_NEEDSYN	0x000400	/* send SYN (implicit state) */
-#define	TF_NEEDFIN	0x000800	/* send FIN (implicit state) */
-#define	TF_NOPUSH	0x001000	/* don't push */
-#define	TF_REQ_CC	0x002000	/* have/will request CC */
-#define	TF_RCVD_CC	0x004000	/* a CC was received in SYN */
-#define	TF_SENDCCNEW	0x008000	/* send CCnew instead of CC in SYN */
-#define	TF_MORETOCOME	0x010000	/* More data to be appended to sock */
-#define	TF_LQ_OVERFLOW	0x020000	/* listen queue overflow */
-#define	TF_LASTIDLE	0x040000	/* connection was previously idle */
-#define	TF_RXWIN0SENT	0x080000	/* sent a receiver win 0 in response */
-#define	TF_FASTRECOVERY	0x100000	/* in NewReno Fast Recovery */
-#define	TF_WASFRECOVERY	0x200000	/* was in NewReno Fast Recovery */
-#define	TF_SIGNATURE	0x400000	/* require MD5 digests (RFC2385) */
+#define	TF_ACKNOW	0x00000001	/* ack peer immediately */
+#define	TF_DELACK	0x00000002	/* ack, but try to delay it */
+#define	TF_NODELAY	0x00000004	/* don't delay packets to coalesce */
+#define	TF_NOOPT	0x00000008	/* don't use tcp options */
+#define	TF_SENTFIN	0x00000010	/* have sent FIN */
+#define	TF_REQ_SCALE	0x00000020	/* have/will request window scaling */
+#define	TF_RCVD_SCALE	0x00000040	/* other side has requested scaling */
+#define	TF_REQ_TSTMP	0x00000080	/* have/will request timestamps */
+#define	TF_RCVD_TSTMP	0x00000100	/* a timestamp was received in SYN */
+#define	TF_SACK_PERMIT	0x00000200	/* other side said I could SACK */
+#define	TF_NEEDSYN	0x00000400	/* send SYN (implicit state) */
+#define	TF_NEEDFIN	0x00000800	/* send FIN (implicit state) */
+#define	TF_NOPUSH	0x00001000	/* don't push */
+#define	TF_REQ_CC	0x00002000	/* have/will request CC */
+#define	TF_RCVD_CC	0x00004000	/* a CC was received in SYN */
+#define	TF_SENDCCNEW	0x00008000	/* send CCnew instead of CC in SYN */
+#define	TF_MORETOCOME	0x00010000	/* More data to be appended to sock */
+#define	TF_LQ_OVERFLOW	0x00020000	/* listen queue overflow */
+#define	TF_LASTIDLE	0x00040000	/* connection was previously idle */
+#define	TF_RXWIN0SENT	0x00080000	/* sent a receiver win 0 in response */
+#define	TF_FASTRECOVERY	0x00100000	/* in NewReno Fast Recovery */
+#define	TF_WASFRECOVERY	0x00200000	/* was in NewReno Fast Recovery */
+#define	TF_SIGNATURE	0x00400000	/* require MD5 digests (RFC2385) */
+#define	TF_FIRSTACCACK	0x00800000	/* Look for 1st acceptable ACK. */
+#define	TF_FASTREXMT	0x01000000	/* Did Fast Retransmit. */
+
 	int	t_force;		/* 1 if forcing out a byte */
 
 	tcp_seq	snd_una;		/* send unacknowledged */
@@ -174,6 +177,7 @@
 	u_long	snd_ssthresh_prev;	/* ssthresh prior to retransmit */
 	tcp_seq	snd_recover_prev;	/* snd_recover prior to retransmit */
 	u_long	t_badrxtwin;		/* window for retransmit recovery */
+	u_long	t_rexmtTS;		/* timestamp of last retransmit */
 	u_char	snd_limited;		/* segments limited transmitted */
 /* anti DoS counters */
 	u_long	rcv_second;		/* start of interval second */
@@ -371,7 +375,10 @@
 	u_long	tcps_sndbyte;		/* data bytes sent */
 	u_long	tcps_sndrexmitpack;	/* data packets retransmitted */
 	u_long	tcps_sndrexmitbyte;	/* data bytes retransmitted */
-	u_long	tcps_sndrexmitbad;	/* unnecessary packet retransmissions */
+	u_long	tcps_sndrtobad;		/* spurious RTO retransmissions */
+	u_long	tcps_sndfastrexmitbad;	/* spurious Fast Retransmissions */
+	u_long	tcps_eifeldetected;	/* Eifel-detected spurious rexmits */
+	u_long	tcps_rttdetected;	/* RTT-detected spurious RTO rexmits */
 	u_long	tcps_sndacks;		/* ack-only packets sent */
 	u_long	tcps_sndprobe;		/* window probes sent */
 	u_long	tcps_sndurg;		/* packets sent with URG only */
@@ -538,6 +545,8 @@
 void	 tcp_respond(struct tcpcb *, void *,
 	    struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
 int	 tcp_twrespond(struct tcptw *, int);
+void	 tcp_save_congestion_state(struct tcpcb *tp);
+void	 tcp_revert_congestion_state(struct tcpcb *tp);
 void	 tcp_setpersist(struct tcpcb *);
 #ifdef TCP_SIGNATURE
 int	 tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int);
Index: src/usr.bin/netstat/inet.c
diff -u src/usr.bin/netstat/inet.c:1.65 src/usr.bin/netstat/inet.c:1.65.1000.1
--- src/usr.bin/netstat/inet.c:1.65	Wed Jun 16 15:00:50 2004
+++ src/usr.bin/netstat/inet.c	Sat Jun 19 18:33:40 2004
@@ -382,8 +382,10 @@
 		"\t\t%lu data packet%s (%lu byte%s)\n");
 	p2(tcps_sndrexmitpack, tcps_sndrexmitbyte,
 		"\t\t%lu data packet%s (%lu byte%s) retransmitted\n");
-	p(tcps_sndrexmitbad,
-		"\t\t%lu data packet%s unnecessarily retransmitted\n");
+	p(tcps_sndrtobad, "\t\t%lu spurious RTO retransmit%s\n");
+	p(tcps_sndfastrexmitbad, "\t\t%lu spurious Fast Retransmit%s\n");
+	p(tcps_eifeldetected, "\t\t%lu Eifel-detected spurious retransmit%s\n");
+	p(tcps_rttdetected, "\t\t%lu RTT-detected spurious retransmit%s\n");
 	p(tcps_mturesent, "\t\t%lu resend%s initiated by MTU discovery\n");
 	p2a(tcps_sndacks, tcps_delack,
 		"\t\t%lu ack-only packet%s (%lu delayed)\n");
--- rfc3522.diff ends here ---


>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list