PERFORCE change 141720 for review

Fri May 16 14:09:33 UTC 2008

http://perforce.freebsd.org/chv.cgi?CH=141720

Change 141720 by andre at andre_flirtbox on 2008/05/16 14:08:46

	Remove TCP inflight code.  Anything like this belongs into
	the congestion control algrithms, not into a separate things.
	On top of it some assumptions of this code are not true anymore.

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp.h#2 edit
.. //depot/projects/tcp_new/netinet/tcp_input.c#4 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_subr.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_usrreq.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#2 edit
.. //depot/projects/tcp_reass/netinet/tcp_output.c#9 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp.h#2 (text+ko) ====

@@ -210,10 +210,9 @@
 
 	/* FreeBSD extensions to tcp_info. */
 	u_int32_t	tcpi_snd_wnd;		/* Advertised send window. */
-	u_int32_t	tcpi_snd_bwnd;		/* Bandwidth send window. */
 
 	/* Padding to grow without breaking ABI. */
-	u_int32_t	__tcpi_pad[32];		/* Padding. */
+	u_int32_t	__tcpi_pad[36];		/* Padding. */
 };
 #endif
 

==== //depot/projects/tcp_new/netinet/tcp_input.c#4 (text+ko) ====

@@ -2525,8 +2525,6 @@
 		tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
 		tcpstat.tcps_usedssthresh++;
 	}
-	if (metrics.rmx_bandwidth)
-		tp->snd_bandwidth = metrics.rmx_bandwidth;
 
 	/*
 	 * Check the interface for TSO capabilities.

==== //depot/projects/tcp_new/netinet/tcp_output.c#2 (text+ko) ====

@@ -200,7 +200,6 @@
 	sendalot = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
-	sendwin = min(sendwin, tp->snd_bwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*

==== //depot/projects/tcp_new/netinet/tcp_subr.c#2 (text+ko) ====

@@ -161,39 +161,6 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 
-/*
- * TCP bandwidth limiting sysctls.  Note that the default lower bound of
- * 1024 exists only for debugging.  A good production default would be
- * something like 6100.
- */
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
-    "TCP inflight data limiting");
-
-static int	tcp_inflight_enable = 1;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
-    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
-
-static int	tcp_inflight_debug = 0;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
-    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
-
-static int	tcp_inflight_rttthresh;
-SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
-    &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
-    "RTT threshold below which inflight will deactivate itself");
-
-static int	tcp_inflight_min = 6144;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
-    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
-
-static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
-    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
-
-static int	tcp_inflight_stab = 20;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
-    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-
 uma_zone_t sack_hole_zone;
 
 static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -265,7 +232,6 @@
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
-	tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 
 	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
@@ -623,7 +589,6 @@
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = TCPTV_RTOBASE;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
-	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	tp->t_bw_rtttime = ticks;
@@ -734,7 +699,6 @@
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		/* XXX: This wraps if the pipe is more than 4 Gbit per second */
-		metrics.rmx_bandwidth = tp->snd_bandwidth;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
@@ -1696,153 +1660,6 @@
 }
 #endif /* IPSEC */
 
-/*
- * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
- *
- * This code attempts to calculate the bandwidth-delay product as a
- * means of determining the optimal window size to maximize bandwidth,
- * minimize RTT, and avoid the over-allocation of buffers on interfaces and
- * routers.  This code also does a fairly good job keeping RTTs in check
- * across slow links like modems.  We implement an algorithm which is very
- * similar (but not meant to be) TCP/Vegas.  The code operates on the
- * transmitter side of a TCP connection and so only effects the transmit
- * side of the connection.
- *
- * BACKGROUND:  TCP makes no provision for the management of buffer space
- * at the end points or at the intermediate routers and switches.  A TCP
- * stream, whether using NewReno or not, will eventually buffer as
- * many packets as it is able and the only reason this typically works is
- * due to the fairly small default buffers made available for a connection
- * (typicaly 16K or 32K).  As machines use larger windows and/or window
- * scaling it is now fairly easy for even a single TCP connection to blow-out
- * all available buffer space not only on the local interface, but on
- * intermediate routers and switches as well.  NewReno makes a misguided
- * attempt to 'solve' this problem by waiting for an actual failure to occur,
- * then backing off, then steadily increasing the window again until another
- * failure occurs, ad-infinitum.  This results in terrible oscillation that
- * is only made worse as network loads increase and the idea of intentionally
- * blowing out network buffers is, frankly, a terrible way to manage network
- * resources.
- *
- * It is far better to limit the transmit window prior to the failure
- * condition being achieved.  There are two general ways to do this:  First
- * you can 'scan' through different transmit window sizes and locate the
- * point where the RTT stops increasing, indicating that you have filled the
- * pipe, then scan backwards until you note that RTT stops decreasing, then
- * repeat ad-infinitum.  This method works in principle but has severe
- * implementation issues due to RTT variances, timer granularity, and
- * instability in the algorithm which can lead to many false positives and
- * create oscillations as well as interact badly with other TCP streams
- * implementing the same algorithm.
- *
- * The second method is to limit the window to the bandwidth delay product
- * of the link.  This is the method we implement.  RTT variances and our
- * own manipulation of the congestion window, bwnd, can potentially
- * destabilize the algorithm.  For this reason we have to stabilize the
- * elements used to calculate the window.  We do this by using the minimum
- * observed RTT, the long term average of the observed bandwidth, and
- * by adding two segments worth of slop.  It isn't perfect but it is able
- * to react to changing conditions and gives us a very stable basis on
- * which to extend the algorithm.
- */
-void
-tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
-{
-	u_long bw;
-	u_long bwnd;
-	int save_ticks;
-
-	INP_LOCK_ASSERT(tp->t_inpcb);
-
-	/*
-	 * If inflight_enable is disabled in the middle of a tcp connection,
-	 * make sure snd_bwnd is effectively disabled.
-	 */
-	if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) {
-		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
-		tp->snd_bandwidth = 0;
-		return;
-	}
-
-	/*
-	 * Figure out the bandwidth.  Due to the tick granularity this
-	 * is a very rough number and it MUST be averaged over a fairly
-	 * long period of time.  XXX we need to take into account a link
-	 * that is not using all available bandwidth, but for now our
-	 * slop will ramp us up if this case occurs and the bandwidth later
-	 * increases.
-	 *
-	 * Note: if ticks rollover 'bw' may wind up negative.  We must
-	 * effectively reset t_bw_rtttime for this case.
-	 */
-	save_ticks = ticks;
-	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
-		return;
-
-	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
-	    (save_ticks - tp->t_bw_rtttime);
-	tp->t_bw_rtttime = save_ticks;
-	tp->t_bw_rtseq = ack_seq;
-	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
-		return;
-	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
-
-	tp->snd_bandwidth = bw;
-
-	/*
-	 * Calculate the semi-static bandwidth delay product, plus two maximal
-	 * segments.  The additional slop puts us squarely in the sweet
-	 * spot and also handles the bandwidth run-up case and stabilization.
-	 * Without the slop we could be locking ourselves into a lower
-	 * bandwidth.
-	 *
-	 * Situations Handled:
-	 *	(1) Prevents over-queueing of packets on LANs, especially on
-	 *	    high speed LANs, allowing larger TCP buffers to be
-	 *	    specified, and also does a good job preventing
-	 *	    over-queueing of packets over choke points like modems
-	 *	    (at least for the transmit side).
-	 *
-	 *	(2) Is able to handle changing network loads (bandwidth
-	 *	    drops so bwnd drops, bandwidth increases so bwnd
-	 *	    increases).
-	 *
-	 *	(3) Theoretically should stabilize in the face of multiple
-	 *	    connections implementing the same algorithm (this may need
-	 *	    a little work).
-	 *
-	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
-	 *	    be adjusted with a sysctl but typically only needs to be
-	 *	    on very slow connections.  A value no smaller then 5
-	 *	    should be used, but only reduce this default if you have
-	 *	    no other choice.
-	 */
-#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
-	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
-#undef USERTT
-
-	if (tcp_inflight_debug > 0) {
-		static int ltime;
-		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
-			ltime = ticks;
-			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
-			    tp,
-			    bw,
-			    tp->t_rttbest,
-			    tp->t_srtt,
-			    bwnd
-			);
-		}
-	}
-	if ((long)bwnd < tcp_inflight_min)
-		bwnd = tcp_inflight_min;
-	if (bwnd > tcp_inflight_max)
-		bwnd = tcp_inflight_max;
-	if ((long)bwnd < tp->t_maxseg * 2)
-		bwnd = tp->t_maxseg * 2;
-	tp->snd_bwnd = bwnd;
-}
-
 #ifdef TCP_SIGNATURE
 /*
  * Callback function invoked by m_apply() to digest TCP segment data

==== //depot/projects/tcp_new/netinet/tcp_usrreq.c#2 (text+ko) ====

@@ -1230,7 +1230,6 @@
 	 */
 	ti->tcpi_rcv_space = tp->rcv_wnd;
 	ti->tcpi_snd_wnd = tp->snd_wnd;
-	ti->tcpi_snd_bwnd = tp->snd_bwnd;
 }
 
 /*
@@ -1814,13 +1813,12 @@
 	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
 
 	db_print_indent(indent);
-	db_printf("snd_wnd: %lu   snd_cwnd: %lu   snd_bwnd: %lu\n",
-	   tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd);
+	db_printf("snd_wnd: %lu   snd_cwnd: %lu,
+	   tp->snd_wnd, tp->snd_cwnd);
 
 	db_print_indent(indent);
-	db_printf("snd_ssthresh: %lu   snd_bandwidth: %lu   snd_recover: "
-	    "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth,
-	    tp->snd_recover);
+	db_printf("snd_ssthresh: %lu   snd_recover: "
+	    "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
 
 	db_print_indent(indent);
 	db_printf("t_maxopd: %u   t_rcvtime: %lu   t_startime: %lu\n",

==== //depot/projects/tcp_new/netinet/tcp_var.h#2 (text+ko) ====

@@ -144,12 +144,10 @@
 
 	u_long	snd_wnd;		/* send window */
 	u_long	snd_cwnd;		/* congestion-controlled window */
-	u_long	snd_bwnd;		/* bandwidth-controlled window */
 	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
 					 * for slow start exponential to
 					 * linear switch
 					 */
-	u_long	snd_bandwidth;		/* calculated bandwidth or 0 */
 	tcp_seq	snd_recover;		/* for use in NewReno Fast Recovery */
 
 	u_int	t_maxopd;		/* mss plus options */
@@ -561,7 +559,6 @@
 void	 tcp_timer_activate(struct tcpcb *, int, u_int);
 int	 tcp_timer_active(struct tcpcb *, int);
 void	 tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
-void	 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
 /*
  * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
  */

==== //depot/projects/tcp_reass/netinet/tcp_output.c#9 (text+ko) ====