PERFORCE change 141720 for review
Andre Oppermann
andre at FreeBSD.org
Fri May 16 14:09:33 UTC 2008
http://perforce.freebsd.org/chv.cgi?CH=141720
Change 141720 by andre at andre_flirtbox on 2008/05/16 14:08:46
Remove TCP inflight code. Anything like this belongs into
the congestion control algrithms, not into a separate things.
On top of it some assumptions of this code are not true anymore.
Affected files ...
.. //depot/projects/tcp_new/netinet/tcp.h#2 edit
.. //depot/projects/tcp_new/netinet/tcp_input.c#4 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_subr.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_usrreq.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#2 edit
.. //depot/projects/tcp_reass/netinet/tcp_output.c#9 edit
Differences ...
==== //depot/projects/tcp_new/netinet/tcp.h#2 (text+ko) ====
@@ -210,10 +210,9 @@
/* FreeBSD extensions to tcp_info. */
u_int32_t tcpi_snd_wnd; /* Advertised send window. */
- u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */
/* Padding to grow without breaking ABI. */
- u_int32_t __tcpi_pad[32]; /* Padding. */
+ u_int32_t __tcpi_pad[36]; /* Padding. */
};
#endif
==== //depot/projects/tcp_new/netinet/tcp_input.c#4 (text+ko) ====
@@ -2525,8 +2525,6 @@
tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
tcpstat.tcps_usedssthresh++;
}
- if (metrics.rmx_bandwidth)
- tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Check the interface for TSO capabilities.
==== //depot/projects/tcp_new/netinet/tcp_output.c#2 (text+ko) ====
@@ -200,7 +200,6 @@
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
- sendwin = min(sendwin, tp->snd_bwnd);
flags = tcp_outflags[tp->t_state];
/*
==== //depot/projects/tcp_new/netinet/tcp_subr.c#2 (text+ko) ====
@@ -161,39 +161,6 @@
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
&tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
-/*
- * TCP bandwidth limiting sysctls. Note that the default lower bound of
- * 1024 exists only for debugging. A good production default would be
- * something like 6100.
- */
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
- "TCP inflight data limiting");
-
-static int tcp_inflight_enable = 1;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
- &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
-
-static int tcp_inflight_debug = 0;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
- &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
-
-static int tcp_inflight_rttthresh;
-SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
- &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
- "RTT threshold below which inflight will deactivate itself");
-
-static int tcp_inflight_min = 6144;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
- &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
-
-static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
- &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
-
-static int tcp_inflight_stab = 20;
-SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
- &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-
uma_zone_t sack_hole_zone;
static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -265,7 +232,6 @@
if (tcp_rexmit_min < 1)
tcp_rexmit_min = 1;
tcp_rexmit_slop = TCPTV_CPU_VAR;
- tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH;
tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
@@ -623,7 +589,6 @@
tp->t_rttmin = tcp_rexmit_min;
tp->t_rxtcur = TCPTV_RTOBASE;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
- tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
tp->t_bw_rtttime = ticks;
@@ -734,7 +699,6 @@
metrics.rmx_rtt = tp->t_srtt;
metrics.rmx_rttvar = tp->t_rttvar;
/* XXX: This wraps if the pipe is more than 4 Gbit per second */
- metrics.rmx_bandwidth = tp->snd_bandwidth;
metrics.rmx_cwnd = tp->snd_cwnd;
metrics.rmx_sendpipe = 0;
metrics.rmx_recvpipe = 0;
@@ -1696,153 +1660,6 @@
}
#endif /* IPSEC */
-/*
- * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
- *
- * This code attempts to calculate the bandwidth-delay product as a
- * means of determining the optimal window size to maximize bandwidth,
- * minimize RTT, and avoid the over-allocation of buffers on interfaces and
- * routers. This code also does a fairly good job keeping RTTs in check
- * across slow links like modems. We implement an algorithm which is very
- * similar (but not meant to be) TCP/Vegas. The code operates on the
- * transmitter side of a TCP connection and so only effects the transmit
- * side of the connection.
- *
- * BACKGROUND: TCP makes no provision for the management of buffer space
- * at the end points or at the intermediate routers and switches. A TCP
- * stream, whether using NewReno or not, will eventually buffer as
- * many packets as it is able and the only reason this typically works is
- * due to the fairly small default buffers made available for a connection
- * (typicaly 16K or 32K). As machines use larger windows and/or window
- * scaling it is now fairly easy for even a single TCP connection to blow-out
- * all available buffer space not only on the local interface, but on
- * intermediate routers and switches as well. NewReno makes a misguided
- * attempt to 'solve' this problem by waiting for an actual failure to occur,
- * then backing off, then steadily increasing the window again until another
- * failure occurs, ad-infinitum. This results in terrible oscillation that
- * is only made worse as network loads increase and the idea of intentionally
- * blowing out network buffers is, frankly, a terrible way to manage network
- * resources.
- *
- * It is far better to limit the transmit window prior to the failure
- * condition being achieved. There are two general ways to do this: First
- * you can 'scan' through different transmit window sizes and locate the
- * point where the RTT stops increasing, indicating that you have filled the
- * pipe, then scan backwards until you note that RTT stops decreasing, then
- * repeat ad-infinitum. This method works in principle but has severe
- * implementation issues due to RTT variances, timer granularity, and
- * instability in the algorithm which can lead to many false positives and
- * create oscillations as well as interact badly with other TCP streams
- * implementing the same algorithm.
- *
- * The second method is to limit the window to the bandwidth delay product
- * of the link. This is the method we implement. RTT variances and our
- * own manipulation of the congestion window, bwnd, can potentially
- * destabilize the algorithm. For this reason we have to stabilize the
- * elements used to calculate the window. We do this by using the minimum
- * observed RTT, the long term average of the observed bandwidth, and
- * by adding two segments worth of slop. It isn't perfect but it is able
- * to react to changing conditions and gives us a very stable basis on
- * which to extend the algorithm.
- */
-void
-tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
-{
- u_long bw;
- u_long bwnd;
- int save_ticks;
-
- INP_LOCK_ASSERT(tp->t_inpcb);
-
- /*
- * If inflight_enable is disabled in the middle of a tcp connection,
- * make sure snd_bwnd is effectively disabled.
- */
- if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) {
- tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
- tp->snd_bandwidth = 0;
- return;
- }
-
- /*
- * Figure out the bandwidth. Due to the tick granularity this
- * is a very rough number and it MUST be averaged over a fairly
- * long period of time. XXX we need to take into account a link
- * that is not using all available bandwidth, but for now our
- * slop will ramp us up if this case occurs and the bandwidth later
- * increases.
- *
- * Note: if ticks rollover 'bw' may wind up negative. We must
- * effectively reset t_bw_rtttime for this case.
- */
- save_ticks = ticks;
- if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
- return;
-
- bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
- (save_ticks - tp->t_bw_rtttime);
- tp->t_bw_rtttime = save_ticks;
- tp->t_bw_rtseq = ack_seq;
- if (tp->t_bw_rtttime == 0 || (int)bw < 0)
- return;
- bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
-
- tp->snd_bandwidth = bw;
-
- /*
- * Calculate the semi-static bandwidth delay product, plus two maximal
- * segments. The additional slop puts us squarely in the sweet
- * spot and also handles the bandwidth run-up case and stabilization.
- * Without the slop we could be locking ourselves into a lower
- * bandwidth.
- *
- * Situations Handled:
- * (1) Prevents over-queueing of packets on LANs, especially on
- * high speed LANs, allowing larger TCP buffers to be
- * specified, and also does a good job preventing
- * over-queueing of packets over choke points like modems
- * (at least for the transmit side).
- *
- * (2) Is able to handle changing network loads (bandwidth
- * drops so bwnd drops, bandwidth increases so bwnd
- * increases).
- *
- * (3) Theoretically should stabilize in the face of multiple
- * connections implementing the same algorithm (this may need
- * a little work).
- *
- * (4) Stability value (defaults to 20 = 2 maximal packets) can
- * be adjusted with a sysctl but typically only needs to be
- * on very slow connections. A value no smaller then 5
- * should be used, but only reduce this default if you have
- * no other choice.
- */
-#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
- bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
-#undef USERTT
-
- if (tcp_inflight_debug > 0) {
- static int ltime;
- if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
- ltime = ticks;
- printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
- tp,
- bw,
- tp->t_rttbest,
- tp->t_srtt,
- bwnd
- );
- }
- }
- if ((long)bwnd < tcp_inflight_min)
- bwnd = tcp_inflight_min;
- if (bwnd > tcp_inflight_max)
- bwnd = tcp_inflight_max;
- if ((long)bwnd < tp->t_maxseg * 2)
- bwnd = tp->t_maxseg * 2;
- tp->snd_bwnd = bwnd;
-}
-
#ifdef TCP_SIGNATURE
/*
* Callback function invoked by m_apply() to digest TCP segment data
==== //depot/projects/tcp_new/netinet/tcp_usrreq.c#2 (text+ko) ====
@@ -1230,7 +1230,6 @@
*/
ti->tcpi_rcv_space = tp->rcv_wnd;
ti->tcpi_snd_wnd = tp->snd_wnd;
- ti->tcpi_snd_bwnd = tp->snd_bwnd;
}
/*
@@ -1814,13 +1813,12 @@
tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
db_print_indent(indent);
- db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n",
- tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd);
+ db_printf("snd_wnd: %lu snd_cwnd: %lu,
+ tp->snd_wnd, tp->snd_cwnd);
db_print_indent(indent);
- db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: "
- "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth,
- tp->snd_recover);
+ db_printf("snd_ssthresh: %lu snd_recover: "
+ "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
db_print_indent(indent);
db_printf("t_maxopd: %u t_rcvtime: %lu t_startime: %lu\n",
==== //depot/projects/tcp_new/netinet/tcp_var.h#2 (text+ko) ====
@@ -144,12 +144,10 @@
u_long snd_wnd; /* send window */
u_long snd_cwnd; /* congestion-controlled window */
- u_long snd_bwnd; /* bandwidth-controlled window */
u_long snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
*/
- u_long snd_bandwidth; /* calculated bandwidth or 0 */
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
u_int t_maxopd; /* mss plus options */
@@ -561,7 +559,6 @@
void tcp_timer_activate(struct tcpcb *, int, u_int);
int tcp_timer_active(struct tcpcb *, int);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
-void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
/*
* All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
*/
==== //depot/projects/tcp_reass/netinet/tcp_output.c#9 (text+ko) ====
More information about the p4-projects
mailing list