PERFORCE change 157800 for review
Andre Oppermann
andre at FreeBSD.org
Mon Feb 16 08:53:46 PST 2009
http://perforce.freebsd.org/chv.cgi?CH=157800
Change 157800 by andre at andre_flirtbox on 2009/02/16 16:53:08
Checkpoint WIP.
Affected files ...
.. //depot/projects/tcp_new/netinet/tcp_input.c#7 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#4 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#3 edit
Differences ...
==== //depot/projects/tcp_new/netinet/tcp_input.c#7 (text+ko) ====
@@ -179,20 +179,6 @@
#endif
/*
- * Indicate whether this ack should be delayed. We can delay the ack if
- * - there is no delayed ack timer in progress and
- * - our last ack wasn't a 0-sized window. We never want to delay
- * the ack that opens up a 0-sized window and
- * - delayed acks are enabled or
- * - this is a half-synchronized T/TCP connection.
- */
-#define DELAY_ACK(tp) \
- ((!tcp_timer_active(tp, TT_DELACK) && \
- (tp->t_flags & TF_RXWIN0SENT) == 0) && \
- (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
-
-
-/*
* TCP input handling is split into multiple parts:
* tcp6_input is a thin wrapper around tcp_input for the extended
* ip6_protox[] call format in ip6_input
@@ -362,7 +348,7 @@
tcpstat.tcps_rcvbadoff++;
goto drop;
}
- tlen -= off; /* tlen is used instead of ti->ti_len */
+ tlen -= off; /* tlen is used instead of th->th_len */
if (off > sizeof (struct tcphdr)) {
if (isipv6) {
#ifdef INET6
@@ -932,9 +918,10 @@
* discouraged to shrink the window.
* RFC793: section 3.7, page 42-44
* RFC1122: section 4.2.2.16
+ *
+ * XXXAO: Fix up. rcv_wnd is an absolute pointer in seq space.
*/
- rwin = sbspace(&so->so_rcv);
- rwin = imax(rwin, (int)(tp->rcv_advwin - tp->rcv_nxt));
+ rwin = tp->rcv_wnd - tp->rcv_nxt;
/*
* Validation checks on any incoming segment.
@@ -947,7 +934,7 @@
* into established state and initializations of the timers.
*/
case TCPS_SYN_RECEIVED:
- tp->t_starttime = tcp_uptime();
+ tp->t_starttime = time_uptime;
TCPS_TRANS(tp, TCPS_ESTABLISHED);
soisconnected(so);
@@ -963,7 +950,7 @@
*/
case TCPS_SYN_SENT:
/*
- * RST is handled separately below.
+ * RST is handled separately below.
* RFC793: section 3.9, page 66-67, second check
*/
if (thflags & TH_RST)
@@ -1029,11 +1016,14 @@
* RFC793: section 3.1, page 18-19
* RFC1122: section 4.2.2.6
* RFC1191: section 3.1
+ *
+ * NB: MSS is computed twice. Once when we send the initial
+ * SYN and once when we get back the SYN-ACK.
*/
if (to.to_flags & TOF_MSS)
- tcp_mss(tp, to.to_mss);
+ tp->snd_mss = tcp_mss(tptoinpinc(tp), to.to_mss, 0);
else
- tcp_mss(tp, tcp_mssdflt);
+ tp->snd_mss = tcp_mss(tptoinpinc(tp), 0, 0);
/*
* Do window scaling on this connection?
@@ -1129,7 +1119,7 @@
tp->snd_wu_ack = th->th_ack;
th->th_seq++; /* SYN is acked */
- tp->t_starttime = tcp_uptime();
+ tp->t_starttime = time_uptime;
TCPS_TRANS(tp, TCPS_ESTABLISHED);
#ifdef MAC
SOCK_LOCK(so);
@@ -1218,7 +1208,7 @@
*
* We store the receive time as uptime with second
* resolution. This makes us independent from the
- * wrap-around after 2^32 / hz (24.8 days at 1ms hz).
+ * wrap-around after 2^32 / 2 / hz (24.8 days at 1ms hz).
*
* XXXAO: Linux says PAWS is broken. Analyze if true or not.
* Retransmitted segments are not presented for further processing.
@@ -1425,6 +1415,7 @@
case TCPS_SYN_SENT:
/*
* In TCPS_SYN_SENT the RST MUST carry the ACK flag.
+ * RFC793: section 3.4, page 37, Reset Processing
* RFC793: section 3.9, page 66, first check
*/
if (!(thflags & TH_ACK)) {
@@ -1434,12 +1425,17 @@
}
/*
- * The ACK must be within what we sent but does
- * not have to ACK the SYN.
+ * The ACK must acknowledge the SYN and any data
+ * we may have sent with the original SYN.
+ * RFC793: section 3.4, page 37, Reset Processing
* RFC793: section 3.9, page 66, first check
+ *
+ * NB: We accept ACKing the SYN w/o and with data
+ * as some implementations refuse to ACK data in
+ * a SYN.
*/
- if (SEQ_LT(th->th_ack, tp->snd_una) ||
- SEQ_GT(th->th_ack, th->snd_nxt)) {
+ if (th->th_ack != tp->snd_una ||
+ th->th_ack != th->snd_nxt) {
tcplog("RST does not match, segment ignored");
tcpstat.tcps_badrst++;
goto drop;
@@ -1735,6 +1731,8 @@
/*
* Update send SACK information and tell us how much more
* data has left the network (relative to last SACK we got).
+ * XXXAO: Determine if there was a duplicate ACK going on
+ * based on the changes of the SACK information.
*/
if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
sacked = tcp_sack_doack(tp, &to, th->th_ack);
@@ -1759,7 +1757,7 @@
/*
* Update congestion control information.
*/
- nudgeoutput = tcp_congest(tp, th, tiwin, acked, tlen, sacked);
+ nudgeoutput |= tcp_congest(tp, th, tiwin, acked, tlen, sacked);
/*
* Drop acknowledged data from send socket buffer
@@ -1783,12 +1781,10 @@
* data from the socket buffer.
*/
if (acked > so->so_snd.sb_cc) {
- tp->snd_wnd -= so->so_snd.sb_cc;
sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
ourfinisacked = 1;
} else {
sbdrop_locked(&so->so_snd, acked);
- tp->snd_wnd -= acked;
ourfinisacked = 0;
}
@@ -1896,6 +1892,19 @@
* NB: Continue with segment.
*/
}
+
+ /*
+ * Stop the retransmit timer if all data we sent
+ * has been acknowledged. Otherwise restart it
+ * if we still have outstanding data.
+ *
+ * XXXAO: Refine the test. TF_NEEDFIN alone may not be
+ * enough.
+ */
+ if (tp->snd_una == tp->snd_nxt && !(tp->t_flags & TF_NEEDFIN))
+ tcp_timer_activate(TT_RXMIT, 0);
+ else
+ tcp_timer_activate(TT_RXMIT, tp->snd_rto);
}
/*
@@ -1918,7 +1927,7 @@
*/
if ((thflags & TH_URG) && th->th_urp > 0 && tlen > 0 &&
!TCPS_HAVERCVDFIN(tp->t_state)) {
- tcp_do_urg(tp, th, tlen);
+ tcp_do_urg(tp, th, &tlen);
} else if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) {
tp->rcv_up = tp->rcv_nxt;
}
@@ -2090,6 +2099,11 @@
}
/*
+ * Update size of receive window.
+ */
+ tp->rcv_wnd = sbspace(so->so_rcv);
+
+ /*
* NB: sorwakeup_locked implicitly unlocks.
*/
sorwakeup_locked(so);
@@ -2249,12 +2263,15 @@
*
* XXXAO: Multi-delack?
*/
- if (nudgeoutput || (tp->t_flags & TF_ACKNOW))
+ if ((tp->t_flags & TF_ACKNOW) || tp->snd_delack > 1 ||
+ nudgeoutput || (tp->t_flags & TF_RXWIN0SENT) ||
+ !tcp_delack_enabled) {
(void) tcp_output(tp);
- else if (tp->t_flags & TF_DELACK) {
- tp->t_flags &= ~TF_DELACK;
+ } else if (SEQ_GT(tp->rcv_nxt, tp->snd_lastack)) {
+ tp->snd_delack++;
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
+
INP_UNLOCK(tp->t_inpcb);
return;
@@ -2374,7 +2391,7 @@
* XXXAO: Report violations of the options specs.
*/
static void
-tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
+tcp_do_options(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
int opt, optlen;
@@ -2471,68 +2488,106 @@
* Finish this function and validate against all relevant RFCs.
* Use bintime second part for t_rcvtime.
* And a couple of other things.
- *
- * XXXAO: Linux talks about some problem with the RTO algorithm.
- * Figure out what the problem is.
- *
- * XXXAO: The sliding window of eight measurements from RFC793 is
- * way too little when using timestamps in fast networks.
- * Average 10ms of measurements and integrate that into a 1000ms
- * sliding window. The same for the variance. When using timestamps.
*/
static void
tcp_do_time(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
int acked, int tlen, int sacked)
{
- int delta, rtt;
+ int rtt;
int tick = tcp_ticks;
+ INP_LOCK_ASSERT(tp->t_inpcb);
KASSERT(tp != NULL && th != NULL && to != NULL,
- ("%s: ", __func__));
- INP_LOCK_ASSERT(tp->t_inpcb);
+ ("%s: insufficient parameters", __func__));
/*
+ * 1. We received a valid segment.
+ *
* Make note of most recent segment received time.
*/
- tp->t_rcvtime = tcp_ticks; /* XXX: ticks64 */
+ tp->t_rcvtime = tcp_uptime();
+ tp->t_rcvticks = tick;
/*
+ * 2. If timestamps are used decide which to reflect.
+ *
* When using timestamps and delayed ACKs we should reply
* with the TSval from the earliest unacknowledged segment.
- * RFC1323: Section 3.4, Page 15, Case (A)
+ * RFC1323: section 3.4, Page 15, Case (A)
*
* On packet loss echo the TSval from the latest segment
* that filled a hole. Only reflect timestamps that advance
* the left edge of the window.
- * RFC1323: Section 3.4, Page 15, Case (B & C)
+ * RFC1323: section 3.4, Page 15, Case (B & C)
+ *
+ * Corrected algorithm.
+ * Stevens Vol.2: section 26.6, page 870
+ * Braden93
+ *
+ * If SACK is enabled we should be able to reflect every
+ * timestamp as long as it is GEQ the one before. This
+ * way we avoid late out-of-order segments. Whenever more
+ * data was sacked advance reflected timestamp.
*
- * XXXAO: With SACK we could do better.
- * if (sacked > 0) ...
+ * Does this give PAWS problems?
*/
if (to->to_flags & TOF_TS) {
- if ((!(tp->t_flags & TF_DELACK) && th->th_seq == tp->rcv_nxt) ||
- (!TAILQ_EMPTY(tp->rcv_trq) && th->th_seq == tp->rcv_nxt))
+#ifdef TCP_RFC1323_BRADEN
+ if (TS_GEQ(to->to_tsval, tp->snd_tsecr) &&
+ SEQ_LEQ(th->th_ack, tp->snd_lastack)) {
+#endif
+#ifdef TCP_RFC1323bis_plusSACK
+ if (TS_GT(to->to_tsval, tp->snd_tsecr) &&
+ ((th->th_seq == tp->rcv_nxt && tp->snd_delack == 0) ||
+ sacked > 0) {
+#endif
tp->snd_tsecr = to->to_tsval;
- tp->snd_tsecrts = tcp_ticks; /* XXX: ticks64 */
+ tp->snd_tsecrts = tcp_ticks;
+ }
+ KASSERT(!TS_GT(to->to_secr, tick),
+ ("%s: timestamp newer than our time", __func__));
/*
* Remember highest most recent reflected TS.
*/
- if (to->to_tsecr > tp->ts_recent)
+ if (SEQ_LEQ(th->th_seq, tp->snd_lastack) &&
+ TS_GT(to->to_tsecr > tp->ts_recent))
tp->ts_recent = to->to_tsecr;
+ tp->ts_recentts = tick;
}
/*
+ * 3. If timestamps are used calculate the current RTT.
+ */
+ if (to->to_flags & TOF_TS) {
+ rtt = tick - to->to_tsecr;
+ } else if (acked > 0 && tp->snd_rtseq != 0 &&
+ SEQ_GT(th->th_ack, tp->snd_rtseq) &&
+ TAILQ_EMPTY(tp->rcv_trq) && tp->snd_rtoshift == 0) {
+ rtt = tick - tp->snd_rtts;
+ tp->snd_rtseq = 0;
+ } else
+ return;
+
+ /*
+ * 4. If no timestamps are used see whether new data was ack'ed
+ * and if so, calculate the current RTT.
+ *
* We can only measure the RTT if new data was acknowledged.
* That means we can only update the RTT estimates when we
* are sending data.
*
* XXXAO: Not really true with timestamps and a steady receive
* stream.
+ *
+ * Karn's algorithm. Only update on non-retransmitted segments.
+ * Compute the time delta in ticks (1/hz).
+ *
+ * XXXAO: How to deal with retransmits when using timestamps?
*/
- if (acked == 0)
- return;
/*
+ * 5. Update at all?
+ *
* If we haven't sent anything for more than one RTO ignore
* the time measurement or our estimate will be way off.
*/
@@ -2543,35 +2598,85 @@
}
/*
- * Karns algorithm. Only update on non-retransmitted segments.
+ * Remember the lowest RTT we've ever seen.
+ * Must be at least 1 tick.
+ */
+ if (tp->t_rttlowest > rtt)
+ tp->t_rttlowest = max(rtt, 1);
+
+ /*
+ * Recompute the SRTT, RTTVAR and RTO.
*
- * XXXAO: How to deal with retransmits when using timestamps?
+ * XXXAO: Make it pluggable so that different algorithms
+ * can be tested.
*/
+ tp->snd_rto = tcp_do_rto(tp, rtt);
/*
- * Compute the time delta in ticks (1/hz).
+ * We received an ack for a packet that wasn't retransmitted;
+ * it is probably safe to discard any error indications we've
+ * received recently. This isn't quite right, but close enough
+ * for now (a route might have failed after we sent a segment,
+ * and the return path might not be symmetrical).
+ * XXXAO: Doesn't belong here.
*/
- if (to->to_flags & TOF_TS) {
- rtt = tick - to->to_tsecr;
- } else if (tp->t_rtseq != 0 && SEQ_GT(th->th_ack, tp->t_rtseq) &&
- TAILQ_EMPTY(tp->rcv_trq) && tp->snd_rtoshift == 0) {
- rtt = tick - tp->t_rtseq;
- tp->t_rtseq = 0;
- } else
- return;
+ tp->t_softerror = 0;
/*
- * Limit delta to some reasonable amount.
+ * Statistics.
*/
- rtt = min(60*hz, max(1, rtt));
+ tp->t_rttupdated++;
+ tcpstat.tcps_rttupdated++;
+
+ return;
+}
+
+/*
+ * Compute the SRTT, RTTVAR and return the updated RTO.
+ * RFC1122: section 4.2.3.1
+ * RFC2988: entire document
+ *
+ * External parameters that affect the RTO calculation:
+ * minimum RTO value (fixed sysctl)
+ * maximum RTO value (fixed sysctl)
+ * initial RTO value (fixed sysctl)
+ *
+ * XXXAO: Linux talks about some problem with the RTO algorithm.
+ * Figure out what the problem is.
+ *
+ * XXXAO: The sliding window of eight measurements from RFC793 is
+ * way too little when using timestamps in fast networks.
+ * Average 10ms of measurements and integrate that into a 1000ms
+ * sliding window. The same for the variance. When using timestamps.
+ * Or integrate over one RTO.
+ *
+ * XXXAO: We should use rttlowest as base and all deviations from it
+ * count as RTT variance. Use a squared algorithm to bias it to the
+ * upper level. Trying to calculate the actual RTT is futile and
+ * very volatile. rttlowest is a very good and fairly stable statistic
+ * baseline. One can't get better than speed of light in optical media.
+ * Everything faster than one tick doesn't concern us anyway. Having
+ * stable baseline simplifies and improves a number of statistical
+ * calculations and assumptions. Some magic has to be applied when
+ * a better lower baseline is measured though.
+ */
+static int
+tcp_do_rto(struct tcpcb *tp, int rtt)
+{
+ int delta, rto;
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp != NULL,
+ ("%s: insufficient parameters", __func__));
+
/*
- * Remember the lowest RTT we've ever seen.
+ * Limit delta to some reasonable amount.
*/
- if (tp->t_rttlowest > rtt)
- tp->rttlowest = rtt;
+ rtt = min(60 * hz, max(1, rtt));
/*
+ * 6. Integrate new measurement.
+ *
* Compute smoothed RTT and smoothed RTT variance.
*/
if (tp->t_srtt) {
@@ -2595,7 +2700,7 @@
* rttvar is stored as fixed point with 4 bits after the
* binary point (scaled by 16). The following is
* equivalent to rfc793 smoothing with an alpha of .75
- * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
+ * (rttvar = rttvar * 3/4 + |delta| / 4). This replaces
* rfc793's wired-in beta.
*/
if (delta < 0)
@@ -2616,29 +2721,17 @@
tp->t_rxtshift = 0;
/*
+ * 7. Recompute RTO timer.
+ *
* The retransmit should happen at rtt + 4 * rttvar.
* XXX: Backoff.
* RFC2988, Section 2, Page 2-3, Cases 2.1 through 2.5
*/
- tp->snd_rto = max(((tp->t_srtt >> TCP_RTT_SHIFT) +
- max(4 * (tp->t_rttvar >> TCP_RTTVAR_SHIFT), TCPTV_REXMTMAX)),
- tcp_rexmit_min);
+ rto = max(((tp->t_srtt >> TCP_RTT_SHIFT) +
+ max(4 * (tp->t_rttvar >> TCP_RTTVAR_SHIFT), TCPTV_REXMTMAX)),
+ tcp_rexmit_min);
- /*
- * We received an ack for a packet that wasn't retransmitted;
- * it is probably safe to discard any error indications we've
- * received recently. This isn't quite right, but close enough
- * for now (a route might have failed after we sent a segment,
- * and the return path might not be symmetrical).
- * XXX: Doesn't belong here.
- */
- tp->t_softerror = 0;
-
- /*
- * Statistics.
- */
- tp->t_rttupdated++;
- tcpstat.tcps_rttupdated++;
+ return (rto);
}
/*
@@ -2757,7 +2850,7 @@
*/
int
tcp_do_wu(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
- int tiwin, in acked, int tlen, int sacked)
+ int tiwin, int acked, int tlen, int sacked)
{
KASSERT(tp != NULL && th != NULL,
@@ -2810,14 +2903,14 @@
* RFC793: section 3.7, page 42-44, "Managing the Window"
* RFC1122: section 4.2.2.16
*/
- if (SEQ_DELTA(tp->snd_nxt, tp->snd_una) + tiwin < tp->snd_wnd)
+ if (SEQ_DELTA(tp->snd_nxt, tp->snd_una + acked) + tiwin < tp->snd_wnd)
tcplog("peer shrank the window");
/*
* Update the window and keep track of this update.
*/
tp->snd_wnd = tiwin;
- if (th->th_seq > tp->snd_wu_seq)
+ if (SEQ_GT(th->th_seq, tp->snd_wu_seq))
tp->snd_wu_seq = th->th_seq;
if (tp->snd_wnd > tp->snd_maxwnd)
tp->snd_maxwnd = tp->snd_wnd;
@@ -2936,61 +3029,63 @@
}
/*
- * Determine a reasonable value for maxseg size.
- * If the route is known, check route for mtu.
- * If none, use an mss that can be handled on the outgoing
- * interface without forcing IP to fragment.
+ * Determine a reasonable value for MSS size. If the route is known,
+ * check route for mtu. If none, use an MSS that can be handled on
+ * the outgoing interface without forcing IP to fragment.
* If no route is found, route has no mtu, or the destination
* isn't local, use a default, hopefully conservative size (usually
* 512 or the default IP max size, but no more than the mtu of the
* interface), as we can't discover anything about intervening
* gateways or networks.
- * We also initialize the congestion/slow start window to be a single
- * segment if the destination isn't local.
- * While looking at the routing entry, we also initialize other
- * path-dependent parameters from pre-set or cached values in the
- * routing entry.
+ * RFC793: section x
*
- * Also take into account the space needed for options that we
- * send regularly. Make maxseg shorter by that amount to assure
- * that we can send maxseg amount of data even when the options
- * are present. Store the upper limit of the length of options plus
- * data in maxopd. XXX: No longer needed.
- *
- * NOTE that this routine is only called when we process an incoming
- * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
- *
- * XXXAO:
- * Split up and simplify this function.
- * Move initialization of cached values into its own function.
+ * NB: If no offer received pass as zero.
*/
-void
-tcp_mss(struct tcpcb *tp, int offer)
+uint16_t
+tcp_mss(struct in_conninfo *inc, int offer, int mtuflags)
{
- struct inpcb *inp = tp->t_inpcb;
- struct socket *so = inp->inp_socket;
- u_long bufsize;
- u_long maxmtu;
- int rtt, mss;
- int origoffer = offer;
- int mtuflags = 0;
+ uint16_t mss = 0;
+ uint32_t maxmtu = 0;
+ uint32_t thcmtu = 0;
+ int min_protoh;
+#ifdef INET6
+ int isipv6 = inc->inc_isipv6 ? 1 : 0;
+#endif
+
+ KASSERT(inc != NULL,
+ ("%s: NULL in_conninfo pointer", __func__));
+
#ifdef INET6
- int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+ if (isipv6) {
+ mss = tcp_v6mssdflt;
+ maxmtu = tcp_maxmtu6(inc, mtuflags);
+ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
#endif
- struct hc_metrics_lite metrics;
+ {
+ mss = tcp_mssdflt;
+ maxmtu = tcp_maxmtu(inc, mtuflags);
+ min_protoh = sizeof(struct tcpiphdr);
+ }
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
/*
- * Initialize.
- * If there is no route to sender,
- * we stay with the default mss.
+ * Determine MTU.
*/
- mss = tcp_mssopt(tcpcbtoinc(tp), &mtuflags);
+ if (maxmtu && thcmtu)
+ mss = min(maxmtu, thcmtu) - min_protoh;
+ else if (maxmtu || thcmtu)
+ mss = max(maxmtu, thcmtu) - min_protoh;
+
+ if (offer == 0)
+ return (mss);
/*
* Prevent DoS attack with too small MSS. Round up
* to at least minmss.
*/
offer = max(offer, tcp_minmss);
+
/*
* Sanity check: make sure that maxopd will be large
* enough to allow some data on segments even if the
@@ -3000,14 +3095,61 @@
offer = max(offer, 64);
/*
- * maxopd stores the maximum length of data AND options
- * in a segment; maxseg is the amount of data in a normal
- * segment. We need to store this value (maxopd) apart
- * from maxseg, because now every segment carries options
- * and thus we normally have somewhat less data in segments.
+ * Use a symmetric MSS. It is very unlikely that we
+ * have a different MSS on the way back.
+ *
+ * XXXAO: More comment
*/
- tp->snd_mss = mss = min(mss, offer);
- tp->t_maxopd = mss;
+ mss = min(mss, offer);
+
+ return (mss);
+}
+
+/*
+ * Return the initial congestion window for a new connection or
+ * after an idle timeout.
+ * RFC3390: entire document
+ *
+ * min(4*MSS, max(2*MSS, 4380 bytes))
+ *
+ * NB: MSS must already be initialized.
+ */
+int
+tcp_init_cwnd(struct tcpcb *tp)
+{
+ int cwnd;
+
+ if (tcp_do_rfc3390)
+ cwnd = min(4 * tp->snd_mss, max(2 * tp->snd_mss, 4380));
+#ifdef INET6
+ else if (isipv6 && in6_localaddr(&inp->in6p_faddr))
+ cwnd = tp->snd_mss * ss_fltsz_local;
+#endif
+ else if (in_localaddr(inp->inp_faddr))
+ cwnd = tp->snd_mss * ss_fltsz_local;
+ else
+ cwnd = tp->snd_mss * ss_fltsz;
+
+ return (cwnd);
+}
+
+/*
+ * Prime some TCP variables from cached values.
+ */
+static void
+tcp_init_values(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+ u_long bufsize;
+ u_long maxmtu;
+ int rtt, mss;
+ int origoffer = offer;
+ int mtuflags = 0;
+#ifdef INET6
+ int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+#endif
+ struct hc_metrics_lite metrics;
/*
* rmx information is now retrieved from tcp_hostcache.
@@ -3038,17 +3180,6 @@
min(tp->snd_wnd, so->so_snd.sb_hiwat)));
else
#endif
- if (tcp_do_rfc3390)
- tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-#ifdef INET6
- else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
- (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
- else if (in_localaddr(inp->inp_faddr))
-#endif
- tp->snd_cwnd = mss * ss_fltsz_local;
- else
- tp->snd_cwnd = mss * ss_fltsz;
/*
* If there's a pipesize, change the socket buffer to that size,
@@ -3125,40 +3256,3 @@
tp->t_flags |= TF_TSO;
}
-
-/*
- * Determine the MSS option to send on an outgoing SYN.
- */
-int
-tcp_mssopt(struct in_conninfo *inc, int mtuflags)
-{
- int mss = 0;
- u_long maxmtu = 0;
- u_long thcmtu = 0;
- size_t min_protoh;
-#ifdef INET6
- int isipv6 = inc->inc_isipv6 ? 1 : 0;
-#endif
-
- KASSERT(inc != NULL, ("%s: NULL in_conninfo pointer", __func__));
-
-#ifdef INET6
- if (isipv6) {
- mss = tcp_v6mssdflt;
- maxmtu = tcp_maxmtu6(inc, mtuflags);
- min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
- } else
-#endif
- {
- mss = tcp_mssdflt;
- maxmtu = tcp_maxmtu(inc, mtuflags);
- min_protoh = sizeof(struct tcpiphdr);
- }
- thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
- if (maxmtu && thcmtu)
- mss = min(maxmtu, thcmtu) - min_protoh;
- else if (maxmtu || thcmtu)
- mss = max(maxmtu, thcmtu) - min_protoh;
-
- return (mss);
-}
==== //depot/projects/tcp_new/netinet/tcp_output.c#4 (text+ko) ====
@@ -27,11 +27,9 @@
* SUCH DAMAGE.
*
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.139 2007/07/01 11:38:27 gnn Exp $
*/
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_output.c,v 1.145 2007/11/30 23:46:51 bz Exp $");
-
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
@@ -74,85 +72,82 @@
#include <netinet/tcp_debug.h>
#endif
-#ifdef IPSEC
+#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
-#endif /*IPSEC*/
+#endif /*FAST_IPSEC*/
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
-#ifdef notyet
-extern struct mbuf *m_copypack();
-#endif
-
-int path_mtu_discovery = 1;
+int tcp_do_pmtud = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
- &path_mtu_discovery, 1, "Enable Path MTU Discovery");
-
-int ss_fltsz = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
- &ss_fltsz, 1, "Slow start flight size");
+ &tcp_do_pmtud, 1, "Enable Path MTU Discovery");
-int ss_fltsz_local = 4;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
- &ss_fltsz_local, 1, "Slow start flight size for local networks");
-
-int tcp_do_newreno = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
- &tcp_do_newreno, 0, "Enable NewReno Algorithms");
-
int tcp_do_tso = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
- &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
+ &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
int tcp_do_autosndbuf = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
- &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+ &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
int tcp_autosndbuf_inc = 8*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
- &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
+ &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
int tcp_autosndbuf_max = 256*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
- &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+ &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
-
/*
* Tcp output routine: figure out what should be sent and send it.
+ *
+ * 1. How much to send, if any
+ * 1.1 subject to Nagle's algorithm (don't send small segments)
+ * 1.2 subject to send window
+ * 1.3 subject to congestion window
+ * 2. Send window probe (persist mode)
+ * 3. Send an outstanding ACK
+ * 3.1 subject to delayed ack
+ * 4. Send a window update
+ * 4.1 subject to silly window avoidance
+ * 4.2 subject to delayed ack
+ * 5. Send retransmit
+ * 6. Send urgent data
+ * 7. Send based on flags
*/
int
tcp_output(struct tcpcb *tp)
{
- struct socket *so = tp->t_inpcb->inp_socket;
- long len, recwin, sendwin;
- int off, flags, error;
- struct mbuf *m;
- struct ip *ip = NULL;
- struct ipovly *ipov = NULL;
- struct tcphdr *th;
+ int off, flags, error, optlen;
+ tcp_win len, recwin, swin;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+ struct tcphdr ths;
+ struct tcpopt to;
u_char opt[TCP_MAXOLEN];
- unsigned ipoptlen, optlen, hdrlen;
-#ifdef IPSEC
- unsigned ipsec_optlen = 0;
+#ifdef TCP_SIGNATURE
+ int sigoff = 0;
#endif
- int idle, sendalot;
- int sack_rxmit, sack_bytes_rxmt;
- struct sackhole *p;
- int tso = 0;
- struct tcpopt to;
-#if 0
- int maxburst = TCP_MAXBURST;
-#endif
-#ifdef INET6
- struct ip6_hdr *ip6 = NULL;
- int isipv6;
+ INP_LOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tp->t_state > TCPS_LISTEN,
+ ("%s: TCPS_LISTEN invalid", __func__));
+ KASSERT(tp->t_state != TCPS_SYN_RECEIVED,
+ ("%s: TCPS_SYN_RECEIVED invalid", __func__));
+ KASSERT(tp->t_state < TCPS_TIME_WAIT,
+ ("%s: TCPS_TIME_WAIT invalid", __func__));
- isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
-#endif
+ KASSERT(SEQ_GEQ(tp->snd_rxmit, tp->snd_una),
+ ("%s: snd_rxmit < snd_una", __func__))
+ KASSERT(SEQ_LEQ(tp->snd_rxmit, tp->snd_nxt),
+ ("%s: snd_rxmit > snd_nxt", __func__))
- INP_LOCK_ASSERT(tp->t_inpcb);
+ /*
+ * Get standard flags. Removal of inappropriate flags for a
+ * specific segment is handled by the segmentation code.
+ */
+ flags = tcp_outflags[tp->t_state];
/*
* Determine length of data that should be transmitted,
@@ -160,792 +155,610 @@
* If there is some data or critical controls (SYN, RST)
* to send, then transmit; otherwise, investigate further.
*/
- idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
- if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
- /*
- * We have been idle for "a while" and no acks are
- * expected to clock out any data we send --
- * slow start to get ack "clock" running again.
- *
- * Set the slow-start flight size depending on whether
- * this is a local network or not.
- */
- int ss = ss_fltsz;
-#ifdef INET6
- if (isipv6) {
- if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
- ss = ss_fltsz_local;
- } else
-#endif /* INET6 */
- if (in_localaddr(tp->t_inpcb->inp_faddr))
- ss = ss_fltsz_local;
- tp->snd_cwnd = tp->t_maxseg * ss;
- }
- tp->t_flags &= ~TF_LASTIDLE;
- if (idle) {
- if (tp->t_flags & TF_MORETOCOME) {
- tp->t_flags |= TF_LASTIDLE;
- idle = 0;
- }
- }
-again:
- /*
- * If we've recently taken a timeout, snd_max will be greater than
- * snd_nxt. There may be SACK information that allows us to avoid
- * resending already delivered data. Adjust snd_nxt accordingly.
- */
- if ((tp->t_flags & TF_SACK_PERMIT) &&
- SEQ_LT(tp->snd_nxt, tp->snd_max))
- tcp_sack_adjust(tp);
- sendalot = 0;
- off = tp->snd_nxt - tp->snd_una;
- sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+
+
- flags = tcp_outflags[tp->t_state];
/*
- * Send any SACK-generated retransmissions. If we're explicitly trying
- * to send out new data (when sendalot is 1), bypass this function.
- * If we retransmit in fast recovery mode, decrement snd_cwnd, since
- * we're replacing a (future) new transmission with a retransmission
- * now, and we previously incremented snd_cwnd in tcp_input().
+ * We have been idle for "a while" and no acks are
+ * expected to clock out any data we send --
+ * slow start to get ack "clock" running again.
+ *
+ * Set the slow-start flight size depending on whether
+ * this is a local network or not.
*/
- /*
- * Still in sack recovery , reset rxmit flag to zero.
- */
- sack_rxmit = 0;
- sack_bytes_rxmt = 0;
- len = 0;
- p = NULL;
- if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) &&
- (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
- long cwin;
-
- cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
- if (cwin < 0)
- cwin = 0;
- /* Do not retransmit SACK segments beyond snd_recover */
- if (SEQ_GT(p->end, tp->snd_recover)) {
- /*
- * (At least) part of sack hole extends beyond
- * snd_recover. Check to see if we can rexmit data
- * for this hole.
- */
>>> TRUNCATED FOR MAIL (1000 lines) <<<
More information about the p4-projects
mailing list