PERFORCE change 142643 for review
Andre Oppermann
andre at FreeBSD.org
Sat May 31 20:02:36 UTC 2008
http://perforce.freebsd.org/chv.cgi?CH=142643
Change 142643 by andre at andre_flirtbox on 2008/05/31 20:01:48
WIP Checkpoint of tcp_do_segment() reworking.
o Some further major rototiling.
o Get data flow structure firmly in place.
o Add extensive comments.
o Add extensive RFC references to almost any part.
o Validate a large part against relevant RFCs.
o Remaining work and unclarities marked with XXXAO.
o Does not yet compile.
o Window update and timestamp code not yet complete.
Affected files ...
.. //depot/projects/tcp_new/netinet/tcp_input.c#5 edit
Differences ...
==== //depot/projects/tcp_new/netinet/tcp_input.c#5 (text+ko) ====
@@ -154,7 +154,10 @@
static void tcp_do_segment(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *, int, int);
static void tcp_do_time(struct tcpcb *tp, struct tcphdr *th,
- struct tcpopt *to);
+ struct tcpopt *to, int acked, int tlen);
+static void tcp_do_urg(struct tcpcb *tp, struct tcphdr *th, int tlen);
+static void tcp_do_wu(struct tcpcb *tp, struct tcphdr *th,
+ struct tcpopt *to, int tiwin, int tlen);
static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
struct tcpcb *, int, int);
static void tcp_pulloutofband(struct socket *,
@@ -881,24 +884,30 @@
INP_INFO_WLOCK_ASSERT(&tcbinfo);
INP_LOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN,
- ("%s: TCPS_LISTEN", __func__));
+ ("%s: TCPS_LISTEN invalid", __func__));
KASSERT(tp->t_state != TCPS_TIME_WAIT,
- ("%s: TCPS_TIME_WAIT", __func__));
+ ("%s: TCPS_TIME_WAIT invalid", __func__));
/*
- * Store the flags in a variable for easy manipulation.
+ * Store the flags in a variable for easy manipulation
+ * and because we won't have access to th->th_flags in
+ * later stages.
*/
thflags = th->th_flags;
/*
* Unscale the window into a 32-bit value.
+ * RFC1122: section 4.2.2.3
+ * RFC1323bis: section 2.3
*
- * NB: For the SYN_SENT state the scale is zero.
+ * NB: In SYN_SENT state the scale is zero.
*/
tiwin = th->th_win << tp->snd_scale;
/*
* Parse options on any incoming segment (if present).
+ * RFC793: section 3.1, page 17-19
+ * RFC1122: section 4.2.2.5
*/
if ((th->th_off << 2) != sizeof(struct tcphdr))
tcp_dooptions(&to, (u_char *)(th + 1),
@@ -908,29 +917,31 @@
to.to_flags = 0;
/*
- * Normalize timestamp if syncookies were used when this
+ * Normalize our timestamp if syncookies were used when this
* connection was established.
*/
if (to.to_flags & TOF_TS)
to.to_tsecr -= tp->ts_offset;
/*
- * Calculate amount of space in receive window.
- * Receive window is amount of space in rcv queue,
- * but not less than advertised window.
+ * Calculate amount of space in our receive window.
+ * Receive window is the amount of space in rcv queue,
+ * but not less than last advertised window.
+ * RFC793: section 3.7, page 42-44
+ * RFC1122: section 4.2.2.16
*/
rwin = sbspace(&so->so_rcv);
- rwin = imax(rwin, (int)(tp->rcv_adv - tp->rcv_nxt));
+ rwin = imax(rwin, (int)(tp->rcv_advwin - tp->rcv_nxt));
/*
- * Validation checks. We may get any shit here. Have to be careful.
+ * Validation checks on any incoming segment.
+ * We may get anything here. Have to be careful.
*/
switch (tp->t_state) {
/*
- * If the state is SYN_RECEIVED:
- * syncache handled all validation, socket, inpcb and tcpcb
- * setup for us. All that is left is the state transition
- * into established state and initializations of the timers.
+ * Syncache handled all validation, socket, inpcb and tcpcb
+ * setup for us. All that is left is the state transition
+ * into established state and initializations of the timers.
*/
case TCPS_SYN_RECEIVED:
tp->t_starttime = ticks;
@@ -943,74 +954,104 @@
break;
/*
- * If the state is SYN_SENT:
- * if seg contains a RST, then drop the connection.
- * if seg does not contain SYN and ACK, then drop it.
- * if seg contains an ACK, but not for our SYN, drop the input.
- * Otherwise this is an acceptable SYN segment
- * initialize tp->rcv_nxt and tp->irs
- * if seg contains ack then advance tp->snd_una
- * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
- * arrange for segment to be acked (eventually)
- * continue processing rest of data/controls, beginning with URG
+ * Validate the returned SYN-ACK, process the negotiated
+ * options, complete the initialization of the tcpcb and
+ * transition into ESTABLISHED state.
*/
case TCPS_SYN_SENT:
/*
- * RST is handled below.
+ * RST is handled separately below.
+ * RFC793: section 3.9, page 66-67, second check
*/
if (thflags & TH_RST)
break;
/*
- * SYN|ACK must be present.
+ * FIN is not valid yet and the segment is to be ignored.
+ * RFC793: section 3.9, page 75, eighth check, first paragraph
+ */
+ if (thflags & TH_FIN) {
+ tcplog("FIN invalid, segment ignored");
+ goto drop;
+ }
+
+ /*
+ * SYN must be present.
+ * RFC793: section 3.9, page 67-68, fourth and fifth check
+ *
+ * NB: We have to remove SYN from thflags to
+ * prevent the later bogus SYN check from
+ * triggering.
*/
- if (thflags & (TH_SYN|TH_ACK) != (TH_SYN|TH_ACK)) {
- tcplog("Missing SYN|ACK, segment ignored");
+ if (!(thflags & TH_SYN)) {
+ tcplog("Missing SYN, segment ignored");
goto drop;
}
+ thflags &= ~TH_SYN; /* SYN is processed. */
+
+ /*
+ * ACK must be present.
+ * RFC793: section 3.9, page 67-68, fourth check
+ *
+ * XXXAO: Simultaneous open to be handled.
+ * RFC1122: section 4.2.2.10
+ */
+ if (!(thflags & TH_ACK)) {
+ tcplog("Missing ACK, segment rejected");
+ goto dropwithreset;
+ }
/*
- * ACK must ack our ISN and any data we may
- * have sent with our SYN.
+ * ACK must ack our ISN, SYN and possibly data we may
+ * have sent with our SYN. The latter is not strictly
+ * necessary as many implementations chose to ignore
+ * data in a SYN. We want to make use of it again in
+ * a future incarnation of T/TCP with a 'quick' 3WHS.
+ * RFC793: section 3.9, page 66, first check
*/
- if (SEQ_LEQ(th->th_ack, tp->snd_iss) ||
- SEQ_GEQ(th->th_ack, tp->snd_nxt) ||
- SEQ_LT(th->th_ack, tp->snd_una)) {
- tcplog("Incorrect ACK, segment rejected");
- /* XXXAO: Close connection? Or ignore. */
+ if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, tp->snd_nxt)) {
+ tcplog("Incorrect ACK, segment ignored");
goto dropwithreset;
}
/*
* Option processing:
*
- * If there wasn't a MSS option fall back to
- * default mss.
+ * If there wasn't a MSS option fall back to default mss.
+ * tcp_mss will calculate the largest possible MSS for us.
+ * The MSS option is not a fully negotiated option and may
+ * be returned even if we haven't sent it with our initial
+ * SYN. It is not common practice to do so however.
+ * RFC793: section 3.1, page 18-19
+ * RFC1122: section 4.2.2.6
+ * RFC1191: section 3.1
*/
- if (!(tp->t_flags & TF_NOOPT) && (to.to_flags & TOF_MSS))
+ if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
- else if (tcp_do_path_mtu_discovery)
- /* MTU of interface... */
else
tcp_mss(tp, tcp_mssdflt);
/*
* Do window scaling on this connection?
+ * RFC1323bis: section 2.2 and 2.3
*
- * NB: According to RFC1323 the window field
- * in a SYN (i.e., a <SYN> or <SYN,ACK>)
- * segment itself is never scaled.
+ * NB: The window field in a SYN (i.e., a <SYN>
+ * or <SYN,ACK>) segment itself is never scaled.
*/
if ((tp->t_flags & TF_WINSCALE) &&
(to.to_flags & TOF_SCALE)) {
tp->snd_scale = to.to_wscale;
} else if (tp->t_flags & TF_WINSCALE) {
+ /* No window scaling. */
tp->t_flags &= ~TF_WINSCALE;
+ tp->snd_scale = 0;
tp->rcv_scale = 0;
} else if (to.to_flags & TOF_SCALE) {
/*
* The remote end doesn't play right with us
* and introduces options we haven't sent.
+ * RFC1323bis: section 1.3, second paragraph
*/
tcplog("Window Scaling Option unexpected, "
"connection aborted");
@@ -1022,6 +1063,7 @@
/*
* Do timestamps on this connection?
+ * RFC1323bis: section 3.2, first and last sentence
*/
if ((tp->t_flags & TF_TIMESTAMP) &&
!(to.to_flags & TOF_TS))
@@ -1031,8 +1073,9 @@
/*
* The remote end doesn't play right with us
* and introduces options we haven't sent.
+ * RFC1323bis: section 1.3, second paragraph
*/
- tcplog("Timestamp unexpected, "
+ tcplog("Timestamp Option unexpected, "
"connection aborted");
tp->t_error = ENETRESET;
tp = tcp_close(tp);
@@ -1042,6 +1085,7 @@
/*
* Do SACK on this connection?
+ * RFC2018: section 2
*/
if ((tp->t_flags & TF_SACK_PERMIT) &&
!(to.to_flags & TOF_SACKPERM))
@@ -1051,6 +1095,7 @@
/*
* The remote end doesn't play right with us
* and introduces options we haven't sent.
+ * RFC2018: section 1, page 2, last paragraph
*/
tcplog("SACK Permitted unexpected, "
"connection aborted");
@@ -1062,20 +1107,24 @@
/*
* Initialize receive structure.
+ * XXXAO: TODO
*/
tp->rcv_adv += rwin; /* XXX */
tp->irs = th->th_seq;
tp->rcv_up = th->th_seq;
tcp_rcvseqinit(tp);
+ tcp_init_rcv(tp, seq); /* XXXAO */
+ tcp_init_snd(tp, ack); /* XXXAO */
+
/*
* Process SYN and integrate sequence number.
+ * XXXAO: TODO
*/
tp->snd_una++;
tp->snd_wu_seq = th->th_seq;
tp->snd_wu_ack = th->th_ack;
th->th_seq++; /* SYN is acked */
- thflags &= ~TH_SYN; /* SYN is processed */
tp->t_starttime = ticks;
tp->t_state = TCPS_ESTABLISHED;
@@ -1089,8 +1138,12 @@
tcpstat.tcps_connects++;
break;
+
/*
* All other states where a connection was established before.
+ * The tests are ordered by their knock-out precedence.
+ * The simplest checks that fail a segment come first.
+ * The logical ordering of RFC793 is still maintained.
*/
case TCPS_ESTABLISHED:
case TCPS_CLOSE_WAIT:
@@ -1100,12 +1153,15 @@
case TCPS_FIN_WAIT_2:
/*
* SYN and RST are handled separately below.
+ * RST takes precedence in all flag combinations.
+ * RFC793: section 3.9, page 70-71
*/
if (thflags & (TH_SYN|TH_RST))
break;
/*
* Segments without ACK are invalid.
+ * RFC793: section 3.9, page 72, fifth check, first sentence
*/
if (!(thflags & TH_ACK)) {
tcplog("ACK missing, segment ignored");
@@ -1113,78 +1169,145 @@
}
/*
- * Don't accept ack'ing of older than previously ack'd data.
- * XXXAO: Careful with out-of-order data. Must check seq too.
- * reordering and bidirectional data transfer.
- * XXXAO: Is this check really useful?
+ * Don't accept missing TS when TS was negotiated and
+ * vice versa.
+ * RFC1323bis: section 3.2, last paragraph, last sentence
*/
- if (SEQ_LT(th->th_ack, tp->snd_una) &&
- SEQ_LT(th->th_seq, tp->rcv_nxt)) {
- tcplog("Acking old data, segment ignored, "
- "sending challenge ACK");
- goto dropafterack;
+ if ((tp->t_flags & TF_TIMESTAMP) && !(to.to_flags & TOF_TS)) {
+ tcplog("Timestamp missing, segment ignored");
+ goto drop;
+ }
+ if (!(tp->t_flags & TF_TIMESTAMP) && (to.to_flags & TOF_TS)) {
+ tcplog("Timestamp unexpected, segment ignored");
+ goto drop;
}
/*
- * Don't accept ack'ing of more than actually sent data.
+ * Don't accept SACK when it wasn't negotiated at
+ * connection setup time.
+ * RFC2018: section 1, page 2, last paragraph
*/
- if (SEQ_GT(th->th_ack, tp->snd_max)) {
- tcplog("Acking data not yet sent, segment ignored, "
- "sending challenge ACK");
- tcpstat.tcps_rcvacktoomuch++;
- goto dropafterack;
+ if ((to.to_flags & TOF_SACK) &&
+ !(tp->t_flags & TF_SACK_PERM)) {
+ tcplog("SACK unexpected, segment ignored");
+ goto drop;
}
/*
- * Don't accept start of SEQ beyond receive window.
- * Allow for a window probe with one byte.
- * XXXAO: Window probe statistics.
+ * PAWS: Protection against wrapped sequence numbers.
+ *
+ * Don't accept remote ts older than already seen,
+ * or reflected ts newer than what we sent last.
+ *
+ * We store the receive time as uptime with second
+ * resolution. This makes us independent from the
+ * wrap-around after 2^32 / hz (24.8 days at 1ms hz).
+ *
+ * RFC1323bis: section 4.2.1 and 4.2.3
*/
- if (SEQ_GT(th->th_seq, tp->rcv_nxt + tp->rcv_win)) {
- tcplog("Data beyond window, segment ignored, "
- "sending challenge ACK");
- goto dropafterack;
- }
+ if (to.to_flags & TOF_TS) {
+ struct bintime bt;
- /*
- * Don't accept too old retransmits.
- * XXXAO: Use largest window we've ever sent.
- * sb_hiwat is pretty much that. We normally
- * don't shrink the receive socket buffer.
- */
- if (SEQ_LT(th->th_seq,
- tp->rcv_nxt - so->so_rcv.sb_hiwat - tlen)) {
- tcplog("Too old retransmit, segment ignored, "
- "sending challenge ACK");
- goto dropafterack;
+ getbinuptime(&bt);
+ if (bt.sec - tp->t_rcvtime < ((tcp_ts)0x0 - 1) / hz) {
+ if (TSTMP_LT(to.to_tsval, tp->snd_tsecr) {
+ tcplog("Timestamp too old, "
+ "sending challenge ack");
+ goto dropafterack;
+ }
+ if (TSTMP_GT(to.to_tsecr, tp->snd_tsval) {
+ tcplog("Timestamp too new, "
+ "sending challenge ack");
+ goto dropafterack;
+ }
+ }
}
/*
- * Don't accept missing TS when TS was negotiated and
- * vice versa.
+ * Validate the segment against the window. We don't accept
+ * segments with SEQ totally outside of receive window, unless
+ * one of the special cases described below applies. Any data
+ * portion hanging outside the window to either side will be
+ * chopped off later.
+ * RFC793: section 3.3, page 24-26
+ * RFC793: section 3.9, page 69, first check, the four cases
+ *
+ * Window probes are sent when we advertised a zero window
+ * because the receive socket buffer is full. When the
+ * application reads enough data from it we send a window
+ * update. This is not reliable though and the remote host
+ * periodically probes if we've got space again. It does
+ * this by sending one byte of data. We let this one byte
+ * through, truncate it later and process the remainder of
+ * the segment including any options. This way timers are
+ * updated properly. TH_FIN is let through even on a zero
+ * receive window.
+ * RFC793: section 3.7, page 42, seventh and eighth paragraph
+ * RFC1122: section 4.2.2.17
+ *
+ * Urgent data in segments must be processed even if the window
+ * is partially or completely closed. Any normal data will
+ * be truncated and not processed depending on the current
+ * size of rwin.
+ * RFC793: section 3.3, page 26, third paragraph
+ *
+ * A keepalive is sent to solicit a response on an otherwise
+ * idle connection. Two methods exist: send a segment with
+ * seq=rcv_nxt-1 without data, or with exactly one byte of
+ * (bogus) data. Normally this segment would be rejected
+ * and a resynchronization ACK be sent (as intended). However
+ * we want to further process the segment to update a couple
+ * of timers and to further look at possible TCP options.
+ * RFC1122: section 4.2.3.6
*/
- if ((tp->t_flags & TF_TIMESTAMP) && !(to.to_flags & TOF_TS)) {
- tcplog("Timestamp missing, segment ignored");
- goto drop;
+ if (SEQ_LT(th->th_seq + tlen, tp->rcv_nxt) ||
+ SEQ_GT(th->th_seq, tp->rcv_nxt + (rwin ? (rwin - 1) : rwin))) {
+ /*
+ * The connection is idle and this
+ * is a keepalive.
+ */
+ if (th->th_seq == tp->rcv_nxt - 1 &&
+ th->th_ack == tp->snd_nxt &&
+ tlen <= 1 && !(thflags & TH_URG)) {
+ if (tlen == 0) {
+ th->th_seq = tp->rcv_nxt;
+ thflags &= ~TH_FIN;
+ }
+ tp->t_flags |= TF_ACKNOW;
+ tcps.tcps_rcv_keepalive++;
+ } else if ((thflags & TH_URG) && th->th_urg && tlen &&
+ th->th_seq == tp->rcv_nxt) {
+ /* Continue. XXXAO: tighter check. */
+ } else {
+ tcplog("Data outside window, segment ignored, "
+ "sending challenge ACK");
+ goto dropafterack;
+ }
+ /*
+ * NB: Continue with segment to update
+ * last received timestamp, to capture
+ * window updates, ACKs and congestion
+ * control algorithms. Any data above
+ * the window is truncated later.
+ */
}
- if (!(tp->t_flags & TF_TIMESTAMP) && (to.to_flags & TOF_TS)) {
- tcplog("Timestamp unexpected, segment ignored");
- goto drop;
- }
/*
- * Don't accept remote ts older than already seen,
- * reflected ts newer than what we send last.
- *
- * TODO-AO:
- * PAWS
+ * Don't accept ack'ing of more than actually sent data.
+ * Neither accept segments with a too old ACK.
+ * RFC793: section 3.9, page 72, fifth check, ESTABLISHED STATE,
+ * first paragraph, last sentence
+ * tcpsecure: section 5.2, mitigation of blind data injection
+ * tcpsecure: section 5.2 changes this to just drop
+ * XXXAO: why?
*/
- if ((to.to_flags & TOF_TS) &&
- ticks - tp->t_rcvtime < PAWS &&
- (!TSTMP_LT(to.to_tsval, tp->snd_tsecr)) ||
- TSTMP_GT(to.to_tsecr, tp->snd_tsval))) {
- tcplog("Timestamp too old or new, segment ignored");
- goto drop;
+ if (SEQ_GEQ(th->th_ack, tp->snd_nxt) ||
+ SEQ_LT(th->th_ack, tp->snd_una - tp->snd_maxwnd)) {
+ tcplog("Acking data not yet sent or too old, "
+ "segment ignored, sending challenge ACK");
+ tcpstat.tcps_rcvacktoomuch++;
+ goto dropafterack;
+ /* goto drop; */
}
/*
@@ -1195,20 +1318,11 @@
*/
if (SEQ_GT(th->th_seq, tp->snd_lastack) &&
SEQ_LT(th->th_seq, tp->rcv_nxt) {
- tcplog("Received retransmit before we sent delayed ACK, no action");
+ tcplog("Received retransmit before we sent delayed ACK,"
+ " no action");
}
- /*
- * Don't accept SACK when is wasn't negotiated at
- * connection setup time.
- */
- if ((to.to_flags & TOF_SACK) &&
- !(tp->t_flags & TF_SACK_PERM)) {
- tcplog("SACK unexpected, segment ignored");
- goto drop;
- }
-
- /* XXX: stats */
+ /* XXXAO: stats? */
break;
/*
@@ -1226,6 +1340,8 @@
* expensive MD5 hash computation.
* In SYN_RECEIVED case syncache verified the signature
* already.
+ * RFC2385: section 2.0, 3.0
+ * XXXAO: Make work.
*/
if ((tp->t_flags & TF_SIGNATURE) && notalreadydone) {
/* Copy signature and compare. */
@@ -1241,64 +1357,83 @@
/*
* Fast path for ACK-only segments.
*/
- if (tlen == 0 && (thflags & (TH_ACK|TH_RST|TH_SYN)) == TH_ACK)
+ if (tlen == 0 && (thflags & (TH_ACK|TH_RST|TH_SYN|TH_URG)) == TH_ACK)
goto doack;
/*
* Handle SYN and RST flags for existing connections.
*
- * NB: The SYN_SENT case has removed the SYN bit from thflags
- * if the segment was accepted.
+ * NB: The SYN_SENT case has removed SYN from thflags.
*/
if (thflags & TH_RST) {
/*
- * Any RST after TCPS_SYN_SENT must NOT carry the ACK flag.
- * RFC 793 page 65, section SEGMENT ARRIVES.
+ * Filter out what we determine NOT to be legitimate RST's.
*/
- if (tp->t_state > TCPS_SYN_SENT &&
- (thflags & TH_ACK)) {
- tcplog("RST with ACK invalid, segment ignored");
- tcpstat.tcps_badrst++;
- goto drop;
- }
- /*
- * Check if the sequence number is NOT acceptable to us.
- */
- if (tp->t_state == TCPS_SYN_SENT) {
+ switch (tp->t_state) {
+ case TCPS_SYN_SENT:
/*
* In TCPS_SYN_SENT the RST MUST carry the ACK flag.
+ * RFC793: section 3.9, page 66, first check
*/
if (!(thflags & TH_ACK)) {
tcplog("RST without ACK invalid, "
"segment ignored");
goto drop;
}
- if (th->th_ack != tp->snd_iss + 1) {
- /*
- * XXX: Account for end of window
- * if we had data sent with SYN.
- */
+ /*
+ * The ACK must be within what we sent but does
+ * not have to ACK the SYN.
+ * RFC793: section 3.9, page 66, first check
+ */
+ if (SEQ_LT(th->th_ack, tp->snd_una) ||
+ SEQ_GT(th->th_ack, th->snd_nxt)) {
tcplog("RST does not match, segment ignored");
tcpstat.tcps_badrst++;
goto drop;
}
- } else if (tcp_insecure_rst == 0 &&
- (SEQ_DELTA(th->th_seq, tp_rcv_nxt) > 1 ||
- SEQ_DELTA(th->th_seq, tp_snd_last_ack) > 1)) {
- tcplog("RST does not match (secure), segment ignored");
- tcpstat.tcps_badrst++;
- goto drop;
- } else if (tcp_insecure_rst == 1 &&
- (SEQ_LT(th->th_seq, tp->snd_last_ack - 1) ||
- SEQ_GT(th->th_seq, tp->snd_last_ack + rwin))) {
- tcplog("RST does not match (insecure), segment ignored");
- tcpstat.tcps_badrst++;
- goto drop;
+ break;
+
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ /*
+ * Any RST after TCPS_SYN_SENT must NOT carry the ACK flag.
+ * RFC 793: page 65
+ */
+ if (thflags & TH_ACK) {
+ tcplog("RST with ACK invalid, segment ignored");
+ tcpstat.tcps_badrst++;
+ goto drop;
+ }
+ /*
+ * Check if the sequence number is NOT acceptable to us.
+ * RFC793: page 70, second check
+ * RFC4953: section 3.1.2 (discussion of various methods)
+ * XXXAO: Three points: rcv_nxt, snd_last_ack, rcv_nxt+rwin?
+ * XXXAO: Description of +-1 variance.
+ */
+ if (tcp_secure_rst &&
+ (SEQ_DELTA(th->th_seq, tp->rcv_nxt) > 1 ||
+ SEQ_DELTA(th->th_seq, tp->snd_last_ack) > 1)) {
+ tcplog("RST does not match (secure), segment ignored");
+ tcpstat.tcps_badrst++;
+ goto drop;
+ } else if (!tcp_secure_rst &&
+ (SEQ_LT(th->th_seq, tp->snd_last_ack - 1) ||
+ SEQ_GT(th->th_seq, tp->snd_last_ack + rwin))) {
+ tcplog("RST does not match (insecure), segment ignored");
+ tcpstat.tcps_badrst++;
+ goto drop;
+ }
+ break;
}
tcplog("RST received, closing connection");
/*
- * Treat the different states appropriately.
+ * Unwind the connection according to its state.
*/
switch (tp->t_state) {
case TCPS_SYN_SENT:
@@ -1306,54 +1441,82 @@
case TCPS_FIN_WAIT_1:
case TCPS_FIN_WAIT_2:
case TCPS_CLOSE_WAIT:
+ /*
+ * The socket is still around and
+ * we have to inform the application.
+ * RFC1122: section 4.2.2.12 [RST cause]
+ * RFC1122: section 4.2.2.13
+ */
so->so_error = ECONNRESET;
+ /* XXXAO: Macro encapsulating state transition changes? */
tp->t_state = TCPS_CLOSED;
tcpstat.tcps_drops++;
tp = tcp_close(tp);
break;
+
case TCPS_CLOSING:
case TCPS_LAST_ACK:
+ /*
+ * The socket is already gone,
+ * just clean up and be done.
+ */
tp = tcp_close(tp);
break;
}
goto drop;
} else if (thflags & TH_SYN) {
/*
+ * We may get a SYN in these cases:
+ * remote host went down and comes back up
+ * retransmitted SYN-ACK when our ACK was lost
+ * malicious SYN
+ * simultaneous open in SYN_SENT case is handled there
+ *
* Instead of dropping the connection right away
* we send a challenge ACK back. If the connection
* is dead or retried we get back a proper RST which
* will be validated and close the connection then.
* tcp-secure modified recommends behavior to protect
* against "blind in the window" attacks.
+ * tcpsecure: section 4.2
*/
- tcplog("SYN received, segment ignored, sending challenge ACK");
+ tcplog("SYN received, segment ignored, "
+ "sending challenge ACK");
goto dropafterack;
}
/*
- * From here we start looking at the segment content and
- * the meat is real.
+ * From here we start looking at the segment content.
+ * Assert any assumption the code from this point on makes.
*/
KASSERT((thflags & (TH_SYN|TH_RST|TH_ACK)) == TH_ACK,
- ("%s: no ACK flag", __func__));
+ ("%s: no ACK flag or TH_SYN, TH_RST present", __func__));
+ KASSERT(SEQ_GEQ(th->th_seq + tlen, tp->rcv_nxt),
+ ("%s: th_seq+tlen < rcv_nxt", __func__));
+ KASSERT(SEQ_LT(th->th_seq, tp->rcv_nxt + rwin),
+ ("%s: th_seq >= rcv_nxt+rwin", __func__));
/*
* Trim segment on the left, i.e. it starts before rcv_nxt.
+ *
+ * XXXAO: we may have a zero window but have to process URG data.
+ * Trim off all normal payload and leave only the urg data intact.
*/
todrop = tp->rcv_nxt - th->th_seq;
if (todrop > 0) {
- if (todrop > tlen)
- todrop = tlen;
+ KASSERT(todrop <= tlen,
+ ("%s: left todrop > tlen", __func__));
if (todrop == tlen) {
/*
* Any valid FIN must be to the left of the window.
* At this point the FIN must be a duplicate or out
* of sequence; drop it.
+ * XXXAO: Fixup
*/
thflags &= ~TH_FIN;
/*
* Send an ACK to resynchronize and drop any data.
- * But keep on processing for ACK.
+ * But keep on processing for ACK and options.
*/
tp->t_flags |= TF_ACKNOW;
tcpstat.tcps_rcvduppack++;
@@ -1362,7 +1525,12 @@
tcpstat.tcps_rcvpartduppack++;
tcpstat.tcps_rcvpartdupbyte += todrop;
}
- drop_hdrlen += todrop; /* Drop from the front afterwards. */
+ /*
+ * Drop from front later together with
+ * delayed header drop and adjust segment
+ * length variables.
+ */
+ drop_hdrlen += todrop;
th->th_seq += todrop;
tlen -= todrop;
/*
@@ -1376,16 +1544,20 @@
th->th_urp = 0;
}
}
+
/*
* Trim segment to the right, ie. it ends after window,
* drop trailing data (and PUSH and FIN);
* if nothing left, just ACK.
+ * XXXAO: for window probe we may have to trim off the one byte.
*/
- todrop = (th->th_seq + tlen) - (tp->rcv_nxt + rwin);
+ if (!TCPS_HAVERCVDFIN(tp->t_state))
+ todrop = th->th_seq + tlen, tp->rcv_nxt + rwin;
+ else
+ todrop = tlen;
if (todrop > 0) {
- KASSERT(todrop <= tlen, ("%s: todrop > tlen", __func__));
- if (todrop >= tlen) {
- tcpstat.tcps_rcvbyteafterwin += tlen;
+ KASSERT(todrop <= tlen, ("%s: right todrop > tlen", __func__));
+ if (todrop == tlen) {
/*
* If window is closed we can only take segments at
* the window edge, and have to drop data and PUSH
@@ -1396,20 +1568,33 @@
if (rwin == 0 && th->th_seq == tp->rcv_nxt) {
tp->t_flags |= TF_ACKNOW;
tcpstat.tcps_rcvwinprobe++;
- } else {
- tcolog("");
- goto dropafterack;
}
- todrop = tlen;
- } else
- tcpstat.tcps_rcvbyteafterwin += todrop;
- m_adj(m, -todrop);
+ }
+ m_adj(m, -todrop); /* Drop from tail. */
tlen -= todrop;
thflags &= ~(TH_PUSH|TH_FIN);
tcpstat.tcps_rcvpackafterwin++;
- /* XXX: Urgent pointer? */
+ tcpstat.tcps_rcvbyteafterwin += todrop;
+ /* XXXAO: Urgent pointer? */
+ if ((thflags & TH_URG) && th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else if (thflags & TH_URG) {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+
+ /*
+ * Urgent pointer is invalid when we don't have any data.
+ * XXXAO: Not really. Urgent data is a hack.
+ */
+ if ((thflags & TH_URG) && tlen == 0) {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
}
- KASSERT(tlen >= 0, ("%s: tlen < 0", __func__));
+
+ KASSERT(tlen >= 0,
+ ("%s: tlen < 0", __func__));
/*
* If new data is received on a connection after the
@@ -1429,19 +1614,18 @@
doack:
/*
- * Ack processing.
+ * ACK, timing, option and congestion control processing.
*/
- KASSERT(SEQ_GEQ(th->th_ack, tp->snd_una),
- ("%s: th_ack < snd_una", __func__));
-
acked = th->th_ack - tp->snd_una;
+ if (acked < 0)
+ acked = 0;
tcpstat.tcps_rcvackpack++;
tcpstat.tcps_rcvackbyte += acked;
/*
- * Update window information.
+ * Update send window information.
*/
- nudgeoutput = tcp_do_wu(tp, th, tiwin, tlen);
+ nudgeoutput = tcp_do_wu(tp, th, &to, tiwin, tlen);
/*
* Update and recompute connection timing information.
@@ -1450,12 +1634,10 @@
tcp_do_time(tp, th, &to, acked, tlen);
/*
- * Update SACK information.
+ * Update send SACK information.
*/
- if (((tp->t_flags & TF_SACK) && (to.to_flags & TOF_SACK)) ||
- !TAILQ_EMPTY(&tp->snd_holes)) {
+ if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
tcp_sack_doack(tp, &to, th->th_ack);
- }
/*
* Update congestion control information.
@@ -1464,10 +1646,16 @@
/*
* Drop acknowledged data from send socket buffer.
+ * RFC793: section 3.9, page 72, fifth check
*/
if (acked > 0)
+ int cantrcvmore;
+
SOCKBUF_LOCK(&so->so_snd);
- KASSERT(acked + 1 <= so->so_snd.sb_cc,
+
+ KASSERT(SEQ_GT(th->th_ack, tp->snd_nxt),
+ ("%s: ", __func__));
+ KASSERT(acked <= so->so_snd.sb_cc + 1,
("%s: more acked than in send buffer", __func__));
/*
@@ -1485,15 +1673,29 @@
tp->snd_wnd -= acked;
ourfinisacked = 0;
}
+
/*
* Advance the unacknowledged pointer.
+ * RFC793: section 3.9, page 72, fifth check
*/
tp->snd_una = th->th_ack;
+
+ /*
+ * Obtain sb_state before unlock for later use.
+ */
+ cantrcvmore = so->so_rcv.sb_state & SBS_CANTRCVMORE;
+
/*
+ * Wake up and inform any writers on the socket.
+ *
* NB: sowwakeup_locked() does an implicit unlock.
*/
sowwakeup_locked(so);
+ /*
+ * When our FIN was ack'ed perform the appropriate
+ * state transitions and release unnecessary resources.
+ */
if (ourfinisacked) {
KASSERT((tp->t_flags & TF_SENTFIN) &&
tp->t_state > TCPS_CLOSE_WAIT &&
@@ -1501,21 +1703,22 @@
("%s: got ack for FIN but haven't sent FIN yet",
__func__));
- switch (tp->t_state) {
/*
- * In FIN_WAIT_1 state enter the FIN-WAIT-2 state.
- * Any transition to CLOSING happens later.
- * XXX: comment.
+ * Handle ack'ed FIN according to previous state.
*/
+ switch (tp->t_state) {
case TCPS_FIN_WAIT_1:
/*
- * If we can't receive any more
- * data, then closing user can proceed.
- * Starting the timer is contrary to the
+ * If we can't receive any more data,
+ * then closing user can proceed.
+ * XXXAO: better description and reference
+ * to discussion.
+ *
+ * NB: Starting the timer is contrary to the
* specification, but if we don't get a FIN
* we'll hang forever.
*/
- if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ if (cantrcvmore) {
soisdisconnected(so);
tcp_timer_activate(tp, TT_2MSL,
(tcp_fast_finwait2_recycle ?
@@ -1526,12 +1729,12 @@
tp->t_state = TCPS_FIN_WAIT_2;
break;
- /*
- * In CLOSING state enter the TIME-WAIT state.
- * tcp_twstart() discards this tcpcb and creates
- * a compressed state.
- */
case TCPS_CLOSING:
+ /*
+ * Create a compressed TIME-WAIT state
+ * with minimal information and discard
+ * this tcpcb to save memory.
+ */
tcp_twstart(tp);
tp = NULL;
INP_INFO_WUNLOCK(&tcbinfo);
@@ -1540,23 +1743,21 @@
goto done;
break;
- /*
- * In LAST_ACK, we may still be waiting for data
- * to drain and/or to be acked, as well as for
>>> TRUNCATED FOR MAIL (1000 lines) <<<
More information about the p4-projects
mailing list