PERFORCE change 142643 for review

Andre Oppermann andre at FreeBSD.org
Sat May 31 20:02:36 UTC 2008


http://perforce.freebsd.org/chv.cgi?CH=142643

Change 142643 by andre at andre_flirtbox on 2008/05/31 20:01:48

	WIP Checkpoint of tcp_do_segment() reworking.
	
	o Some further major rototilling.
	o Get data flow structure firmly in place.
	o Add extensive comments.
	o Add extensive RFC references to almost any part.
	o Validate a large part against relevant RFCs.
	o Remaining work and open questions marked with XXXAO.
	o Does not yet compile.
	o Window update and timestamp code not yet complete.

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_input.c#5 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_input.c#5 (text+ko) ====

@@ -154,7 +154,10 @@
 static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 		     struct socket *, struct tcpcb *, int, int);
 static void	 tcp_do_time(struct tcpcb *tp, struct tcphdr *th,
-		     struct tcpopt *to);
+		     struct tcpopt *to, int acked, int tlen);
+static void	 tcp_do_urg(struct tcpcb *tp, struct tcphdr *th, int tlen);
+static void	 tcp_do_wu(struct tcpcb *tp, struct tcphdr *th,
+		     struct tcpopt *to, int tiwin, int tlen);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
@@ -881,24 +884,30 @@
 	INP_INFO_WLOCK_ASSERT(&tcbinfo);
 	INP_LOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN,
-	    ("%s: TCPS_LISTEN", __func__));
+	    ("%s: TCPS_LISTEN invalid", __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT,
-	    ("%s: TCPS_TIME_WAIT", __func__));
+	    ("%s: TCPS_TIME_WAIT invalid", __func__));
 
 	/*
-	 * Store the flags in a variable for easy manipulation.
+	 * Store the flags in a variable for easy manipulation
+	 * and because we won't have access to th->th_flags in
+	 * later stages.
 	 */
 	thflags = th->th_flags;
 
 	/*
 	 * Unscale the window into a 32-bit value.
+	 *  RFC1122: section 4.2.2.3
+	 *  RFC1323bis: section 2.3
 	 * 
-	 * NB: For the SYN_SENT state the scale is zero.
+	 * NB: In SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 
 	/*
 	 * Parse options on any incoming segment (if present).
+	 *  RFC793: section 3.1, page 17-19
+	 *  RFC1122: section 4.2.2.5
 	 */
 	if ((th->th_off << 2) != sizeof(struct tcphdr))
 		tcp_dooptions(&to, (u_char *)(th + 1),
@@ -908,29 +917,31 @@
 		to.to_flags = 0;
 
 	/*
-	 * Normalize timestamp if syncookies were used when this
+	 * Normalize our timestamp if syncookies were used when this
 	 * connection was established.
 	 */
 	if (to.to_flags & TOF_TS)
 		to.to_tsecr -= tp->ts_offset;
 
 	/*
-	 * Calculate amount of space in receive window.
-	 * Receive window is amount of space in rcv queue,
-	 * but not less than advertised window.
+	 * Calculate amount of space in our receive window.
+	 * Receive window is the amount of space in rcv queue,
+	 * but not less than last advertised window.
+	 *  RFC793: section 3.7, page 42-44
+	 *  RFC1122: section 4.2.2.16
 	 */
 	rwin = sbspace(&so->so_rcv);
-	rwin = imax(rwin, (int)(tp->rcv_adv - tp->rcv_nxt));
+	rwin = imax(rwin, (int)(tp->rcv_advwin - tp->rcv_nxt));
 
 	/*
-	 * Validation checks.  We may get any shit here.  Have to be careful.
+	 * Validation checks on any incoming segment.
+	 * We may get anything here.  Have to be careful.
 	 */
 	switch (tp->t_state) {
 	/*
-	 * If the state is SYN_RECEIVED:
-	 *	syncache handled all validation, socket, inpcb and tcpcb
-	 *	setup for us.  All that is left is the state transition
-	 *	into established state and initializations of the timers.
+	 * Syncache handled all validation, socket, inpcb and tcpcb
+	 * setup for us.  All that is left is the state transition
+	 * into established state and initializations of the timers.
 	 */
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
@@ -943,74 +954,104 @@
 		break;
 
 	/*
-	 * If the state is SYN_SENT:
-	 *	if seg contains a RST, then drop the connection.
-	 *	if seg does not contain SYN and ACK, then drop it.
-	 *	if seg contains an ACK, but not for our SYN, drop the input.
-	 * Otherwise this is an acceptable SYN segment
-	 *	initialize tp->rcv_nxt and tp->irs
-	 *	if seg contains ack then advance tp->snd_una
-	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
-	 *	arrange for segment to be acked (eventually)
-	 *	continue processing rest of data/controls, beginning with URG
+	 * Validate the returned SYN-ACK, process the negotiated
+	 * options, complete the initialization of the tcpcb and
+	 * transition into ESTABLISHED state.
 	 */
 	case TCPS_SYN_SENT:
 		/*
-		 * RST is handled below.
+		 * RST is handled separately below.
+		 *  RFC793: section 3.9, page 66-67, second check
 		 */
 		if (thflags & TH_RST)
 			break;
 
 		/*
-		 * SYN|ACK must be present.
+		 * FIN is not valid yet and the segment is to be ignored.
+		 *  RFC793: section 3.9, page 75, eighth check, first paragraph
+		 */
+		if (thflags & TH_FIN) {
+			tcplog("FIN invalid, segment ignored");
+			goto drop;
+		}
+
+		/*
+		 * SYN must be present.
+		 *  RFC793: section 3.9, page 67-68, fourth and fifth check
+		 *
+		 * NB: We have to remove SYN from thflags to
+		 * prevent the later bogus SYN check from
+		 * triggering.
 		 */
-		if (thflags & (TH_SYN|TH_ACK) != (TH_SYN|TH_ACK)) {
-			tcplog("Missing SYN|ACK, segment ignored");
+		if (!(thflags & TH_SYN)) {
+			tcplog("Missing SYN, segment ignored");
 			goto drop;
 		}
+		thflags &= ~TH_SYN;	/* SYN is processed. */
+
+		/*
+		 * ACK must be present.
+		 *  RFC793: section 3.9, page 67-68, fourth check
+		 *
+		 * XXXAO: Simultaneous open to be handled.
+		 *  RFC1122: section 4.2.2.10
+		 */
+		if (!(thflags & TH_ACK)) {
+			tcplog("Missing ACK, segment rejected");
+			goto dropwithreset;
+		}
 
 		/*
-		 * ACK must ack our ISN and any data we may
-		 * have sent with our SYN.
+		 * ACK must ack our ISN, SYN and possibly data we may
+		 * have sent with our SYN.  The latter is not strictly
+		 * necessary as many implementations choose to ignore
+		 * data in a SYN.  We want to make use of it again in
+		 * a future incarnation of T/TCP with a 'quick' 3WHS.
+		 *  RFC793: section 3.9, page 66, first check
 		 */
-		if (SEQ_LEQ(th->th_ack, tp->snd_iss) ||
-		    SEQ_GEQ(th->th_ack, tp->snd_nxt) ||
-		    SEQ_LT(th->th_ack, tp->snd_una)) {
-			tcplog("Incorrect ACK, segment rejected");
-			/* XXXAO: Close connection? Or ignore. */
+		if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+		    SEQ_GT(th->th_ack, tp->snd_nxt)) {
+			tcplog("Incorrect ACK, segment ignored");
 			goto dropwithreset;
 		}
 
 		/*
 		 * Option processing:
 		 *
-		 * If there wasn't a MSS option fall back to
-		 * default mss.
+		 * If there wasn't a MSS option fall back to default mss.
+		 * tcp_mss will calculate the largest possible MSS for us.
+		 * The MSS option is not a fully negotiated option and may
+		 * be returned even if we haven't sent it with our initial
+		 * SYN.  It is not common practice to do so however.
+		 *  RFC793: section 3.1, page 18-19
+		 *  RFC1122: section 4.2.2.6
+		 *  RFC1191: section 3.1
 		 */
-		if (!(tp->t_flags & TF_NOOPT) && (to.to_flags & TOF_MSS))
+		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
-		else if (tcp_do_path_mtu_discovery)
-			/* MTU of interface... */
 		else
 			tcp_mss(tp, tcp_mssdflt);
 
 		/*
 		 * Do window scaling on this connection?
+		 *  RFC1323bis: section 2.2 and 2.3
 		 *
-		 * NB: According to RFC1323 the window field
-		 * in a SYN (i.e., a <SYN> or <SYN,ACK>)
-		 * segment itself is never scaled.
+		 * NB: The window field in a SYN (i.e., a <SYN>
+		 * or <SYN,ACK>) segment itself is never scaled.
 		 */
 		if ((tp->t_flags & TF_WINSCALE) &&
 		    (to.to_flags & TOF_SCALE)) {
 			tp->snd_scale = to.to_wscale;
 		} else if (tp->t_flags & TF_WINSCALE) {
+			/* No window scaling. */
 			tp->t_flags &= ~TF_WINSCALE;
+			tp->snd_scale = 0;
 			tp->rcv_scale = 0;
 		} else if (to.to_flags & TOF_SCALE) {
 			/*
 			 * The remote end doesn't play right with us
 			 * and introduces options we haven't sent.
+			 *  RFC1323bis: section 1.3, second paragraph
 			 */
 			tcplog("Window Scaling Option unexpected, "
 			    "connection aborted");
@@ -1022,6 +1063,7 @@
 
 		/*
 		 * Do timestamps on this connection?
+		 *  RFC1323bis: section 3.2, first and last sentence
 		 */
 		if ((tp->t_flags & TF_TIMESTAMP) &&
 		    !(to.to_flags & TOF_TS))
@@ -1031,8 +1073,9 @@
 			/*
 			 * The remote end doesn't play right with us
 			 * and introduces options we haven't sent.
+			 *  RFC1323bis: section 1.3, second paragraph
 			 */
-			tcplog("Timestamp unexpected, "
+			tcplog("Timestamp Option unexpected, "
 			    "connection aborted");
 			tp->t_error = ENETRESET;
 			tp = tcp_close(tp);
@@ -1042,6 +1085,7 @@
 
 		/*
 		 * Do SACK on this connection?
+		 *  RFC2018: section 2
 		 */
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    !(to.to_flags & TOF_SACKPERM))
@@ -1051,6 +1095,7 @@
 			/*
 			 * The remote end doesn't play right with us
 			 * and introduces options we haven't sent.
+			 *  RFC2018: section 1, page 2, last paragraph
 			 */
 			tcplog("SACK Permitted unexpected, "
 			    "connection aborted");
@@ -1062,20 +1107,24 @@
 
 		/*
 		 * Initialize receive structure.
+		 * XXXAO: TODO
 		 */
 		tp->rcv_adv += rwin;	/* XXX */
 		tp->irs = th->th_seq;
 		tp->rcv_up = th->th_seq;
 		tcp_rcvseqinit(tp);
 
+		tcp_init_rcv(tp, seq);	/* XXXAO */
+		tcp_init_snd(tp, ack);	/* XXXAO */
+
 		/*
 		 * Process SYN and integrate sequence number.
+		 * XXXAO: TODO
 		 */
 		tp->snd_una++;
 		tp->snd_wu_seq = th->th_seq;
 		tp->snd_wu_ack = th->th_ack;
 		th->th_seq++;		/* SYN is acked */
-		thflags &= ~TH_SYN;	/* SYN is processed */
 
 		tp->t_starttime = ticks;
 		tp->t_state = TCPS_ESTABLISHED;
@@ -1089,8 +1138,12 @@
 
 		tcpstat.tcps_connects++;
 		break;
+
 	/*
 	 * All other states where a connection was established before.
+	 * The tests are ordered by their knock-out precedence.
+	 * The simplest checks that fail a segment come first.
+	 * The logical ordering of RFC793 is still maintained.
 	 */
 	case TCPS_ESTABLISHED:
 	case TCPS_CLOSE_WAIT:
@@ -1100,12 +1153,15 @@
 	case TCPS_FIN_WAIT_2:
 		/*
 		 * SYN and RST are handled separately below.
+		 * RST takes precedence in all flag combinations.
+		 *  RFC793: section 3.9, page 70-71
 		 */
 		if (thflags & (TH_SYN|TH_RST))
 			break;
 
 		/*
 		 * Segments without ACK are invalid.
+		 *  RFC793: section 3.9, page 72, fifth check, first sentence
 		 */
 		if (!(thflags & TH_ACK)) {
 			tcplog("ACK missing, segment ignored");
@@ -1113,78 +1169,145 @@
 		}
 
 		/*
-		 * Don't accept ack'ing of older than previously ack'd data.
-		 * XXXAO: Careful with out-of-order data. Must check seq too.
-		 * reordering and bidirectional data transfer.
-		 * XXXAO: Is this check really useful?
+		 * Don't accept missing TS when TS was negotiated and
+		 * vice versa.
+		 *  RFC1323bis: section 3.2, last paragraph, last sentence
 		 */
-		if (SEQ_LT(th->th_ack, tp->snd_una) &&
-		    SEQ_LT(th->th_seq, tp->rcv_nxt)) {
-			tcplog("Acking old data, segment ignored, "
-			    "sending challenge ACK");
-			goto dropafterack;
+		if ((tp->t_flags & TF_TIMESTAMP) && !(to.to_flags & TOF_TS)) {
+			tcplog("Timestamp missing, segment ignored");
+			goto drop;
+		}
+		if (!(tp->t_flags & TF_TIMESTAMP) && (to.to_flags & TOF_TS)) {
+			tcplog("Timestamp unexpected, segment ignored");
+			goto drop;
 		}
 
 		/*
-		 * Don't accept ack'ing of more than actually sent data.
+		 * Don't accept SACK when it wasn't negotiated at
+		 * connection setup time.
+		 *  RFC2018: section 1, page 2, last paragraph
 		 */
-		if (SEQ_GT(th->th_ack, tp->snd_max)) {
-			tcplog("Acking data not yet sent, segment ignored, "
-			    "sending challenge ACK");
-			tcpstat.tcps_rcvacktoomuch++;
-			goto dropafterack;
+		if ((to.to_flags & TOF_SACK) &&
+		    !(tp->t_flags & TF_SACK_PERM)) {
+			tcplog("SACK unexpected, segment ignored");
+			goto drop;
 		}
 
 		/*
-		 * Don't accept start of SEQ beyond receive window.
-		 * Allow for a window probe with one byte.
-		 * XXXAO: Window probe statistics.
+		 * PAWS: Protection against wrapped sequence numbers.
+		 *
+		 * Don't accept remote ts older than already seen,
+		 * or reflected ts newer than what we sent last.
+		 *
+		 * We store the receive time as uptime with second
+		 * resolution.  This makes us independent from the
+		 * wrap-around after 2^32 / hz (24.8 days at 1ms hz).
+		 *
+		 *  RFC1323bis: section 4.2.1 and 4.2.3
 		 */
-		if (SEQ_GT(th->th_seq, tp->rcv_nxt + tp->rcv_win)) {
-			tcplog("Data beyond window, segment ignored, "
-			    "sending challenge ACK");
-			goto dropafterack;
-		}
+		if (to.to_flags & TOF_TS) {
+			struct bintime bt;
 
-		/*
-		 * Don't accept too old retransmits.
-		 * XXXAO: Use largest window we've ever sent.
-		 * sb_hiwat is pretty much that.  We normally
-		 * don't shrink the receive socket buffer.
-		 */
-		if (SEQ_LT(th->th_seq,
-		    tp->rcv_nxt - so->so_rcv.sb_hiwat - tlen)) {
-			tcplog("Too old retransmit, segment ignored, "
-			    "sending challenge ACK");
-			goto dropafterack;
+			getbinuptime(&bt);
+			if (bt.sec - tp->t_rcvtime < ((tcp_ts)0x0 - 1) / hz) {
+				if (TSTMP_LT(to.to_tsval, tp->snd_tsecr) {
+					tcplog("Timestamp too old, "
+					    "sending challenge ack");
+					goto dropafterack;
+				}
+				if (TSTMP_GT(to.to_tsecr, tp->snd_tsval) {
+					tcplog("Timestamp too new, "
+					    "sending challenge ack");
+					goto dropafterack;
+				}
+			}
 		}
 
 		/*
-		 * Don't accept missing TS when TS was negotiated and
-		 * vice versa.
+		 * Validate the segment against the window.  We don't accept
+		 * segments with SEQ totally outside of receive window, unless
+		 * one of the special cases described below applies.  Any data
+		 * portion hanging outside the window to either side will be
+		 * chopped off later.
+		 *  RFC793: section 3.3, page 24-26
+		 *  RFC793: section 3.9, page 69, first check, the four cases
+		 *
+		 * Window probes are sent when we advertised a zero window
+		 * because the receive socket buffer is full.  When the
+		 * application reads enough data from it we send a window
+		 * update.  This is not reliable though and the remote host
+		 * periodically probes if we've got space again.  It does
+		 * this by sending one byte of data.  We let this one byte
+		 * through, truncate it later and process the remainder of
+		 * the segment including any options.  This way timers are
+		 * updated properly.  TH_FIN is let through even on a zero
+		 * receive window.
+		 *  RFC793: section 3.7, page 42, seventh and eighth paragraph
+		 *  RFC1122: section 4.2.2.17
+		 *
+		 * Urgent data in segments must be processed even if the window
+		 * is partially or completely closed.  Any normal data will
+		 * be truncated and not processed depending on the current
+		 * size of rwin.
+		 *  RFC793: section 3.3, page 26, third paragraph
+		 *
+		 * A keepalive is sent to solicit a response on an otherwise
+		 * idle connection.  Two methods exist: send a segment with
+		 * seq=rcv_nxt-1 without data, or with exactly one byte of
+		 * (bogus) data.  Normally this segment would be rejected
+		 * and a resynchronization ACK be sent (as intended). However
+		 * we want to further process the segment to update a couple
+		 * of timers and to further look at possible TCP options.
+		 *  RFC1122: section 4.2.3.6
 		 */
-		if ((tp->t_flags & TF_TIMESTAMP) && !(to.to_flags & TOF_TS)) {
-			tcplog("Timestamp missing, segment ignored");
-			goto drop;
+		if (SEQ_LT(th->th_seq + tlen, tp->rcv_nxt) ||
+		    SEQ_GT(th->th_seq, tp->rcv_nxt + (rwin ? (rwin - 1) : rwin))) {
+			/*
+			 * The connection is idle and this
+			 * is a keepalive.
+			 */
+			if (th->th_seq == tp->rcv_nxt - 1 &&
+			    th->th_ack == tp->snd_nxt &&
+			    tlen <= 1 && !(thflags & TH_URG)) {
+				if (tlen == 0) {
+					th->th_seq = tp->rcv_nxt;
+					thflags &= ~TH_FIN;
+				}
+				tp->t_flags |= TF_ACKNOW;
+				tcps.tcps_rcv_keepalive++;
+			} else if ((thflags & TH_URG) && th->th_urg && tlen &&
+			    th->th_seq == tp->rcv_nxt) {
+				/* Continue. XXXAO: tighter check. */
+			} else {
+				tcplog("Data outside window, segment ignored, "
+				    "sending challenge ACK");
+				goto dropafterack;
+			}
+			/*
+			 * NB: Continue with segment to update
+			 * last received timestamp, to capture
+			 * window updates, ACKs and congestion
+			 * control algorithms.  Any data above
+			 * the window is truncated later.
+			 */
 		}
-		if (!(tp->t_flags & TF_TIMESTAMP) && (to.to_flags & TOF_TS)) {
-			tcplog("Timestamp unexpected, segment ignored");
-			goto drop;
-		}
 
 		/*
-		 * Don't accept remote ts older than already seen,
-		 * reflected ts newer than what we send last.
-		 *
-		 * TODO-AO:
-		 * PAWS
+		 * Don't accept ack'ing of more than actually sent data.
+		 * Neither accept segments with a too old ACK.
+		 *  RFC793: section 3.9, page 72, fifth check, ESTABLISHED STATE,
+		 *   first paragraph, last sentence
+		 *  tcpsecure: section 5.2, mitigation of blind data injection
+		 *  tcpsecure: section 5.2 changes this to just drop
+		 * XXXAO: why?
 		 */
-		if ((to.to_flags & TOF_TS) &&
-		    ticks - tp->t_rcvtime < PAWS &&
-		    (!TSTMP_LT(to.to_tsval, tp->snd_tsecr)) ||
-		     TSTMP_GT(to.to_tsecr, tp->snd_tsval))) {
-			tcplog("Timestamp too old or new, segment ignored");
-			goto drop;
+		if (SEQ_GEQ(th->th_ack, tp->snd_nxt) ||
+		    SEQ_LT(th->th_ack, tp->snd_una - tp->snd_maxwnd)) {
+			tcplog("Acking data not yet sent or too old, "
+			    "segment ignored, sending challenge ACK");
+			tcpstat.tcps_rcvacktoomuch++;
+			goto dropafterack;
+			/* goto drop; */
 		}
 
 		/*
@@ -1195,20 +1318,11 @@
 		 */
 		if (SEQ_GT(th->th_seq, tp->snd_lastack) &&
 		    SEQ_LT(th->th_seq, tp->rcv_nxt) {
-			tcplog("Received retransmit before we sent delayed ACK, no action");
+			tcplog("Received retransmit before we sent delayed ACK,"
+			    " no action");
 		}
 
-		/*
-		 * Don't accept SACK when is wasn't negotiated at
-		 * connection setup time.
-		 */
-		if ((to.to_flags & TOF_SACK) &&
-		    !(tp->t_flags & TF_SACK_PERM)) {
-			tcplog("SACK unexpected, segment ignored");
-			goto drop;
-		}
-
-		/* XXX: stats */
+		/* XXXAO: stats? */
 		break;
 
 	/*
@@ -1226,6 +1340,8 @@
 	 * expensive MD5 hash computation.
 	 * In SYN_RECEIVED case syncache verified the signature
 	 * already.
+	 *  RFC2385: section 2.0, 3.0
+	 * XXXAO: Make work.
 	 */
 	if ((tp->t_flags & TF_SIGNATURE) && notalreadydone) {
 		/* Copy signature and compare. */
@@ -1241,64 +1357,83 @@
 	/*
 	 * Fast path for ACK-only segments.
 	 */
-	if (tlen == 0 && (thflags & (TH_ACK|TH_RST|TH_SYN)) == TH_ACK)
+	if (tlen == 0 && (thflags & (TH_ACK|TH_RST|TH_SYN|TH_URG)) == TH_ACK)
 		goto doack;
 
 	/*
 	 * Handle SYN and RST flags for existing connections.
 	 *
-	 * NB: The SYN_SENT case has removed the SYN bit from thflags
-	 * if the segment was accepted.
+	 * NB: The SYN_SENT case has removed SYN from thflags.
 	 */
 	if (thflags & TH_RST) {
 		/*
-		 * Any RST after TCPS_SYN_SENT must NOT carry the ACK flag.
-		 * RFC 793 page 65, section SEGMENT ARRIVES.
+		 * Filter out what we determine NOT to be legitimate RST's.
 		 */
-		if (tp->t_state > TCPS_SYN_SENT &&
-		    (thflags & TH_ACK)) {
-			tcplog("RST with ACK invalid, segment ignored");
-			tcpstat.tcps_badrst++;
-			goto drop;
-		}
-		/*
-		 * Check if the sequence number is NOT acceptable to us.
-		 */
-		if (tp->t_state == TCPS_SYN_SENT) {
+		switch (tp->t_state) {
+		case TCPS_SYN_SENT:
 			/*
 			 * In TCPS_SYN_SENT the RST MUST carry the ACK flag.
+			 *  RFC793: section 3.9, page 66, first check
 			 */
 			if (!(thflags & TH_ACK)) {
 				tcplog("RST without ACK invalid, "
 				    "segment ignored");
 				goto drop;
 			}
-			if (th->th_ack != tp->snd_iss + 1) {
-				/*
-				 * XXX: Account for end of window
-				 * if we had data sent with SYN.
-				 */
+			/*
+			 * The ACK must be within what we sent but does
+			 * not have to ACK the SYN.
+			 *  RFC793: section 3.9, page 66, first check
+			 */
+			if (SEQ_LT(th->th_ack, tp->snd_una) ||
+			    SEQ_GT(th->th_ack, th->snd_nxt)) {
 				tcplog("RST does not match, segment ignored");
 				tcpstat.tcps_badrst++;
 				goto drop;
 			}
-		} else if (tcp_insecure_rst == 0 &&
-		    (SEQ_DELTA(th->th_seq, tp_rcv_nxt) > 1 ||
-		     SEQ_DELTA(th->th_seq, tp_snd_last_ack) > 1)) {
-			tcplog("RST does not match (secure), segment ignored");
-			tcpstat.tcps_badrst++;
-			goto drop;
-		} else if (tcp_insecure_rst == 1 &&
-		    (SEQ_LT(th->th_seq, tp->snd_last_ack - 1) ||
-		     SEQ_GT(th->th_seq, tp->snd_last_ack + rwin))) {
-			tcplog("RST does not match (insecure), segment ignored");
-			tcpstat.tcps_badrst++;
-			goto drop;
+			break;
+
+		case TCPS_ESTABLISHED:
+		case TCPS_FIN_WAIT_1:
+		case TCPS_FIN_WAIT_2:
+		case TCPS_CLOSE_WAIT:
+		case TCPS_CLOSING:
+		case TCPS_LAST_ACK:
+			/*
+			 * Any RST after TCPS_SYN_SENT must NOT carry the ACK flag.
+			 *  RFC 793: page 65
+			 */
+			if (thflags & TH_ACK) {
+				tcplog("RST with ACK invalid, segment ignored");
+				tcpstat.tcps_badrst++;
+				goto drop;
+			}
+			/*
+			 * Check if the sequence number is NOT acceptable to us.
+			 *  RFC793: page 70, second check
+			 *  RFC4953: section 3.1.2 (discussion of various methods)
+			 * XXXAO: Three points: rcv_nxt, snd_last_ack, rcv_nxt+rwin?
+			 * XXXAO: Description of +-1 variance.
+			 */
+			if (tcp_secure_rst &&
+			    (SEQ_DELTA(th->th_seq, tp->rcv_nxt) > 1 ||
+			     SEQ_DELTA(th->th_seq, tp->snd_last_ack) > 1)) {
+				tcplog("RST does not match (secure), segment ignored");
+				tcpstat.tcps_badrst++;
+				goto drop;
+			} else if (!tcp_secure_rst &&
+			    (SEQ_LT(th->th_seq, tp->snd_last_ack - 1) ||
+			     SEQ_GT(th->th_seq, tp->snd_last_ack + rwin))) {
+				tcplog("RST does not match (insecure), segment ignored");
+				tcpstat.tcps_badrst++;
+				goto drop;
+			}
+			break;
 		}
 		tcplog("RST received, closing connection");
 
 		/*
-		 * Treat the different states appropriately.
+		 * Unwind the connection according to its state.
 		 */
 		switch (tp->t_state) {
 		case TCPS_SYN_SENT:
@@ -1306,54 +1441,82 @@
 		case TCPS_FIN_WAIT_1:
 		case TCPS_FIN_WAIT_2:
 		case TCPS_CLOSE_WAIT:
+			/*
+			 * The socket is still around and
+			 * we have to inform the application.
+			 *  RFC1122: section 4.2.2.12 [RST cause]
+			 *  RFC1122: section 4.2.2.13
+			 */
 			so->so_error = ECONNRESET;
+			/* XXXAO: Macro encapsulating state transition changes? */
 			tp->t_state = TCPS_CLOSED;
 			tcpstat.tcps_drops++;
 			tp = tcp_close(tp);
 			break;
+
 		case TCPS_CLOSING:
 		case TCPS_LAST_ACK:
+			/*
+			 * The socket is already gone,
+			 * just clean up and be done.
+			 */
 			tp = tcp_close(tp);
 			break;
 		}
 		goto drop;
 	} else if (thflags & TH_SYN) {
 		/*
+		 * We may get a SYN in these cases:
+		 *  remote host went down and comes back up
+		 *  retransmitted SYN-ACK when our ACK was lost
+		 *  malicious SYN
+		 *  simultaneous open in SYN_SENT case is handled there
+		 *
 		 * Instead of dropping the connection right away
 		 * we send a challenge ACK back.  If the connection
 		 * is dead or retried we get back a proper RST which
 		 * will be validated and close the connection then.
 		 * tcp-secure modified recommends behavior to protect
 		 * against "blind in the window" attacks.
+		 *  tcpsecure: section 4.2
 		 */
-		tcplog("SYN received, segment ignored, sending challenge ACK");
+		tcplog("SYN received, segment ignored, "
+		    "sending challenge ACK");
 		goto dropafterack;
 	}
 
 	/*
-	 * From here we start looking at the segment content and
-	 * the meat is real.
+	 * From here we start looking at the segment content.
+	 * Assert any assumption the code from this point on makes.
 	 */
 	KASSERT((thflags & (TH_SYN|TH_RST|TH_ACK)) == TH_ACK,
-	    ("%s: no ACK flag", __func__));
+	    ("%s: no ACK flag or TH_SYN, TH_RST present", __func__));
+	KASSERT(SEQ_GEQ(th->th_seq + tlen, tp->rcv_nxt),
+	    ("%s: th_seq+tlen < rcv_nxt", __func__));
+	KASSERT(SEQ_LT(th->th_seq, tp->rcv_nxt + rwin),
+	    ("%s: th_seq >= rcv_nxt+rwin", __func__));
 
 	/*
 	 * Trim segment on the left, i.e. it starts before rcv_nxt.
+	 *
+	 * XXXAO: we may have a zero window but have to process URG data.
+	 * Trim off all normal payload and leave only the urg data intact.
 	 */
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
-		if (todrop > tlen)
-			todrop = tlen;
+		KASSERT(todrop <= tlen,
+		    ("%s: left todrop > tlen", __func__));
 		if (todrop == tlen) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
+			 * XXXAO: Fixup
 			 */
 			thflags &= ~TH_FIN;
 			/*
 			 * Send an ACK to resynchronize and drop any data.
-			 * But keep on processing for ACK.
+			 * But keep on processing for ACK and options.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			tcpstat.tcps_rcvduppack++;
@@ -1362,7 +1525,12 @@
 			tcpstat.tcps_rcvpartduppack++;
 			tcpstat.tcps_rcvpartdupbyte += todrop;
 		}
-		drop_hdrlen += todrop;	/* Drop from the front afterwards. */
+		/*
+		 * Drop from front later together with
+		 * delayed header drop and adjust segment
+		 * length variables.
+		 */
+		drop_hdrlen += todrop;
 		th->th_seq += todrop;
 		tlen -= todrop;
 		/*
@@ -1376,16 +1544,20 @@
 			th->th_urp = 0;
 		}
 	}
+
 	/*
 	 * Trim segment to the right, ie. it ends after window,
 	 * drop trailing data (and PUSH and FIN);
 	 * if nothing left, just ACK.
+	 * XXXAO: for window probe we may have to trim off the one byte.
 	 */
-	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + rwin);
+	if (!TCPS_HAVERCVDFIN(tp->t_state))
+		todrop = th->th_seq + tlen, tp->rcv_nxt + rwin;
+	else
+		todrop = tlen;
 	if (todrop > 0) {
-		KASSERT(todrop <= tlen, ("%s: todrop > tlen", __func__));
-		if (todrop >= tlen) {
-			tcpstat.tcps_rcvbyteafterwin += tlen;
+		KASSERT(todrop <= tlen, ("%s: right todrop > tlen", __func__));
+		if (todrop == tlen) {
 			/*
 			 * If window is closed we can only take segments at
 			 * the window edge, and have to drop data and PUSH
@@ -1396,20 +1568,33 @@
 			if (rwin == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				tcpstat.tcps_rcvwinprobe++;
-			} else {
-				tcolog("");
-				goto dropafterack;
 			}
-			todrop = tlen;
-		} else
-			tcpstat.tcps_rcvbyteafterwin += todrop;
-		m_adj(m, -todrop);
+		}
+		m_adj(m, -todrop);		/* Drop from tail. */
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH|TH_FIN);
 		tcpstat.tcps_rcvpackafterwin++;
-		/* XXX: Urgent pointer? */
+		tcpstat.tcps_rcvbyteafterwin += todrop;
+		/* XXXAO: Urgent pointer? */
+		if ((thflags & TH_URG) && th->th_urp > todrop)
+			th->th_urp -= todrop;
+		else if (thflags & TH_URG) {
+			thflags &= ~TH_URG;
+			th->th_urp = 0;
+		}
+	}
+
+	/*
+	 * Urgent pointer is invalid when we don't have any data.
+	 * XXXAO: Not really.  Urgent data is a hack.
+	 */
+	if ((thflags & TH_URG) && tlen == 0) {
+		thflags &= ~TH_URG;
+		th->th_urp = 0;
 	}
-	KASSERT(tlen >= 0, ("%s: tlen < 0", __func__));
+
+	KASSERT(tlen >= 0,
+	    ("%s: tlen < 0", __func__));
 
 	/*
 	 * If new data is received on a connection after the
@@ -1429,19 +1614,18 @@
 
 doack:
 	/*
-	 * Ack processing.
+	 * ACK, timing, option and congestion control processing.
 	 */
-	KASSERT(SEQ_GEQ(th->th_ack, tp->snd_una),
-	    ("%s: th_ack < snd_una", __func__));
-
 	acked = th->th_ack - tp->snd_una;
+	if (acked < 0)
+		acked = 0;
 	tcpstat.tcps_rcvackpack++;
 	tcpstat.tcps_rcvackbyte += acked;
 
 	/*
-	 * Update window information.
+	 * Update send window information.
 	 */
-	nudgeoutput = tcp_do_wu(tp, th, tiwin, tlen);
+	nudgeoutput = tcp_do_wu(tp, th, &to, tiwin, tlen);
 
 	/*
 	 * Update and recompute connection timing information.
@@ -1450,12 +1634,10 @@
 	tcp_do_time(tp, th, &to, acked, tlen);
 
 	/*
-	 * Update SACK information.
+	 * Update send SACK information.
 	 */
-	if (((tp->t_flags & TF_SACK) && (to.to_flags & TOF_SACK)) ||
-	    !TAILQ_EMPTY(&tp->snd_holes)) {
+	if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
 		tcp_sack_doack(tp, &to, th->th_ack);
-	}
 
 	/*
 	 * Update congestion control information.
@@ -1464,10 +1646,16 @@
 
 	/*
 	 * Drop acknowledged data from send socket buffer.
+	 *  RFC793: section 3.9, page 72, fifth check
 	 */
 	if (acked > 0)
+		int cantrcvmore;
+
 		SOCKBUF_LOCK(&so->so_snd);
-		KASSERT(acked + 1 <= so->so_snd.sb_cc,
+
+		KASSERT(SEQ_GT(th->th_ack, tp->snd_nxt),
+		    ("%s: ", __func__));
+		KASSERT(acked <= so->so_snd.sb_cc + 1,
 		    ("%s: more acked than in send buffer", __func__));
 
 		/*
@@ -1485,15 +1673,29 @@
 			tp->snd_wnd -= acked;
 			ourfinisacked = 0;
 		}
+
 		/*
 		 * Advance the unacknowledged pointer.
+		 *  RFC793: section 3.9, page 72, fifth check
 		 */
 		tp->snd_una = th->th_ack;
+
+		/*
+		 * Obtain sb_state before unlock for later use.
+		 */
+		cantrcvmore = so->so_rcv.sb_state & SBS_CANTRCVMORE;
+
 		/*
+		 * Wake up and inform any writers on the socket.
+		 *
 		 * NB: sowwakeup_locked() does an implicit unlock.
 		 */
 		sowwakeup_locked(so);
 
+		/*
+		 * When our FIN was ack'ed perform the appropriate
+		 * state transitions and release unnecessary resources.
+		 */
 		if (ourfinisacked) {
 			KASSERT((tp->t_flags & TF_SENTFIN) &&
 			    tp->t_state > TCPS_CLOSE_WAIT &&
@@ -1501,21 +1703,22 @@
 			    ("%s: got ack for FIN but haven't sent FIN yet",
 			    __func__));
 
-			switch (tp->t_state) {
 			/*
-			 * In FIN_WAIT_1 state enter the FIN-WAIT-2 state.
-			 * Any transition to CLOSING happens later.
-			 * XXX: comment.
+			 * Handle ack'ed FIN according to previous state.
 			 */
+			switch (tp->t_state) {
 			case TCPS_FIN_WAIT_1:
 				/*
-				 * If we can't receive any more
-				 * data, then closing user can proceed.
-				 * Starting the timer is contrary to the
+				 * If we can't receive any more data,
+				 * then closing user can proceed.
+				 * XXXAO: better description and reference
+				 * to discussion.
+				 *
+				 * NB: Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN
 				 * we'll hang forever.
 				 */
-				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+				if (cantrcvmore) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
@@ -1526,12 +1729,12 @@
 				tp->t_state = TCPS_FIN_WAIT_2;
 				break;
 
-			/*
-			 * In CLOSING state enter the TIME-WAIT state.
-			 * tcp_twstart() discards this tcpcb and creates
-			 * a compressed state.
-			 */
 			case TCPS_CLOSING:
+				/*
+				 * Create a compressed TIME-WAIT state
+				 * with minimal information and discard
+				 * this tcpcb to save memory.
+				 */
 				tcp_twstart(tp);
 				tp = NULL;
 				INP_INFO_WUNLOCK(&tcbinfo);
@@ -1540,23 +1743,21 @@
 				goto done;
 				break;
 
-			/*
-			 * In LAST_ACK, we may still be waiting for data
-			 * to drain and/or to be acked, as well as for

>>> TRUNCATED FOR MAIL (1000 lines) <<<


More information about the p4-projects mailing list