PERFORCE change 166586 for review

Andre Oppermann andre at FreeBSD.org
Sun Jul 26 14:28:15 UTC 2009


http://perforce.freebsd.org/chv.cgi?CH=166586

Change 166586 by andre at andre_t61 on 2009/07/26 14:27:39

	First round of comment and annotation improvements.
	Move tcp_reass_enable check further up.
	Enhance D-SACK full duplicate test.
	On new block insert start reassembly timer.
	Some style improvements.
	Update TODO.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#53 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#53 (text+ko) ====

@@ -39,31 +39,65 @@
  * It is the purpose of tcp reassembly to store segments that are received
  * out of order.  This happens when packets are lost along the way due to
  * various reasons.  The most common one is traffic overload which causes
- * routers to stop accepting packets for brief moments.
+ * routers to drop packets for brief moments.
+ *
+ * Upon arrival of the missing segment the whole chain of stored contiguous
+ * segments is moved into the socket buffer at once.  In case of multiple
+ * missing segments the remainder is kept in the queue until the next missing
+ * segment arrives.
+ *
+ * When the TCP receiver is in reassembly state *all* arriving segments are
+ * passed through the reassembly queue until all missing segments have been
+ * received and all data is dequeued.
+ *
+ * Implementation details and choices:
+ *
+ * Instead of storing all segments on their own and tracking each with
+ * a queue structure we build blocks of contiguous segments merged
+ * together.  This way one missing segment needs only one queue structure
+ * tracking the whole block of following contiguous segments.  If a new
+ * segment matches the end of one block and the start of the next block
+ * the two are joined together.  If no match is found a new block is created.
+ *
+ * Instead of a classical list or tailqueue a red-black tree is used to
+ * cope with arbitrary complexity of the blocks and missing segments.  A
+ * queue has O(n) worst case behavior whereas the red-black tree is
+ * O(log n).  This prevents complexity attacks where a long chain of
+ * blocks would have to be traversed to find the right place for the new
+ * segment.
  *
- * Upon arrival of the missing segment(s) the whole chain of stored segments
- * is moved into the socket buffer.  In case of multiple missing segments
- * the first consequtive part is moved with the remainder being kept in
- * store until the next missing segment arrives.
+ * For the segment merging into a block queue structure the operator can
+ * choose between time and space efficiency.  For time efficiency only the
+ * head or tail mbuf chain pointers are updated with the new segment's mbuf.
+ * This results in a minimal amount of data accesses and no memory allocations
+ * unless a new block is created.  For space efficiency mbufs and mbuf chains
+ * are compacted and possibly merged together.  It may even allocate a new
+ * and larger mbuf(-chain) to store the segment data with less overhead and
+ * less wasted space.  This makes it immune against mbuf exhaustion attacks
+ * where only tiny amounts of data are received per segment, possibly only
+ * one byte.  Usually network drivers allocate only mbuf clusters of 2KBytes
+ * on receive no matter how large the packet actually is for efficiency
+ * reasons and because they can't easily know at DMA time how large the
+ * packet actually is.
  *
- * While in reassembly mode *all* arrving segments are put into the reassembly
- * queue.
+ * Limits, timeout. XXX
  *
- * Instead of storing all segments on their own we build blocks of consequtive
- * segments chained together.  We use a red-black tree to cope with arbitrary
- * complexity.  If a segment matches the end of one block and the start of the
- * next block the two blocks are joined together.  If no match is found a
- * new block is created.
+ * The reassembly queue block structure is also used to track SACK
+ * information as the data receiver.  A double-linked list is added
+ * that tracks the blocks in LIFO order of their arrival or updating.
  *
- * The reassembly queues block structure is also used to track SACK
- * information as a data receiver.  A double-linked list is added
- * that links the blocks the reverse order of their arrival or updating.
- * This makes us fully compliant to RFC2018 Section 4 including all
- * optional parts marked as "SHOULD".
+ * Implemented / relevant RFC's:
+ *  RFC793: Transmission Control Protocol
+ *  RFC1123: 
+ *  RFC2018: This makes us fully compliant to RFC2018 Section 4 including all optional parts marked as "SHOULD".
+ *  RFC2883:
  *
  * TODO:
  * - Improve comments and annotate RFC references.
  * - Style improvements.
+ * - Activate timeout on first insert.
+ * - Partial D-SACK support.
+ * - Return flags should be same minus FIN.
  * - Lots of testing.
  */
 
@@ -132,7 +166,9 @@
 static struct tcp_reass_block *
     tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *);
 
-/* Trim empty mbufs from head of chain. */
+/*
+ * Trim empty mbufs from the head of an mbuf chain.
+ */
 static struct mbuf *
 m_trimhead(struct mbuf *m)
 {
@@ -165,6 +201,7 @@
 static __inline int
 tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b)
 {
+
 	if (SEQ_LT(a->trb_seqe, b->trb_seqs))
 		return (-1);
 	else if (SEQ_GT(a->trb_seqs, b->trb_seqe))
@@ -176,6 +213,9 @@
 RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
 RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
 
+/*
+ * Verify the integrity of the reassembly queue.
+ */
 #ifdef INVARIANTS
 static int
 tcp_reass_verify(struct tcpcb *tp, int prepost)
@@ -214,8 +254,12 @@
 
 	return (1);
 }
-#endif
+#endif /* INVARIANTS */
 
+/*
+ * Remove a single block from the reassembly queue
+ * and free all its mbufs, if any.
+ */
 static void
 tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb)
 {
@@ -229,6 +273,9 @@
 	uma_zfree(tcp_reass_zone, trb);
 }
 
+/*
+ * Remove and free all blocks from the reassembly queue.
+ */
 void
 tcp_reass_flush(struct tcpcb *tp)
 {
@@ -246,6 +293,10 @@
 	    ("%s: rcv_reass_size not zero after flushing", __func__));
 }
 
+/*
+ * Move block to front of SACK list to report SACK blocks in LIFO order.
+ *  RFC2018: section x
+ */
 static __inline void
 tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb)
 {
@@ -257,7 +308,8 @@
 }
 
 /*
- * Insert segments into the reassembly queue.
+ * Insert segment into the reassembly queue and 
+ * XXX append to socket buffer.
  *
  * NB: We must always consume the mbuf.  Either by appeding it to
  * the queue or by freeing it.
@@ -274,6 +326,12 @@
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
+	 * Check if it is really necessary to do all the work.
+	 */
+	if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
+		goto done;
+
+	/*
 	 * Call with th==NULL after becoming established to
 	 * force pre-ESTABLISHED data up to user socket.
 	 * XXX: Was used for T/TCP of which code remains.
@@ -303,10 +361,6 @@
 	th_seq = th->th_seq;
 	th = NULL;		/* Prevent further use. */
 
-	/* Check if it is really neccessary to do all the work. */
-	if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
-		goto done;
-
 	KASSERT(SEQ_GEQ(th_seq, tp->rcv_nxt),
 	    ("%s: sequence number below rcv_nxt", __func__));
 	KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)),
@@ -374,9 +428,9 @@
 	}
 
 	/*
-	 * Get rid of packet header and mtags.
+	 * Remove and free packet header and mtags.
 	 * Trim empty mbufs from head of chain.
-	 * Compact mbuf chain.
+	 * Compact the mbuf chain.
 	 */
 	m_demote(m, 1);
 	m_adj(m, hlen);
@@ -393,50 +447,68 @@
 	trbs.trb_mt = m_last(m);
 
 	/*
-	 * Return match that has at least partial overlap to either side or
-	 * insert a new reassembly block.
+	 * Find a block that has at least partial overlap to either side.
+	 * If no block is found either insert a new one or use the stack
+	 * if the segment directly fits rcv_nxt.
+	 *  RFC793: xxx
+	 *  RFC2018: section 3
 	 */
 	if ((trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs)) != NULL) {
-
-		/* Within an already received block. */
+		/*
+		 * The new segment is a retransmit and within an already
+		 * received block and thus a full duplicate.
+		 *
+		 * Report it with D-SACK only if it is a subset of and
+		 * not equal to the already existing block.
+		 *  RFC2883: section 4, part (1) and (4)
+		 *  RFC2883: section 4.1, Reporting Full Duplicate Segments
+		 */
 		if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) &&
 		    SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) {
 			tcp_reass_sacktrack(tp, trb);
-			tp->rcv_reass_dsack.start = trbs.trb_seqs;
-			tp->rcv_reass_dsack.end = trbs.trb_seqe;
+			if (SEQ_GT(trbs.trb_seqs, trb->trb_seqs) ||
+			    SEQ_LT(trbs.trb_seqe, trb->trb_seqe)) {
+				tp->rcv_reass_dsack.start = trbs.trb_seqs;
+				tp->rcv_reass_dsack.end = trbs.trb_seqe;
+			}
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, len);
 			goto done;
 		}
 
+		/*
+		 * Merge the new segment with the already existing block.
+		 */
 		tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
-
-		/* Merge in the new segment. */
 		(void)tcp_reass_merge(trb, &trbs);
 		tcp_reass_sacktrack(tp, trb);
 
+		/*
+		 * Update XXX
+	 *  RFC2883: section 4.2, Reporting Partial Duplicate Segments
+		 * XXXAO: Add D-SACK block.
+		 */
 		if ((len = SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe)) > 0) {
 			tp->rcv_reass_size -= len;
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, len);
 		}
 
-		/* Merge in previous block(s) if there is overlap. */
+		/*
+		 * Merge in previous/next block(s) if there is overlap.
+		 */
 		while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
 		    SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) {
 			trbn = tcp_reass_merge(trb, trbn);
 			tcp_reass_free(tp, trbn);
 			TCPSTAT_INC(tcps_reass_merge);
 		}
-
-		/* Merge in next block(s) if there is overlap. */
 		while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
 		    SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
 			trbn = tcp_reass_merge(trb, trbn);
 			tcp_reass_free(tp, trbn);
 			TCPSTAT_INC(tcps_reass_merge);
 		}
-
 	} else if (tp->rcv_nxt == th_seq) {
 		trb = &trbs;
 	} else if ((trb = (struct tcp_reass_block *)uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO))) != NULL) {
@@ -449,6 +521,8 @@
 		LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
 		tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
 		tp->rcv_reass_blocks++;
+		if (RB_EMPTY(&tp->rcv_reass))
+			tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
 		TCPSTAT_INC(tcps_reass_blocks);
 	} else {
 		/* Memory allocation failure. */
@@ -458,8 +532,12 @@
 	KASSERT(tcp_reass_verify(tp, 1),
 	    ("%s: reassembly queue went inconsistent", __func__));
 
+	/*
+	 * Deliver data if we've got the missing segment.
+	 */
 	if (trb->trb_seqs == tp->rcv_nxt)
 		goto present;
+
 	return (0);
 
 present:
@@ -475,7 +553,10 @@
 	TCPSTAT_INC(tcps_reass_missingseg);
 
 	SOCKBUF_LOCK(&so->so_rcv);
-	/* We can only ever dequeue one block. */
+	/*
+	 * We can only ever dequeue one consecutive
+	 * block of data at most.
+	 */
 	if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		sbappendstream_locked(&so->so_rcv, trb->trb_m);
 		tp->rcv_nxt += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
@@ -488,13 +569,19 @@
 	else
 		tcp_reass_free(tp, trb);
 
-	/* NB: sorwakeup_locked() does a implicit socket buffer unlock. */
+	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
 	sorwakeup_locked(so);
 
 	/*
+	 * Don't hold on to data in the reassembly queue for too long.
+	 * Kernel memory is limited and if the connection doesn't make
+	 * any progress in filling the holes we don't want to wait
+	 * forever to prevent memory exhaustion attacks.  The sender
+	 * can always retransmit again.
+	 *  RFC2018: section 8, Data Receiver Reneging
+	 *
 	 * Restart the reassembly queue flush timer after advancing
-	 * the sequence space and if queue is not empty.  Otherwise
-	 * deactivate it.
+	 * the sequence space and if the queue still isn't empty.
 	 */
 	if (tcp_reass_timeout && !RB_EMPTY(&tp->rcv_reass))
 		tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
@@ -510,8 +597,8 @@
 }
 
 /*
- * Trim a block.
- * Positive value is from head, negative from tail.
+ * Trim a SACK block.
+ * A positive value is from head, negative from tail.
  */
 static void
 tcp_reass_trim(struct tcp_reass_block *trb, int i)
@@ -519,6 +606,10 @@
 
 	m_adj(trb->trb_m, i);
 
+	/*
+	 * Update the tail pointer or free empty mbufs
+	 * at the head of the chain.
+	 */
 	if (i < 0) {
 		trb->trb_mt = m_last(trb->trb_m);
 		trb->trb_seqe -= i;
@@ -529,7 +620,10 @@
 }
 
 /*
- * Merge one or more consecutive blocks together.
+ * Merge one or more consecutive blocks together.  The number of
+ * redundant bytes is reported as the difference between trbn->
+ * trb_seqs and trb_seqe.
+ *
  * NB: trbn is always merged into trb!
  */
 static struct tcp_reass_block *
@@ -543,7 +637,9 @@
 	    SEQ_GT(trbn->trb_seqe, trb->trb_seqe),
 	    ("%s: blocks not overlapping", __func__));
 
-	/* Replace, prepend and append. */
+	/*
+	 * Replace, prepend or append a block.
+	 */
 	if (SEQ_LEQ(trbn->trb_seqs, trb->trb_seqs) &&
 	    SEQ_GEQ(trbn->trb_seqe, trb->trb_seqe)) {
 
@@ -596,10 +692,12 @@
 }
 
 /*
- * Put the sequence number of the reassembly queue blocks into
- * the SACK options of an outgoing segment.
- *  RFC2018: section ...
- *  RFC2883: section ...
+ * Put the sequence number of the reassembly queue blocks into the
+ * SACK options of an outgoing segment.  If a D-SACK block is available
+ * insert it in the first position followed by the regular SACK blocks.
+ *  RFC2018: section 3, Sack Option Format
+ *  RFC2018: section 4, Generating Sack Options: Data Receiver Behavior
+ *  RFC2883: section 4, Use of the SACK option for reporting a duplicate segment
  */
 int
 tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
@@ -614,7 +712,10 @@
 	KASSERT(!LIST_EMPTY(&tp->rcv_reass_sack),
 	    ("%s: sack list empty", __func__));
 
-	/* Create fake SACK block for D-SACK and prepend it. */
+	/*
+	 * Create fake SACK block on the stack for D-SACK and prepend it.
+	 *  RFC2883: section 4, part (3)
+	 */
 	if (tp->rcv_reass_dsack.start != tp->rcv_reass_dsack.end) {
 		bzero(&trbs, sizeof(trbs));
 		trbs.trb_seqs = htonl(tp->rcv_reass_dsack.start);
@@ -625,7 +726,7 @@
 	/*
 	 * The most recent block must appear first.  Add the other
 	 * blocks in most recent created or updated order.
-	 *  RFC2018: section 4
+	 *  RFC2018: section 3 and 4, part (4) and (5)
 	 */
 	LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
 		if (numsacks < 1)
@@ -640,7 +741,11 @@
 		nsacks++;
 	}
 
-	/* Remove fake D-SACK block again. */
+	/*
+	 * Remove fake D-SACK block again and zero out the D-SACK
+	 * information.  It must be reported only once.
+	 *  RFC2883: section 4, part (2)
+	 */
 	if (LIST_FIRST(&tp->rcv_reass_sack) == &trbs) {
 		LIST_REMOVE(&trbs, trb_sack);
 		tp->rcv_reass_dsack.start = 0;


More information about the p4-projects mailing list