PERFORCE change 166586 for review
Andre Oppermann
andre at FreeBSD.org
Sun Jul 26 14:28:15 UTC 2009
http://perforce.freebsd.org/chv.cgi?CH=166586
Change 166586 by andre at andre_t61 on 2009/07/26 14:27:39
First round of comment and annotation improvements.
Move tcp_reass_enable check further up.
Enhance D-SACK full duplicate test.
On new block insert start reassembly timer.
Some style improvements.
Update TODO.
Affected files ...
.. //depot/projects/tcp_reass/netinet/tcp_reass.c#53 edit
Differences ...
==== //depot/projects/tcp_reass/netinet/tcp_reass.c#53 (text+ko) ====
@@ -39,31 +39,65 @@
* It is the purpose of tcp reassembly to store segments that are received
* out of order. This happens when packets are lost along the way due to
* various reasons. The most common one is traffic overload which causes
- * routers to stop accepting packets for brief moments.
+ * routers to drop packets for brief moments.
+ *
+ * Upon arrival of the missing segment the whole chain of stored contiguous
+ * segments is moved into the socket buffer at once. In case of multiple
+ * missing segments the remainder is kept in the queue until the next missing
+ * segment arrives.
+ *
+ * When the TCP receiver is in reassembly state *all* arriving segments are
+ * passed through the reassembly queue until all missing segments have been
+ * received and all data is dequeued.
+ *
+ * Implementation details and choices:
+ *
+ * Instead of storing all segments on their own and tracking each
+ * with a queue structure we build blocks of contiguous segments merged
+ * together. This way one missing segment needs only one queue structure
+ * tracking the whole block of following contiguous segments. If a new
+ * segment matches the end of one block and the start of the next block
+ * the two are joined together. If no match is found a new block is created.
+ *
+ * Instead of a classical list or tailqueue a red-black tree is used to
+ * cope with arbitrary complexity of the blocks and missing segments. A
+ * queue has O(n) worst case behavior whereas the red-black tree is
+ * O(log n). This prevents complexity attacks where a long chain of
+ * blocks would have to be traversed to find the right place for the new
+ * segment.
*
- * Upon arrival of the missing segment(s) the whole chain of stored segments
- * is moved into the socket buffer. In case of multiple missing segments
- * the first consequtive part is moved with the remainder being kept in
- * store until the next missing segment arrives.
+ * For the segment merging into a block queue structure the operator can
+ * choose between time and space efficiency. For time efficiency only the
+ * head or tail mbuf chain pointers are updated with the new segment's mbuf.
+ * This results in a minimal amount of data accesses and no memory allocations
+ * unless a new block is created. For space efficiency mbufs and mbuf chains
+ * are compacted and possibly merged together. It may even allocate a new
+ * and larger mbuf(-chain) to store the segment data with less overhead and
+ * less wasted space. This makes it immune against mbuf exhaustion attacks
+ * where only tiny amounts of data are received per segment, possibly only
+ * one byte. Usually network drivers allocate only an mbuf cluster of 2KBytes
+ * on receive no matter how large the packet actually is for efficiency
+ * reasons and because they can't easily know at DMA time how large the
+ * packet actually is.
*
- * While in reassembly mode *all* arrving segments are put into the reassembly
- * queue.
+ * Limits, timeout. XXX
*
- * Instead of storing all segments on their own we build blocks of consequtive
- * segments chained together. We use a red-black tree to cope with arbitrary
- * complexity. If a segment matches the end of one block and the start of the
- * next block the two blocks are joined together. If no match is found a
- * new block is created.
+ * The reassembly queue block structure is also used to track SACK
+ * information as the data receiver. A double-linked list is added
+ * that tracks the blocks in LIFO order of their arrival or updating.
*
- * The reassembly queues block structure is also used to track SACK
- * information as a data receiver. A double-linked list is added
- * that links the blocks the reverse order of their arrival or updating.
- * This makes us fully compliant to RFC2018 Section 4 including all
- * optional parts marked as "SHOULD".
+ * Implemented / relevant RFC's:
+ * RFC793: Transmission Control Protocol
+ * RFC1123:
+ * RFC2018: This makes us fully compliant to RFC2018 Section 4 including all optional parts marked as "SHOULD".
+ * RFC2883:
*
* TODO:
* - Improve comments and annotate RFC references.
* - Style improvements.
+ * - Activate timeout on first insert.
+ * - Partial D-SACK support.
+ * - Return flags should be same minus FIN.
* - Lots of testing.
*/
@@ -132,7 +166,9 @@
static struct tcp_reass_block *
tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *);
-/* Trim empty mbufs from head of chain. */
+/*
+ * Trim empty mbufs from the head of an mbuf chain.
+ */
static struct mbuf *
m_trimhead(struct mbuf *m)
{
@@ -165,6 +201,7 @@
static __inline int
tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b)
{
+
if (SEQ_LT(a->trb_seqe, b->trb_seqs))
return (-1);
else if (SEQ_GT(a->trb_seqs, b->trb_seqe))
@@ -176,6 +213,9 @@
RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
+/*
+ * Verify the integrity of the reassembly queue.
+ */
#ifdef INVARIANTS
static int
tcp_reass_verify(struct tcpcb *tp, int prepost)
@@ -214,8 +254,12 @@
return (1);
}
-#endif
+#endif /* INVARIANTS */
+/*
+ * Remove a single block from the reassembly queue
+ * and free all its mbufs, if any.
+ */
static void
tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb)
{
@@ -229,6 +273,9 @@
uma_zfree(tcp_reass_zone, trb);
}
+/*
+ * Remove and free all blocks from the reassembly queue.
+ */
void
tcp_reass_flush(struct tcpcb *tp)
{
@@ -246,6 +293,10 @@
("%s: rcv_reass_size not zero after flushing", __func__));
}
+/*
+ * Move block to front of SACK list to report SACK blocks in LIFO order.
+ * RFC2018: section x
+ */
static __inline void
tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb)
{
@@ -257,7 +308,8 @@
}
/*
- * Insert segments into the reassembly queue.
+ * Insert segment into the reassembly queue and
+ * XXX append to socket buffer.
*
* NB: We must always consume the mbuf. Either by appeding it to
* the queue or by freeing it.
@@ -274,6 +326,12 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
/*
+ * Check if it is really necessary to do all the work.
+ */
+ if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
+ goto done;
+
+ /*
* Call with th==NULL after becoming established to
* force pre-ESTABLISHED data up to user socket.
* XXX: Was used for T/TCP of which code remains.
@@ -303,10 +361,6 @@
th_seq = th->th_seq;
th = NULL; /* Prevent further use. */
- /* Check if it is really neccessary to do all the work. */
- if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
- goto done;
-
KASSERT(SEQ_GEQ(th_seq, tp->rcv_nxt),
("%s: sequence number below rcv_nxt", __func__));
KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)),
@@ -374,9 +428,9 @@
}
/*
- * Get rid of packet header and mtags.
+ * Remove and free packet header and mtags.
* Trim empty mbufs from head of chain.
- * Compact mbuf chain.
+ * Compact the mbuf chain.
*/
m_demote(m, 1);
m_adj(m, hlen);
@@ -393,50 +447,68 @@
trbs.trb_mt = m_last(m);
/*
- * Return match that has at least partial overlap to either side or
- * insert a new reassembly block.
+ * Find a block that has at least partial overlap to either side.
+ * If no block is found either insert a new one or use the stack
+ * if the segment directly fits rcv_nxt.
+ * RFC793: xxx
+ * RFC2018: section 3
*/
if ((trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs)) != NULL) {
-
- /* Within an already received block. */
+ /*
+ * The new segment is a retransmit and within an already
+ * received block and thus a full duplicate.
+ *
+ * Report it with D-SACK only if it is a subset of and
+ * not equal to the already existing block.
+ * RFC2883: section 4, part (1) and (4)
+ * RFC2883: section 4.1, Reporting Full Duplicate Segments
+ */
if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) &&
SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) {
tcp_reass_sacktrack(tp, trb);
- tp->rcv_reass_dsack.start = trbs.trb_seqs;
- tp->rcv_reass_dsack.end = trbs.trb_seqe;
+ if (SEQ_GT(trbs.trb_seqs, trb->trb_seqs) ||
+ SEQ_LT(trbs.trb_seqe, trb->trb_seqe)) {
+ tp->rcv_reass_dsack.start = trbs.trb_seqs;
+ tp->rcv_reass_dsack.end = trbs.trb_seqe;
+ }
TCPSTAT_INC(tcps_rcvduppack);
TCPSTAT_ADD(tcps_rcvdupbyte, len);
goto done;
}
+ /*
+ * Merge the new segment with the already existing block.
+ */
tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
-
- /* Merge in the new segment. */
(void)tcp_reass_merge(trb, &trbs);
tcp_reass_sacktrack(tp, trb);
+ /*
+ * Update XXX
+ * RFC2883: section 4.2, Reporting Partial Duplicate Segments
+ * XXXAO: Add D-SACK block.
+ */
if ((len = SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe)) > 0) {
tp->rcv_reass_size -= len;
TCPSTAT_INC(tcps_rcvpartduppack);
TCPSTAT_ADD(tcps_rcvpartdupbyte, len);
}
- /* Merge in previous block(s) if there is overlap. */
+ /*
+ * Merge in previous/next block(s) if there is overlap.
+ */
while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) {
trbn = tcp_reass_merge(trb, trbn);
tcp_reass_free(tp, trbn);
TCPSTAT_INC(tcps_reass_merge);
}
-
- /* Merge in next block(s) if there is overlap. */
while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
trbn = tcp_reass_merge(trb, trbn);
tcp_reass_free(tp, trbn);
TCPSTAT_INC(tcps_reass_merge);
}
-
} else if (tp->rcv_nxt == th_seq) {
trb = &trbs;
} else if ((trb = (struct tcp_reass_block *)uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO))) != NULL) {
@@ -449,6 +521,8 @@
LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
tp->rcv_reass_blocks++;
+ if (RB_EMPTY(&tp->rcv_reass))
+ tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
TCPSTAT_INC(tcps_reass_blocks);
} else {
/* Memory allocation failure. */
@@ -458,8 +532,12 @@
KASSERT(tcp_reass_verify(tp, 1),
("%s: reassembly queue went inconsistent", __func__));
+ /*
+ * Deliver data if we've got the missing segment.
+ */
if (trb->trb_seqs == tp->rcv_nxt)
goto present;
+
return (0);
present:
@@ -475,7 +553,10 @@
TCPSTAT_INC(tcps_reass_missingseg);
SOCKBUF_LOCK(&so->so_rcv);
- /* We can only ever dequeue one block. */
+ /*
+ * We can only ever dequeue one consecutive
+ * block of data at most.
+ */
if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
sbappendstream_locked(&so->so_rcv, trb->trb_m);
tp->rcv_nxt += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
@@ -488,13 +569,19 @@
else
tcp_reass_free(tp, trb);
- /* NB: sorwakeup_locked() does a implicit socket buffer unlock. */
+ /* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
sorwakeup_locked(so);
/*
+ * Don't hold on to data in the reassembly queue for too long.
+ * Kernel memory is limited and if the connection doesn't make
+ * any progress in filling the holes we don't want to wait
+ * forever to prevent memory exhaustion attacks. The sender
+ * can always retransmit again.
+ * RFC2018: section 8, Data Receiver Reneging
+ *
* Restart the reassembly queue flush timer after advancing
- * the sequence space and if queue is not empty. Otherwise
- * deactivate it.
+ * the sequence space and if the queue still isn't empty.
*/
if (tcp_reass_timeout && !RB_EMPTY(&tp->rcv_reass))
tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
@@ -510,8 +597,8 @@
}
/*
- * Trim a block.
- * Positive value is from head, negative from tail.
+ * Trim a SACK block.
+ * A positive value is from head, negative from tail.
*/
static void
tcp_reass_trim(struct tcp_reass_block *trb, int i)
@@ -519,6 +606,10 @@
m_adj(trb->trb_m, i);
+ /*
+ * Update the tail pointer or free empty mbufs
+ * at the head of the chain.
+ */
if (i < 0) {
trb->trb_mt = m_last(trb->trb_m);
trb->trb_seqe -= i;
@@ -529,7 +620,10 @@
}
/*
- * Merge one or more consecutive blocks together.
+ * Merge one or more consecutive blocks together. The number of
+ * redundant bytes is reported as the difference between trbn->
+ * trb_seqs and trb_seqe.
+ *
* NB: trbn is always merged into trb!
*/
static struct tcp_reass_block *
@@ -543,7 +637,9 @@
SEQ_GT(trbn->trb_seqe, trb->trb_seqe),
("%s: blocks not overlapping", __func__));
- /* Replace, prepend and append. */
+ /*
+ * Replace, prepend or append a block.
+ */
if (SEQ_LEQ(trbn->trb_seqs, trb->trb_seqs) &&
SEQ_GEQ(trbn->trb_seqe, trb->trb_seqe)) {
@@ -596,10 +692,12 @@
}
/*
- * Put the sequence number of the reassembly queue blocks into
- * the SACK options of an outgoing segment.
- * RFC2018: section ...
- * RFC2883: section ...
+ * Put the sequence number of the reassembly queue blocks into the
+ * SACK options of an outgoing segment. If a D-SACK block is available
+ * insert it in the first position followed by the regular SACK blocks.
+ * RFC2018: section 3, Sack Option Format
+ * RFC2018: section 4, Generating Sack Options: Data Receiver Behavior
+ * RFC2883: section 4, Use of the SACK option for reporting a duplicate segment
*/
int
tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
@@ -614,7 +712,10 @@
KASSERT(!LIST_EMPTY(&tp->rcv_reass_sack),
("%s: sack list empty", __func__));
- /* Create fake SACK block for D-SACK and prepend it. */
+ /*
+ * Create fake SACK block on the stack for D-SACK and prepend it.
+ * RFC2883: section 4, part (3)
+ */
if (tp->rcv_reass_dsack.start != tp->rcv_reass_dsack.end) {
bzero(&trbs, sizeof(trbs));
trbs.trb_seqs = htonl(tp->rcv_reass_dsack.start);
@@ -625,7 +726,7 @@
/*
* The most recent block must appear first. Add the other
* blocks in most recent created or updated order.
- * RFC2018: section 4
+ * RFC2018: section 3 and 4, part (4) and (5)
*/
LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
if (numsacks < 1)
@@ -640,7 +741,11 @@
nsacks++;
}
- /* Remove fake D-SACK block again. */
+ /*
+ * Remove fake D-SACK block again and zero out the D-SACK
+ * information. It must be reported only once.
+ * RFC2883: section 4, part (2)
+ */
if (LIST_FIRST(&tp->rcv_reass_sack) == &trbs) {
LIST_REMOVE(&trbs, trb_sack);
tp->rcv_reass_dsack.start = 0;
More information about the p4-projects
mailing list