PERFORCE change 180313 for review
Andre Oppermann
andre at FreeBSD.org
Tue Jun 29 08:26:52 UTC 2010
http://p4web.freebsd.org/@@180313?ac=10
Change 180313 by andre at andre_t61 on 2010/06/29 08:26:05
Import new reassembly queue implementation from //depot/projects/tcp_reass/netinet/tcp_reass.c #56/56
Affected files ...
.. //depot/projects/tcp_new/netinet/tcp_reass.c#3 edit
Differences ...
==== //depot/projects/tcp_new/netinet/tcp_reass.c#3 (text+ko) ====
@@ -1,6 +1,6 @@
/*-
- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
- * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 2007-2009
+ * Andre Oppermann, Internet Business Solutions AG. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -27,14 +27,85 @@
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.353 2007/10/07 20:44:24 silby Exp $");
+__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.364 2009/08/01 19:26:27 rwatson Exp $");
+
+/*
+ * Operational overview of TCP reassembly:
+ *
+ * The purpose of TCP reassembly is to store segments that are received
+ * out of order. This happens when packets are lost along the way for
+ * various reasons. The most common one is traffic overload, which causes
+ * routers to drop packets for brief moments.
+ *
+ * Upon arrival of the missing segment the whole chain of stored contiguous
+ * segments is moved into the socket buffer at once. In case of multiple
+ * missing segments the remainder is kept in the queue until the next
+ * missing segment arrives.
+ *
+ * When the TCP receiver is in reassembly state *all* arriving segments are
+ * passed through the reassembly queue until all missing segments have been
+ * received and all data is dequeued.
+ *
+ * Implementation details and choices:
+ *
+ * Instead of storing all segments on their own and tracking each with
+ * a queue structure, we build blocks of contiguous segments merged
+ * together. This way one missing segment needs only one queue structure
+ * tracking the whole block of following contiguous segments. If a new
+ * segment matches the end of one block and the start of the next block
+ * the two are joined together. If no match is found a new block is created.
+ *
+ * Instead of a classical list or tailqueue a red-black tree is used to
+ * cope with arbitrary complexity of the blocks and missing segments. A
+ * queue has O(n) worst case behavior whereas the red-black tree is
+ * O(log n). This prevents complexity attacks where a long chain of
+ * blocks would have to be traversed to find the right place for the new
+ * segment. Especially with high bandwidth*delay product links and large
+ * socket buffers this is a valid concern.
+ *
+ * For merging segments into a block queue structure the operator can
+ * choose between time and space efficiency. For time efficiency only the
+ * head or tail mbuf chain pointers are updated with the new segment's mbuf.
+ * This results in a minimal amount of data accesses and no memory allocations
+ * unless a new block is created. For space efficiency mbufs and mbuf chains
+ * are compacted and possibly merged together. It may even allocate a new
+ * and larger mbuf(-chain) to store the segment data with less overhead and
+ * less wasted space. This makes it immune to mbuf exhaustion attacks
+ * where only tiny amounts of data are received per segment, possibly only
+ * one byte. Network drivers usually allocate a full 2 kByte mbuf cluster
+ * per received packet, no matter how small the packet actually is, for
+ * efficiency reasons and because they can't easily know at DMA time how
+ * large the packet effectively is.
+ *
+ * To prevent resource exhaustion attacks a local and a global limit govern
+ * the number of reassembly blocks. The local limit prevents single connections
+ * from monopolizing the global limit. When used in space efficient mode
+ * the total memory consumption of the reassembly queue can't be more than
+ * the receive socket buffer size. To prevent lost connections from holding
+ * on to queued data for too long, a timeout flushes all queued data.
+ *
+ * The reassembly queue block structure is also used to track SACK information
+ * on the data receiver side. A doubly-linked list is added that tracks the
+ * blocks in LIFO order of their arrival or last update.
+ *
+ * Implemented / relevant RFCs:
+ * RFC793: Transmission Control Protocol
+ * RFC1122: section 4.2.2.20 and section 4.2.2.21
+ * RFC2018: SACK, section 4 including all optional parts marked as "SHOULD"
+ * RFC2883: D-SACK, section 4
+ *
+ * TODO:
+ * - D-SACK when only one SACK slot available?
+ * - Direct pointer to highest seqnum block in RB-tree?
+ * - Remove T/TCP gonk.
+ * - Lots of testing.
+ */
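
To make the block-coalescing idea above concrete, the following is a minimal,
hypothetical userland sketch (plain integers instead of tcp_seq, no sequence
wraparound handling, no mbufs) of how out-of-order segments coalesce into
contiguous blocks. The kernel code replaces the linear scans shown here with
the red-black tree keyed by the same overlap test:

	#include <stdio.h>

	/* Hypothetical, simplified block: a [start, end) byte range. */
	struct block {
		int	start;
		int	end;
		int	used;
	};

	#define MAXBLOCKS 8
	static struct block q[MAXBLOCKS];

	/* Insert a segment, merging it with any block it overlaps or abuts. */
	static void
	insert(int start, int end)
	{
		int i, j;

		for (i = 0; i < MAXBLOCKS; i++) {
			if (!q[i].used || end < q[i].start || start > q[i].end)
				continue;
			/* Overlap or abutment: extend this block ... */
			if (start < q[i].start)
				q[i].start = start;
			if (end > q[i].end)
				q[i].end = end;
			/* ... and absorb any other block it now touches. */
			for (j = 0; j < MAXBLOCKS; j++) {
				if (j == i || !q[j].used ||
				    q[j].start > q[i].end ||
				    q[j].end < q[i].start)
					continue;
				if (q[j].start < q[i].start)
					q[i].start = q[j].start;
				if (q[j].end > q[i].end)
					q[i].end = q[j].end;
				q[j].used = 0;
			}
			return;
		}
		/* No overlap found: create a new block. */
		for (i = 0; i < MAXBLOCKS; i++) {
			if (!q[i].used) {
				q[i].start = start;
				q[i].end = end;
				q[i].used = 1;
				return;
			}
		}
	}

	int
	main(void)
	{
		int i;

		insert(200, 300);	/* one block: [200,300) */
		insert(400, 500);	/* two blocks: [200,300) [400,500) */
		insert(300, 400);	/* gap filled: single block [200,500) */
		for (i = 0; i < MAXBLOCKS; i++)
			if (q[i].used)
				printf("block [%d,%d)\n", q[i].start, q[i].end);
		return (0);
	}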
#include "opt_inet.h"
-#include "opt_inet6.h"
-#include "opt_tcpdebug.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -50,236 +121,668 @@
#include <net/if.h>
#include <net/route.h>
+#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
-#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
-#include <netinet/ip6.h>
-#include <netinet6/in6_pcb.h>
-#include <netinet6/ip6_var.h>
-#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
-#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
-#ifdef TCPDEBUG
-#include <netinet/tcp_debug.h>
-#endif /* TCPDEBUG */
+
+uma_zone_t tcp_reass_zone;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
"TCP Segment Reassembly Queue");
-static int tcp_reass_maxseg = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
- &tcp_reass_maxseg, 0,
- "Global maximum number of TCP Segments in Reassembly Queue");
+static int tcp_reass_enable = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_RW,
+ &tcp_reass_enable, 0,
+ "Enable/disable use of TCP reassembly queue");
+
+static int tcp_reass_maxblocks = 32;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RW,
+ &tcp_reass_maxblocks, 0,
+ "Per connection limit of TCP segment blocks in reassembly queue");
+
+static int tcp_reass_globalmaxblocks = 65535;
+SYSCTL_PROC(_net_inet_tcp_reass, OID_AUTO, globalmaxblocks,
+ CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_TUN, &tcp_reass_zone, 0,
+ sysctl_zonelimit, "I",
+ "Global limit of TCP segment blocks in reassembly queue");
+
+static int tcp_reass_timeout = 0;
+SYSCTL_PROC(_net_inet_tcp_reass, OID_AUTO, timeout, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_reass_timeout, 0, sysctl_msec_to_ticks, "I",
+ "Reassembly queue flush timeout in milliseconds");
+
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+ &tcp_reass_spacetime, 0,
+ "Reassembly queue strategy of space vs. time efficiency");
+
+static struct tcp_reass_block *
+ tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *);
+
+/*
+ * Trim empty mbufs from the head of an mbuf chain.
+ */
+static struct mbuf *
+m_trimhead(struct mbuf *m)
+{
+
+ while (m != NULL && m->m_len == 0)
+ m = m_free(m);
+
+ return (m);
+}
+
+/*
+ * Initialize TCP reassembly zone on startup.
+ */
+void
+tcp_reass_init(void)
+{
+
+ TUNABLE_INT_FETCH("net.inet.tcp.reass.globalmaxblocks",
+ &tcp_reass_globalmaxblocks);
+ tcp_reass_zone = uma_zcreate("tcpreass", sizeof(struct tcp_reass_block),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_zone_set_max(tcp_reass_zone, tcp_reass_globalmaxblocks);
+ tcp_reass_timeout = 2 * TCPTV_MSL;
+}
+
+/*
+ * Compare function implementing the ranged lookup on the RB tree.
+ * NB: The tree must never have any overlapping elements.
+ */
+static __inline int
+tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b)
+{
+
+ if (SEQ_LT(a->trb_seqe, b->trb_seqs))
+ return (-1);
+ else if (SEQ_GT(a->trb_seqs, b->trb_seqe))
+ return (1);
+ else
+ return (0);
+}
+
+RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
+RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
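
Because tcp_reass_cmp() reports two blocks as equal whenever their sequence
ranges overlap or abut, RB_FIND() with a throw-away search key doubles as a
ranged lookup. A minimal sketch of the pattern, using the field names from
this patch as tcp_reass() does further down:

	struct tcp_reass_block key;

	key.trb_seqs = th_seq;		/* first sequence number of the segment */
	key.trb_seqe = th_seq + len;	/* one past the last sequence number */
	/* Returns a block overlapping or abutting [th_seq, th_seq+len), or NULL. */
	trb = RB_FIND(tcp_ra, &tp->rcv_reass, &key);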
+
+/*
+ * Verify the integrity of the reassembly queue.
+ */
+#ifdef INVARIANTS
+static int
+tcp_reass_verify(struct tcpcb *tp, int prepost)
+{
+ int i = 0, size = 0, total = 0;
+ struct mbuf *m;
+ struct tcp_reass_block *trb, *trbn;
-int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
- &tcp_reass_qsize, 0,
- "Global number of TCP Segments currently in Reassembly Queue");
+ RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+ KASSERT(SEQ_LT(trb->trb_seqs, trb->trb_seqe),
+ ("%s: trb_seqs >= trb_seqe", __func__));
+ KASSERT(SEQ_GT(trb->trb_seqs, tp->rcv_nxt) ||
+ prepost, ("%s: rcv_nxt >= trb_seqs", __func__));
+ KASSERT(trb->trb_m != NULL,
+ ("%s: trb_m == NULL", __func__));
+ KASSERT(trb->trb_mt != NULL,
+ ("%s: trb_mt == NULL", __func__));
+ size = SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+ KASSERT(size == m_length(trb->trb_m, &m),
+ ("%s: seq# size != actual mbuf size", __func__));
+ KASSERT(trb->trb_mt == m,
+ ("%s: trb_mt is not last mbuf", __func__));
+ KASSERT(trbn == NULL || SEQ_LT(trb->trb_seqe, trbn->trb_seqs),
+ ("%s: overlaps into next block", __func__));
+ total += size;
+ i++;
+ }
+ KASSERT(tp->rcv_reass_size == total,
+ ("%s: rcv_reass_size not correct", __func__));
-static int tcp_reass_maxqlen = 48;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW,
- &tcp_reass_maxqlen, 0,
- "Maximum number of TCP Segments per individual Reassembly Queue");
+ LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
+ i--;
+ }
+ KASSERT(i == 0,
+ ("%s: sack list incorrect", __func__));
-static int tcp_reass_overflows = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
- &tcp_reass_overflows, 0,
- "Global number of TCP Segment Reassembly Queue Overflows");
+ return (1);
+}
+#endif /* INVARIANTS */
-/* Initialize TCP reassembly queue */
+/*
+ * Remove a single block from the reassembly queue
+ * and free all of its mbufs, if any.
+ */
static void
-tcp_reass_zone_change(void *tag)
+tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb)
{
- tcp_reass_maxseg = nmbclusters / 16;
- uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
+ trb = RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
+ KASSERT(trb != NULL, ("%s: RB_REMOVE failed", __func__));
+ LIST_REMOVE(trb, trb_sack);
+ m_freem(trb->trb_m);
+ tp->rcv_reass_size -= SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+ tp->rcv_reass_blocks--;
+ uma_zfree(tcp_reass_zone, trb);
}
-uma_zone_t tcp_reass_zone;
+/*
+ * Remove and free all blocks from the reassembly queue.
+ */
+void
+tcp_reass_flush(struct tcpcb *tp)
+{
+ struct tcp_reass_block *trb, *trbn;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(tcp_reass_verify(tp, 0),
+ ("%s: reassembly queue inconsistent", __func__));
+
+ RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+ tcp_reass_free(tp, trb);
+ }
+
+ KASSERT(tp->rcv_reass_size == 0,
+ ("%s: rcv_reass_size not zero after flushing", __func__));
+}
-void
-tcp_reass_init(void)
+/*
+ * Move block to front of SACK list to report SACK blocks in LIFO order.
+ * RFC2018: section 4
+ */
+static __inline void
+tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb)
{
- tcp_reass_maxseg = nmbclusters / 16;
- TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
- &tcp_reass_maxseg);
- tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg);
- EVENTHANDLER_REGISTER(nmbclusters_change,
- tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+ if (LIST_FIRST(&tp->rcv_reass_sack) != trb) {
+ LIST_REMOVE(trb, trb_sack);
+ LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
+ }
}
+/*
+ * Integrate the new segment into the reassembly queue. When the segment
+ * matches RCV.NXT append it to the socket buffer including all eligible
+ * data from the reassembly queue.
+ *
+ * NB: We must always consume the mbuf, either by appending it to
+ * the queue or by freeing it.
+ */
int
-tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
+tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int len, int hlen)
{
- struct tseg_qent *q;
- struct tseg_qent *p = NULL;
- struct tseg_qent *nq;
- struct tseg_qent *te = NULL;
+ int thflags = 0;
+ tcp_seq th_seq;
struct socket *so = tp->t_inpcb->inp_socket;
- int flags;
+ struct tcp_reass_block *trb = NULL, *trbn;
+ struct tcp_reass_block trbs;
- INP_LOCK_ASSERT(tp->t_inpcb);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
/*
- * XXX: tcp_reass() is rather inefficient with its data structures
- * and should be rewritten (see NetBSD for optimizations). While
- * doing that it should move to its own file tcp_reass.c.
+	 * Check if it is really necessary to do all the work.
*/
+ if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
+ goto done;
/*
- * Call with th==NULL after become established to
+ * Call with th==NULL after becoming established to
* force pre-ESTABLISHED data up to user socket.
+	 * XXX: Was used for T/TCP, of which some code remains.
*/
- if (th == NULL)
+ if (th == NULL) {
+ if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
+ RB_EMPTY(&tp->rcv_reass) ||
+ ((trb = RB_MIN(tcp_ra, &tp->rcv_reass)) &&
+ trb->trb_seqs != tp->rcv_nxt))
+ return (0);
+ trb = RB_MIN(tcp_ra, &tp->rcv_reass);
goto present;
+ }
+
+ KASSERT(th != NULL, ("%s: th is NULL", __func__));
+ KASSERT(m != NULL, ("%s: m is NULL", __func__));
+ KASSERT(len + hlen == m_length(m, NULL),
+ ("%s: len + hlen != mbuf length", __func__));
+ KASSERT(hlen <= m_length(m, NULL),
+ ("%s: hlen > mbuf length", __func__));
+
+ /*
+ * Store TCP header information in local variables as
+ * we may lose access to it after header dropping and
+ * mbuf compacting.
+ */
+ thflags = th->th_flags;
+ th_seq = th->th_seq;
+ th = NULL; /* Prevent further use. */
+
+ KASSERT(SEQ_GEQ(th_seq, tp->rcv_nxt),
+ ("%s: sequence number below rcv_nxt", __func__));
+ KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)),
+ ("%s: got missing segment but queue is empty", __func__));
+ KASSERT(tcp_reass_verify(tp, 0),
+ ("%s: reassembly queue already inconsistent", __func__));
/*
* Limit the number of segments in the reassembly queue to prevent
* holding on to too many segments (and thus running out of mbufs).
* Make sure to let the missing segment through which caused this
- * queue. Always keep one global queue entry spare to be able to
- * process the missing segment.
+ * queue.
+ *
+ * Count the gross space used by the mbufs in the reassembly queue
+ * and limit it to the free space in the socket buffer. This way
+ * the reassembly queue can never consume more mbuf space than the
+ * socket buffer got allocated anyway and it reflects the actual
+ * amount of kernel memory used. This effectively prevents mbuf
+ * exhaustion due to pathological traffic (one byte segments with
+ * a hole each time) on a single connection.
+ *
+ * Counting the gross mbuf space effectively sets the net data
+ * limit lower than the socket buffer would allow.
+	 * To avoid underestimating the effective free space in the socket
+	 * buffer vs. actual real data with 2k clusters and 1500 byte
+	 * packets, a correction factor of 11/8 is introduced
+	 * (2048 / 1500 is roughly 11/8).
*/
- if (th->th_seq != tp->rcv_nxt &&
- (tcp_reass_qsize + 1 >= tcp_reass_maxseg ||
- tp->t_segqlen >= tcp_reass_maxqlen)) {
- tcp_reass_overflows++;
- tcpstat.tcps_rcvmemdrop++;
- m_freem(m);
- *tlenp = 0;
- return (0);
+ if (th_seq != tp->rcv_nxt &&
+ tp->rcv_reass_blocks > tcp_reass_maxblocks) {
+ //(sbspace(&so->so_rcv) / 8 * 11)
+ TCPSTAT_INC(tcps_reass_overflow);
+ TCPSTAT_INC(tcps_rcvmemdrop);
+ goto done;
}
/*
- * Allocate a new queue entry. If we can't, or hit the zone limit
- * just drop the pkt.
+ * FIN handling is a bit tricky.
+ * We cannot trust a FIN that goes into the reassembly queue.
+ * It can be easily spoofed as it may be anywhere in the receive
+ * window (see RST attack mitigation in tcp-secure).
+ * For this reason (and complexity avoidance) we generally ignore
+	 * any FIN arriving at the reassembly queue, with one exception:
+	 * when it exactly matches rcv_nxt together with any data in the
+ * same segment we can conclude it to be genuine and proceed with
+ * flushing any other data waiting in the reassembly queue.
+ * A FIN is part of the sequence space and will get retransmitted
+ * if it was genuine.
+ * This approach is based on a discussion on TCPM mailing list.
*/
- te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
- if (te == NULL) {
- tcpstat.tcps_rcvmemdrop++;
- m_freem(m);
- *tlenp = 0;
- return (0);
+ if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
+ tcp_reass_flush(tp);
+ if (m->m_len == 0) {
+ tcp_timer_activate(tp, TT_REASS, 0);
+ return (thflags);
+ }
+ } else if (len == 0)
+ goto done;
+ else
+ thflags &= ~TH_FIN;
+
+ /* Statistics. */
+ if (tp->rcv_nxt != th_seq) {
+ TCPSTAT_INC(tcps_rcvoopack);
+ TCPSTAT_ADD(tcps_rcvoobyte, len);
}
- tp->t_segqlen++;
- tcp_reass_qsize++;
/*
- * Find a segment which begins after this one does.
+ * Remove and free packet header and mtags.
+ * Trim empty mbufs from head of chain.
+ * Compact the mbuf chain.
*/
- LIST_FOREACH(q, &tp->t_segq, tqe_q) {
- if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
- break;
- p = q;
- }
+ m_demote(m, 1);
+ m_adj(m, hlen);
+ m = m_trimhead(m);
+ if (tcp_reass_spacetime && m->m_next != NULL)
+ m = m_collapse(m, M_DONTWAIT, 1024);
+
+ KASSERT(m != NULL, ("%s: m is NULL after collapse", __func__));
+
+ /* Set up search structure. */
+ trbs.trb_seqs = th_seq;
+ trbs.trb_seqe = th_seq + len;
+ trbs.trb_m = m;
+ trbs.trb_mt = m_last(m);
/*
- * If there is a preceding segment, it may provide some of
- * our data already. If so, drop the data from the incoming
- * segment. If it provides all of our data, drop us.
+ * Find a block that has at least partial overlap to either side.
+ * If no block is found either insert a new one or use the stack
+ * if the segment directly fits rcv_nxt.
+ * RFC793: section 3.9, page 69-76
+ * RFC2018: section 3
*/
- if (p != NULL) {
- int i;
- /* conversion to int (in i) handles seq wraparound */
- i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
- if (i > 0) {
- if (i >= *tlenp) {
- tcpstat.tcps_rcvduppack++;
- tcpstat.tcps_rcvdupbyte += *tlenp;
- m_freem(m);
- uma_zfree(tcp_reass_zone, te);
- tp->t_segqlen--;
- tcp_reass_qsize--;
- /*
- * Try to present any queued data
- * at the left window edge to the user.
- * This is needed after the 3-WHS
- * completes.
- */
- goto present; /* ??? */
+ if ((trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs)) != NULL) {
+ /*
+ * The new segment is a retransmit and within an already
+ * received block and thus a full duplicate.
+ *
+ * Report it with D-SACK only if it is a subset of and
+ * not equal to the already existing block.
+ * RFC2883: section 4, part (1) and (4)
+ * RFC2883: section 4.1, Reporting Full Duplicate Segments
+ */
+ if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) &&
+ SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) {
+ tcp_reass_sacktrack(tp, trb);
+ if (SEQ_GT(trbs.trb_seqs, trb->trb_seqs) ||
+ SEQ_LT(trbs.trb_seqe, trb->trb_seqe)) {
+ tp->rcv_reass_dsack.start = trbs.trb_seqs;
+ tp->rcv_reass_dsack.end = trbs.trb_seqe;
}
- m_adj(m, i);
- *tlenp -= i;
- th->th_seq += i;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, len);
+ goto done;
+ }
+
+ /*
+ * Merge the new segment with the already existing block.
+ */
+ tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
+ (void)tcp_reass_merge(trb, &trbs);
+ tcp_reass_sacktrack(tp, trb);
+
+ /*
+ * Update the D-SACK information.
+ * RFC2883: section 4.2, Reporting Partial Duplicate Segments
+ */
+ if ((len = SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe)) > 0) {
+ tp->rcv_reass_size -= len;
+ tp->rcv_reass_dsack.start = trbs.trb_seqs;
+ tp->rcv_reass_dsack.end = trbs.trb_seqe;
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, len);
+ }
+
+ /*
+ * Merge in previous/next block(s) if there is overlap.
+ */
+ while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
+ SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) {
+ trbn = tcp_reass_merge(trb, trbn);
+ tcp_reass_free(tp, trbn);
+ TCPSTAT_INC(tcps_reass_merge);
+ }
+ while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
+ SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
+ trbn = tcp_reass_merge(trb, trbn);
+ tcp_reass_free(tp, trbn);
+ TCPSTAT_INC(tcps_reass_merge);
+ }
+ } else if (tp->rcv_nxt == th_seq) {
+ /*
+ * For segments attaching to RCV.NXT do not allocate
+ * a new block structure to prevent failure under tight
+ * memory conditions. Instead use temporary stack based
+ * storage.
+ */
+ trb = &trbs;
+ } else if ((trb = (struct tcp_reass_block *)uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO))) != NULL) {
+		/* Insert new block as no eligible existing block for merging was found. */
+ trb->trb_seqs = trbs.trb_seqs;
+ trb->trb_seqe = trbs.trb_seqe;
+ trb->trb_m = trbs.trb_m;
+ trb->trb_mt = trbs.trb_mt;
+ trbn = RB_INSERT(tcp_ra, &tp->rcv_reass, trb);
+ KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__));
+ LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
+ tp->rcv_reass_size += SEQ_DELTA(trbs.trb_seqs, trbs.trb_seqe);
+ tp->rcv_reass_blocks++;
+		if (tp->rcv_reass_blocks == 1) {
+ KASSERT(tcp_timer_active(tp, TT_REASS) == 0,
+ ("%s: reassembly timer already active", __func__));
+ tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
}
+ TCPSTAT_INC(tcps_reass_blocks);
+ } else {
+ /* Memory allocation failure. */
+ TCPSTAT_INC(tcps_rcvmemdrop);
+ goto done;
}
- tcpstat.tcps_rcvoopack++;
- tcpstat.tcps_rcvoobyte += *tlenp;
+ KASSERT(tcp_reass_verify(tp, 1),
+ ("%s: reassembly queue went inconsistent", __func__));
+
+ /* Deliver data if we've got the missing segment. */
+ if (trb->trb_seqs == tp->rcv_nxt)
+ goto present;
+
+ return (0);
+present:
/*
- * While we overlap succeeding segments trim them or,
- * if they are completely covered, dequeue them.
+ * Present data to user, advancing rcv_nxt through the
+ * completed sequence space.
*/
- while (q) {
- int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
- if (i <= 0)
- break;
- if (i < q->tqe_len) {
- q->tqe_th->th_seq += i;
- q->tqe_len -= i;
- m_adj(q->tqe_m, i);
- break;
- }
+ KASSERT(trb != NULL,
+ ("%s: queue empty at present", __func__));
+ KASSERT(trb->trb_seqs == tp->rcv_nxt,
+ ("%s: first block does not match rcv_nxt", __func__));
+
+ TCPSTAT_INC(tcps_reass_missingseg);
- nq = LIST_NEXT(q, tqe_q);
- LIST_REMOVE(q, tqe_q);
- m_freem(q->tqe_m);
- uma_zfree(tcp_reass_zone, q);
- tp->t_segqlen--;
- tcp_reass_qsize--;
- q = nq;
+ SOCKBUF_LOCK(&so->so_rcv);
+ /*
+	 * We can dequeue at most one contiguous
+	 * block of data.
+ */
+ if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
+ sbappendstream_locked(&so->so_rcv, trb->trb_m);
+ tp->rcv_nxt += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+ trb->trb_m = NULL;
+ trb->trb_mt = NULL;
}
- /* Insert the new segment queue entry into place. */
- te->tqe_m = m;
- te->tqe_th = th;
- te->tqe_len = *tlenp;
+ if (trb == &trbs)
+		m_freem(trb->trb_m); /* NB: trb_m can be != NULL */
+ else
+ tcp_reass_free(tp, trb);
+
+ /* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
+ sorwakeup_locked(so);
+
+ /*
+ * Don't hold on to data in the reassembly queue for too long.
+ * Kernel memory is limited and if the connection doesn't make
+	 * any progress in filling the holes we don't want to hold on
+	 * to the data forever and risk memory exhaustion. The sender
+	 * can always retransmit.
+ * RFC2018: section 8, Data Receiver Reneging
+ *
+ * Restart the reassembly queue flush timer after advancing
+ * the sequence space and if the queue still isn't empty.
+ */
+ if (tcp_reass_timeout && !RB_EMPTY(&tp->rcv_reass))
+ tcp_timer_activate(tp, TT_REASS, tcp_reass_timeout);
+ else if (tcp_timer_active(tp, TT_REASS))
+ tcp_timer_activate(tp, TT_REASS, 0);
+
+ ND6_HINT(tp);
+ return (thflags);
+
+done:
+ m_freem(m);
+ return (0);
+}
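
A short worked example of the flow above, with simplified sequence numbers
and wraparound ignored; assume rcv_nxt = 100 and an empty queue:

	segment [200,300) arrives:  no overlap, not at rcv_nxt -> new block;
	                            queue: [200,300)
	segment [400,500) arrives:  queue: [200,300) [400,500);
	                            SACK list: [400,500) [200,300)  (LIFO)
	segment [300,400) arrives:  RB_FIND() returns one of the abutting blocks,
	                            the merge loops absorb the other;
	                            queue: [200,500)
	segment [100,200) arrives:  merges to [100,500); trb_seqs == rcv_nxt, so
	                            all 400 bytes are appended to the socket
	                            buffer, rcv_nxt becomes 500, the queue is
	                            empty and the reassembly timer is stopped.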
+
+/*
+ * Trim a reassembly block.
+ * A positive value trims from the head, a negative value from the tail.
+ */
+static void
+tcp_reass_trim(struct tcp_reass_block *trb, int i)
+{
+
+ m_adj(trb->trb_m, i);
- if (p == NULL) {
- LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+ /*
+ * Update the tail pointer or free empty mbufs
+ * at the head of the chain.
+ */
+ if (i < 0) {
+ trb->trb_mt = m_last(trb->trb_m);
+		trb->trb_seqe += i;
} else {
- LIST_INSERT_AFTER(p, te, tqe_q);
+ trb->trb_m = m_trimhead(trb->trb_m);
+ trb->trb_seqs += i;
+ }
+}
+
+/*
+ * Merge one or more consecutive blocks together. The number of
+ * redundant bytes is reported as the difference between trbn->trb_seqs
+ * and trbn->trb_seqe.
+ *
+ * NB: trbn is always merged into trb!
+ */
+static struct tcp_reass_block *
+tcp_reass_merge(struct tcp_reass_block *trb, struct tcp_reass_block *trbn)
+{
+ int i;
+ tcp_seq s;
+
+ KASSERT(trb != NULL && trbn != NULL,
+ ("%s: incomplete input", __func__));
+ KASSERT(SEQ_LT(trbn->trb_seqs, trb->trb_seqs) ||
+ SEQ_GT(trbn->trb_seqe, trb->trb_seqe),
+ ("%s: blocks not overlapping", __func__));
+
+ /*
+ * Replace, prepend or append a block.
+ */
+ if (SEQ_LEQ(trbn->trb_seqs, trb->trb_seqs) &&
+ SEQ_GEQ(trbn->trb_seqe, trb->trb_seqe)) {
+
+		i = SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+ s = trb->trb_seqs;
+
+ m_freem(trb->trb_m);
+ trb->trb_seqs = trbn->trb_seqs;
+ trb->trb_seqe = trbn->trb_seqe;
+ trb->trb_m = trbn->trb_m;
+ trb->trb_mt = trbn->trb_mt;
+
+ TCPSTAT_INC(tcps_reass_replace);
+
+ } else if (SEQ_LT(trbn->trb_seqs, trb->trb_seqs)) {
+
+ if ((i = SEQ_DELTA(trb->trb_seqs, trbn->trb_seqe)) > 0)
+ tcp_reass_trim(trbn, -i);
+
+ s = trb->trb_seqs;
+ trb->trb_seqs = trbn->trb_seqs;
+ trbn->trb_mt->m_next = trb->trb_m;
+ trb->trb_m = trbn->trb_m;
+
+ if (tcp_reass_spacetime) {
+ trbn->trb_mt = m_collapse(trbn->trb_mt, M_DONTWAIT, 1024);
+ trb->trb_mt = m_last(trbn->trb_mt);
+ }
+
+ } else if (SEQ_GT(trbn->trb_seqe, trb->trb_seqe)) {
+
+ if ((i = SEQ_DELTA(trb->trb_seqe, trbn->trb_seqs)) > 0)
+ tcp_reass_trim(trbn, i);
+
+ s = trb->trb_seqe;
+ trb->trb_seqe = trbn->trb_seqe;
+ trb->trb_mt->m_next = trbn->trb_m;
+
+ if (tcp_reass_spacetime) {
+ trb->trb_mt = m_collapse(trb->trb_mt, M_DONTWAIT, 1024);
+ trb->trb_mt = m_last(trb->trb_mt);
+ } else
+ trb->trb_mt = trbn->trb_mt;
+
+ } else
+ return (NULL);
+
+ trbn->trb_seqs = s;
+ trbn->trb_seqe = trbn->trb_seqs + i;
+ trbn->trb_m = NULL;
+ trbn->trb_mt = NULL;
+ return (trbn);
+}
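
For illustration, the append case above with concrete numbers (wraparound
ignored):

	before:  trb  = [100,300)  mbuf chain A
	         trbn = [250,400)  mbuf chain B
	merge:   the 50 overlapping bytes are trimmed from the head of B,
	         B is linked after A (and collapsed if tcp_reass_spacetime
	         is set)
	after:   trb  = [100,400)  carrying chain A+B
	         trbn is returned holding a 50-byte span so the caller can
	         account the partial duplicate for D-SACK and statistics.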
+
+/*
+ * Put the sequence number of the reassembly queue blocks into the
+ * SACK options of an outgoing segment. If a D-SACK block is available
+ * insert it in the first position followed by the regular SACK blocks.
+ * RFC2018: section 3, Sack Option Format
+ * RFC2018: section 4, Generating Sack Options: Data Receiver Behavior
+ * RFC2883: section 4, Use of the SACK option for reporting a duplicate segment
+ */
+int
+tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
+{
+ int nsacks = 0;
+ tcp_seq sack_seq;
+ struct tcp_reass_block *trb, trbs;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ KASSERT(numsacks > 0,
+ ("%s: zero sack blocks to add", __func__));
+ KASSERT(!LIST_EMPTY(&tp->rcv_reass_sack),
+ ("%s: sack list empty", __func__));
+
+ /*
+ * Create fake SACK block on the stack for D-SACK and prepend it.
+ * RFC2883: section 4, part (3)
+ */
+ if (tp->rcv_reass_dsack.start != tp->rcv_reass_dsack.end) {
+ bzero(&trbs, sizeof(trbs));
+		trbs.trb_seqs = tp->rcv_reass_dsack.start;
+		trbs.trb_seqe = tp->rcv_reass_dsack.end;
+ LIST_INSERT_HEAD(&tp->rcv_reass_sack, &trbs, trb_sack);
+ }
+
+ /*
+ * The most recent block must appear first. Add the other
+ * blocks in most recent created or updated order.
+ * RFC2018: section 3 and 4, part (4) and (5)
+ */
+ LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
+ if (numsacks < 1)
+ break;
+ sack_seq = htonl(trb->trb_seqs);
+ bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+ optp += sizeof(sack_seq);
+ sack_seq = htonl(trb->trb_seqe);
+ bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+ optp += sizeof(sack_seq);
+ numsacks--;
+ nsacks++;
}
-present:
/*
- * Present data to user, advancing rcv_nxt through
- * completed sequence space.
+ * Remove fake D-SACK block again and zero out the D-SACK
+ * information. It must be reported only once.
+ * RFC2883: section 4, part (2)
*/
- if (!TCPS_HAVEESTABLISHED(tp->t_state))
- return (0);
- q = LIST_FIRST(&tp->t_segq);
- if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
- return (0);
- SOCKBUF_LOCK(&so->so_rcv);
- do {
- tp->rcv_nxt += q->tqe_len;
- flags = q->tqe_th->th_flags & TH_FIN;
- nq = LIST_NEXT(q, tqe_q);
- LIST_REMOVE(q, tqe_q);
- if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
- m_freem(q->tqe_m);
- else
- sbappendstream_locked(&so->so_rcv, q->tqe_m);
- uma_zfree(tcp_reass_zone, q);
- tp->t_segqlen--;
- tcp_reass_qsize--;
- q = nq;
- } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
- //ND6_HINT(tp);
- sorwakeup_locked(so);
- return (flags);
+ if (LIST_FIRST(&tp->rcv_reass_sack) == &trbs) {
+ LIST_REMOVE(&trbs, trb_sack);
+ tp->rcv_reass_dsack.start = 0;
+ tp->rcv_reass_dsack.end = 0;
+ }
+
+ return (nsacks);
+}
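
As an illustration of the emitted option payload: with the SACK list holding
[400,500) then [200,300) (most recently updated first), a pending D-SACK of
[250,260) and numsacks = 3, the 24 bytes written at optp are the six 32-bit
sequence numbers 250, 260, 400, 500, 200, 300 in network byte order, and 3 is
returned; the D-SACK range is then cleared so it is reported only once.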
+
+#ifdef DDB
+static void
+db_print_reassblocks(struct tcpcb *tp)
+{
+ struct tcp_reass_block *trb;
+
+ RB_FOREACH(trb, tcp_ra, &tp->rcv_reass) {
+ db_printf(" reass block 0x%08x - 0x%08x\n",
+ trb->trb_seqs, trb->trb_seqe);
+ }
}
+#endif