PERFORCE change 166378 for review

Andre Oppermann andre at FreeBSD.org
Tue Jul 21 19:13:04 UTC 2009


http://perforce.freebsd.org/chv.cgi?CH=166378

Change 166378 by andre at andre_t61 on 2009/07/21 19:12:41

	tcp_reass_verify() must return 1 to pass the master-KASSERT().
	tcp_reass_sacktrack() must not do LIST_REMOVE() when this is the first element to be inserted.
	Another lookup in present: is unnecessary; just use 'trb'.
	Do not allocate tracking structure for stand-alone missing segment, use the stack based one.
	Adjust KASSERT()s accordingly.
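
	For reference: KASSERT(expr, msg) panics when expr evaluates to
	zero, so with the old "return (0)" the verifier tripped its own
	master-KASSERT() even when the queue was perfectly consistent.
	The assertion in question, as it appears in this file:

		KASSERT(tcp_reass_verify(tp),
		    ("%s: reassembly queue inconsistent", __func__));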

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#38 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#38 (text+ko) ====

@@ -1,629 +1,633 @@
-/*-
- * Copyright (c) 2007-2009
- *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.363 2009/07/16 21:13:04 rwatson Exp $");
-
-/*
- * Operational overview of TCP reassembly:
- *
- * It is the purpose of tcp reassembly to store segments that are received
- * out of order.  This happens when packets are lost along the way due to
- * various reasons.  The most common one is traffic overload which causes
- * routers to stop accepting packets for brief moments.
- *
- * Upon arrival of the missing segment(s) the whole chain of stored segments
- * is moved into the socket buffer.  In case of multiple missing segments
- * the first consecutive part is moved, with the remainder being kept in
- * store until the next missing segment arrives.
- *
- * While in reassembly mode *all* arriving segments are put into the reassembly
- * queue.
- *
- * Instead of storing all segments on their own we build blocks of consecutive
- * segments chained together.  We use a red-black tree to cope with arbitrary
- * complexity.  If a segment matches the end of one block and the start of the
- * next block the two blocks are joined together.  If no match is found a
- * new block is created.
- *
- * The reassembly queue's block structure is also used to track SACK
- * information as a data receiver.  A doubly-linked list is added
- * that links the blocks in the reverse order of their arrival or update.
- * This makes us fully compliant with RFC 2018 Section 4, including all
- * optional parts marked as "SHOULD".
- *
- * TODO:
- * A further improvement is to merge the content of mbufs together if the
- * preceding one has enough space to hold the data of the new one.  When
- * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
- * them in place.  Only when trimming from the tail it actually frees them.
- * Normally we don't get mbuf chains so this isn't too much of a concern
- * right now.  Use m_collapse() to compact the mbuf chains within the
- * blocks.
- */
-
-#include "opt_inet.h"
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/systm.h>
-#include <sys/vimage.h>
-
-#include <vm/uma.h>
-
-#include <net/if.h>
-#include <net/route.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/ip.h>
-#include <netinet/ip_var.h>
-#include <netinet/ip_options.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-#include <netinet/tcpip.h>
-
-VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
-    "TCP Segment Reassembly Queue");
-
-static int tcp_reass_enable = 1;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_RW,
-    &tcp_reass_enable, 0,
-    "Enable/disable use of TCP reassembly queue");
-
-static int tcp_reass_maxblocks = 32;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RW,
-    &tcp_reass_maxblocks, 0,
-    "Per connection limit of TCP segment blocks in reassembly queue");
-
-static int tcp_reass_globalmaxblocks = 65535;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, globalmaxblocks, CTLFLAG_RDTUN,
-    &tcp_reass_globalmaxblocks, 0,
-    "Global limit of TCP segment blocks in reassembly queue");
-
-static int tcp_reass_timeout = 0;
-SYSCTL_PROC(_net_inet_tcp_reass, OID_AUTO, timeout, CTLTYPE_INT|CTLFLAG_RW,
-    &tcp_reass_timeout, 0, sysctl_msec_to_ticks, "I",
-    "Reassembly queue flush timeout in milliseconds");
-
-static int tcp_reass_spacetime = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
-    &tcp_reass_spacetime, 0,
-    "Reassembly queue strategy of space vs. time efficiency");
-
-static struct tcp_reass_block *
-    tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *);
-
-/* Trim empty mbufs from head of chain. */
-static struct mbuf *
-m_trimhead(struct mbuf *m)
-{
-	struct mbuf *n;
-
-	while (m->m_len == 0) {
-		n = m;
-		m = m->m_next;
-		m_free(n);
-	}
-	return (m);
-}
-
-#if 0
-static u_int
-m_storagesize(struct mbuf *m)
-{
-	u_int mcnt;
-
-	for (mcnt = 0; m != NULL; m = m->m_next)
-		mcnt += (m->m_flags & M_EXT) ?
-		    m->m_ext.ext_size + MSIZE : MSIZE;
-	return (mcnt);
-}
-#endif
-
-/*
- * Initialize TCP reassembly zone on startup.
- */
-void
-tcp_reass_init(void)
-{
-
-	TUNABLE_INT_FETCH("net.inet.tcp.reass.globalmaxblocks",
-	    &tcp_reass_globalmaxblocks);
-	tcp_reass_zone = uma_zcreate("tcpreass", sizeof(struct tcp_reass_block),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_globalmaxblocks);
-	tcp_reass_timeout = 30 * hz;
-}
-
-/*
- * Compare function implementing the ranged lookup on the RB tree.
- * NB: The tree must never have any overlapping elements.
- */
-static __inline int
-tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b)
-{
-	if (SEQ_LT(a->trb_seqe, b->trb_seqs))
-		return (-1);
-	else if (SEQ_GT(a->trb_seqs, b->trb_seqe))
-		return (1);
-	else
-		return (0);
-}
-
-RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
-RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
-
-#ifdef INVARIANTS
-static int
-tcp_reass_verify(struct tcpcb *tp)
-{
-	int i = 0, size = 0, total = 0;
-	struct mbuf *m;
-	struct tcp_reass_block *trb, *trbn;
-
-	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
-		KASSERT(SEQ_LT(trb->trb_seqs, trb->trb_seqe),
-		    ("%s: trb_seqs >= trb_seqe", __func__));
-		KASSERT(SEQ_GT(trb->trb_seqs, tp->rcv_nxt),
-		    ("%s: rcv_nxt >= trb_seqs", __func__));
-		KASSERT(trb->trb_m != NULL,
-		    ("%s: trb_m == NULL", __func__));
-		KASSERT(trb->trb_mt != NULL,
-		    ("%s: trb_mt == NULL", __func__));
-		size = SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
-		KASSERT(size == m_length(trb->trb_m, &m),
-		    ("%s: seq# size != actual mbuf size", __func__));
-		KASSERT(trb->trb_mt == m,
-		    ("%s: trb_mt is not last mbuf", __func__));
-		KASSERT(trbn == NULL || SEQ_LT(trb->trb_seqe, trbn->trb_seqs),
-		    ("%s: overlaps into next block", __func__));
-		total += size;
-		i++;
-	}
-	KASSERT(tp->rcv_reass_size == total,
-	    ("%s: total not correct", __func__));
-
-	LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
-		i--;
-	}
-	KASSERT(i == 0,
-	    ("%s: sack list incorrect", __func__));
-
-	return (0);
-}
-#endif
-
-static void
-tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb)
-{
-
-	trb = RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
-	KASSERT(trb != NULL, ("%s: RB_REMOVE failed", __func__));
-	LIST_REMOVE(trb, trb_sack);
-	if (trb->trb_m != NULL)
-		m_freem(trb->trb_m);
-	tp->rcv_reass_size -= SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
-	tp->rcv_reass_blocks--;
-	uma_zfree(tcp_reass_zone, trb);
-}
-
-void
-tcp_reass_flush(struct tcpcb *tp)
-{
-	struct tcp_reass_block *trb, *trbn;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-	KASSERT(tcp_reass_verify(tp),
-	    ("%s: reassembly queue inconsistent", __func__));
-
-	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
-		tcp_reass_free(tp, trb);
-	}
-	KASSERT(tp->rcv_reass_size == 0, ("%s: rcv_reass_size not zero", __func__));
-}
-
-static __inline void
-tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb)
-{
-
-	if (LIST_FIRST(&tp->rcv_reass_sack) != trb) {
-		LIST_REMOVE(trb, trb_sack);
-		LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
-	}
-}
-
-/*
- * Insert segments into the reassembly queue.
- *
- * NB: We must always consume the mbuf, either by appending it to
- * the queue or by freeing it.
- */
-int
-tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
-{
-	int thflags = 0;
-	tcp_seq th_seq;
-	struct socket *so = tp->t_inpcb->inp_socket;
-	struct tcp_reass_block *trb, *trbn;
-	struct tcp_reass_block trbs;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-
-	/*
-	 * Call with th==NULL after becoming established to
-	 * force pre-ESTABLISHED data up to user socket.
-	 * XXX: Was used for T/TCP of which code remains.
-	 */
-	if (th == NULL) {
-		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
-		    RB_EMPTY(&tp->rcv_reass) ||
-		    ((trb = RB_MIN(tcp_ra, &tp->rcv_reass)) &&
-		     trb->trb_seqs != tp->rcv_nxt))
-			return (0);
-		goto present;
-	}
-
-	KASSERT(th != NULL, ("%s: th is NULL", __func__));
-	KASSERT(tlenp != NULL, ("%s: tlenp is NULL", __func__));
-	KASSERT(m != NULL, ("%s: m is NULL", __func__));
-	KASSERT(*tlenp == m_length(m, NULL),
-	    ("%s: tlen != mbuf length", __func__));
-
-	/*
-	 * Store TCP header information in local variables as
-	 * we may lose access to it after mbuf compacting.
-	 */
-	thflags = th->th_flags;
-	th_seq = th->th_seq;
-	th = NULL;		/* Prevent further use. */
-
-	/* Check if it is really necessary to do all the work. */
-	if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
-		goto done;
-
-	KASSERT(SEQ_LT(tp->rcv_nxt, th_seq),
-	    ("%s: sequence number below rcv_nxt", __func__));
-	KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)),
-	    ("%s: got missing segment but queue is empty", __func__));
-	KASSERT(tcp_reass_verify(tp),
-	    ("%s: reassembly queue inconsistent", __func__));
-
-	/*
-	 * Limit the number of segments in the reassembly queue to prevent
-	 * holding on to too many segments (and thus running out of mbufs).
-	 * Make sure to let through the missing segment that caused this
-	 * queue to be used.
-	 *
-	 * Count the gross space used by the mbufs in the reassembly queue
-	 * and limit it to the free space in the socket buffer.  This way
-	 * the reassembly queue can never consume more mbuf space than the
-	 * socket buffer got allocated anyway and it reflects the actual
-	 * amount of kernel memory used.  This effectively prevents mbuf
-	 * exhaustion due to pathological traffic (one byte segments with
-	 * a hole each time) on a single connection.
-	 *
-	 * Counting the gross mbuf space effectively sets the net data
-	 * limit lower than the socket buffer would allow.
-	 * To avoid underestimating the effective free space in the socket
-	 * buffer vs. the actual data (2k clusters carrying 1500 byte
-	 * packets), a correction factor of 11/8 is introduced.
-	 */
-	if (th_seq != tp->rcv_nxt &&
-	    tp->rcv_reass_blocks > tcp_reass_maxblocks) {
-		//(sbspace(&so->so_rcv) / 8 * 11)
-		TCPSTAT_INC(tcps_reass_overflow);
-		TCPSTAT_INC(tcps_rcvmemdrop);
-		goto done;
-	}
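
Note: the gross-space accounting described in the comment above is not
wired up yet; m_storagesize() sits under #if 0 and the sbspace()
expression is commented out in the block limit check.  A sketch of the
shape such a check could take, reusing those two helpers (this exact
combination is an assumption, not committed code):

	/*
	 * Sketch only: bound the gross mbuf space of the arriving
	 * segment by the socket buffer's free space, applying the
	 * 11/8 correction for 1500 byte packets in 2k clusters.
	 */
	if (th_seq != tp->rcv_nxt &&
	    m_storagesize(m) > sbspace(&so->so_rcv) / 8 * 11) {
		TCPSTAT_INC(tcps_reass_overflow);
		TCPSTAT_INC(tcps_rcvmemdrop);
		goto done;
	}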
-
-	/*
-	 * FIN handling is a bit tricky.
-	 * We cannot trust a FIN that goes into the reassembly queue.
-	 * It can be easily spoofed as it may be anywhere in the receive
-	 * window (see RST attack mitigation in tcp-secure).
-	 * For this reason (and complexity avoidance) we generally ignore
- * any FIN arriving at the reassembly queue, with one exception:
- * when it exactly matches rcv_nxt together with any data in the
-	 * same segment we can conclude it to be genuine and proceed with
-	 * flushing any other data waiting in the reassembly queue.
-	 * A FIN is part of the sequence space and will get retransmitted
-	 * if it was genuine.
-	 * This approach is based on a discussion on TCPM mailing list.
-	 */
-	if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
-		tcp_reass_flush(tp);
-		if (m->m_len == 0) {
-			tcp_timer_activate(tp, TT_REASS, 0);
-			return (thflags);
-		}
-	} else if (*tlenp == 0)
-		goto done;
-	else
-		thflags &= ~TH_FIN;
-
-	/* Get rid of packet header and mtags. */
-	m_demote(m, 1);
-	/* Trim empty mbufs from head of chain. */
-	m = m_trimhead(m);
-	/* Compact mbuf chain. */
-	if (tcp_reass_spacetime)
-		m = m_collapse(m, M_DONTWAIT, 1024);
-
-	KASSERT(m != NULL, ("%s: m is NULL after collapse", __func__));
-
-	/* Set up search structure. */
-	trbs.trb_seqs = th_seq;
-	trbs.trb_seqe = th_seq + *tlenp;
-	trbs.trb_m = m;
-	trbs.trb_mt = m_last(m);
-
-	/*
-	 * Return match that has at least partial overlap to either side or
-	 * insert a new reassembly block.
-	 */
-	if ((trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs)) != NULL) {
-		/* Within an already known block. */
-		if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) &&
-		    SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) {
-			tcp_reass_sacktrack(tp, trb);
-			tp->rcv_reass_dsack.start = trbs.trb_seqs;
-			tp->rcv_reass_dsack.end = trbs.trb_seqe;
-			goto done;
-		}
-		tp->rcv_reass_size += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
-
-		/* Extends the end, common case. */
-		if (SEQ_GT(trbs.trb_seqe, trb->trb_seqe)) {
-			(void)tcp_reass_merge(trb, &trbs);
-			tcp_reass_sacktrack(tp, trb);
-
-			/* Merge in next blocks if there is overlap. */
-			while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
-			    SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
-				trbn = tcp_reass_merge(trb, trbn);
-				tcp_reass_free(tp, trbn);
-			}
-		}
-
-		/* Extends the start. */
-		if (SEQ_LT(trbs.trb_seqs, trb->trb_seqs)) {
-			(void)tcp_reass_merge(trb, &trbs);
-			tcp_reass_sacktrack(tp, trb);
-
-			/* Merge in previous blocks if there is overlap. */
-			while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass, trb)) != NULL &&
-			    SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) {
-				trbn = tcp_reass_merge(trb, trbn);
-				tcp_reass_free(tp, trbn);
-			}
-		}
-	} else if ((trb = (struct tcp_reass_block *)uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO))) != NULL) {
-		trb->trb_seqs = trbs.trb_seqs;
-		trb->trb_seqe = trbs.trb_seqe;
-		trb->trb_m = trbs.trb_m;
-		trb->trb_mt = trbs.trb_mt;
-		trbn = RB_INSERT(tcp_ra, &tp->rcv_reass, trb);
-		KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__));
-		tcp_reass_sacktrack(tp, trb);
-		tp->rcv_reass_size += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
-		tp->rcv_reass_blocks++;
-	} else if (tp->rcv_nxt == th_seq) {
-		trbn = RB_INSERT(tcp_ra, &tp->rcv_reass, &trbs);
-		KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__));
-	}
-	if (tp->rcv_nxt == th_seq)
-		goto present;
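
A note on the stack-based block: when zone allocation fails but this is
the missing segment, trbs is inserted directly into the tree and lives
only until the present: section dequeues it again.  The matching
teardown is the trb == &trbs branch below (quoted from this file),
which avoids handing a stack address to uma_zfree():

	if (trb == &trbs) {
		RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
		if (trb->trb_m != NULL)
			m_freem(trb->trb_m);
	} else
		tcp_reass_free(tp, trb);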
-
-	KASSERT(tcp_reass_verify(tp),
-	    ("%s: reassembly queue inconsistent", __func__));
-	return (0);
-
-present:
-	/*
-	 * Present data to user, advancing rcv_nxt through the
-	 * completed sequence space.
-	 */
-	KASSERT(!RB_EMPTY(&tp->rcv_reass),
-	    ("%s: queue empty at present", __func__));
-	KASSERT((RB_MIN(tcp_ra, &tp->rcv_reass))->trb_seqs == tp->rcv_nxt,
-	    ("%s: first block does not match rcv_nxt", __func__));
-	TCPSTAT_INC(tcps_reass_missingseg);
-
-	SOCKBUF_LOCK(&so->so_rcv);
-	/* We can only ever dequeue one block. */
-	trb = RB_MIN(tcp_ra, &tp->rcv_reass);
-	if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) {
-		sbappendstream_locked(&so->so_rcv, trb->trb_m);
-		tp->rcv_nxt += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
-		trb->trb_m = NULL;
-		trb->trb_mt = NULL;
-	}
-	if (trb == &trbs) {
-		RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
-		if (trb->trb_m != NULL)
-			m_freem(trb->trb_m);
-	} else
-		tcp_reass_free(tp, trb);
-
-	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
-	sorwakeup_locked(so);
-
-	/*
-	 * Restart the reassembly queue flush timer after advancing
-	 * the sequence space and if the queue is not empty.  Otherwise
-	 * deactivate it.
-	 */
-	if (tcp_reass_timeout && !RB_EMPTY(&tp->rcv_reass))
-		tcp_timer_activate(tp, TT_REASS,
-		    tp->t_rxtcur * tcp_reass_timeout);
-	else
-		tcp_timer_activate(tp, TT_REASS, 0);
-
-	ND6_HINT(tp);
-	return (thflags);
-
-done:
-	m_freem(m);
-	*tlenp = 0;
-	return (0);
-}
-
-/*
- * Merge one or more consecutive blocks together.
- * Always merge trbn into trb!
- */
-static struct tcp_reass_block *
-tcp_reass_merge(struct tcp_reass_block *trb, struct tcp_reass_block *trbn)
-{
-	int i;
-
-	KASSERT(trb != NULL && trbn != NULL,
-	    ("%s: incomplete input", __func__));
-
-	/* Append and prepend. */
-	if (SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) {
-		if (SEQ_GEQ(trb->trb_seqe, trbn->trb_seqe))
-			return (trbn);
-		if ((i = SEQ_DELTA(trb->trb_seqe, trbn->trb_seqs)) > 0) {
-			m_adj(trbn->trb_m, i);
-			trbn->trb_m = m_trimhead(trbn->trb_m);
-		}
-		trb->trb_seqe = trbn->trb_seqe;
-		trb->trb_mt->m_next = trbn->trb_m;
-		if (tcp_reass_spacetime) {
-			trb->trb_mt = m_collapse(trb->trb_mt, M_DONTWAIT, 1024);
-			trb->trb_mt = m_last(trb->trb_mt);
-		} else
-			trb->trb_mt = trbn->trb_mt;
-	} else if (SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) {
-		if (SEQ_LEQ(trb->trb_seqs, trbn->trb_seqs))
-			return (trbn);
-		if ((i = SEQ_DELTA(trb->trb_seqs, trbn->trb_seqe)) > 0) {
-			m_adj(trb->trb_m, i);
-			trb->trb_m = m_trimhead(trb->trb_m);
-		}
-		trb->trb_seqs = trbn->trb_seqs;
-		trbn->trb_mt->m_next = trb->trb_m;
-		trb->trb_m = trbn->trb_m;
-		if (tcp_reass_spacetime) {
-			trbn->trb_mt = m_collapse(trbn->trb_mt, M_DONTWAIT, 1024);
-			trb->trb_mt = m_last(trbn->trb_mt);
-		}
-	} else
-		return (NULL);
-
-	trbn->trb_seqs = 0;
-	trbn->trb_seqe = i;
-	trbn->trb_m = NULL;
-	trbn->trb_mt = NULL;
-	return (trbn);		
-}
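
To make the trim arithmetic in the append path concrete, here is a
small self-contained model using plain integers instead of tcp_seq and
mbufs (a sketch for illustration, not kernel code):

	#include <stdio.h>

	/*
	 * Model of the append path: trb [10,20) absorbs trbn [15,25).
	 * The overlap (trb_seqe - trbn_seqs) is trimmed from the head
	 * of trbn, as m_adj() does, before its data is chained on.
	 */
	int
	main(void)
	{
		unsigned trb_seqs = 10, trb_seqe = 20;
		unsigned trbn_seqs = 15, trbn_seqe = 25;
		int i = (int)(trb_seqe - trbn_seqs);	/* overlap: 5 */

		if (i > 0)
			printf("m_adj() trims %d bytes from trbn\n", i);
		trb_seqe = trbn_seqe;		/* extend the block */
		printf("merged block: [%u,%u)\n", trb_seqs, trb_seqe);
		return (0);
	}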
-
-/*
- * Put the sequence number of the reassembly queue blocks into
- * the SACK options of an outgoing segment.
- *  RFC2018: section ...
- *  RFC2883: section ...
- */
-int
-tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
-{
-	int nsacks = 0;
-	tcp_seq sack_seq;
-	struct tcp_reass_block *trb;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-	KASSERT(numsacks > 0,
-	    ("%s: zero sack blocks to add", __func__));
-	KASSERT(!LIST_EMPTY(&tp->rcv_reass_sack),
-	    ("%s: sack list empty", __func__));
-
-	/* DSACK */
-	if (tp->rcv_reass_dsack.start != tp->rcv_reass_dsack.end) {
-		sack_seq = htonl(tp->rcv_reass_dsack.start);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		sack_seq = htonl(tp->rcv_reass_dsack.end);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		tp->rcv_reass_dsack.start = 0;
-		tp->rcv_reass_dsack.end = 0;
-		numsacks--;
-		nsacks++;
-	}
-
-	/*
-	 * The most recent block must appear first.  Add the other
-	 * blocks in most recent created or updated order.
-	 *  RFC2018: section 4
-	 */
-	LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
-		if (numsacks < 1)
-			break;
-		sack_seq = htonl(trb->trb_seqs);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		sack_seq = htonl(trb->trb_seqe);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		numsacks--;
-		nsacks++;
-	}
-
-	return (nsacks);
-}
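
The option encoding above is simply pairs of 32-bit block boundaries in
network byte order.  A self-contained sketch of emitting one SACK block
the same way the LIST_FOREACH loop does (userspace model; the values
are made up):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <arpa/inet.h>

	int
	main(void)
	{
		unsigned char opt[8], *optp = opt;
		uint32_t sack_seq;

		sack_seq = htonl(1000);		/* left edge, trb_seqs */
		memcpy(optp, &sack_seq, sizeof(sack_seq));
		optp += sizeof(sack_seq);
		sack_seq = htonl(2000);		/* right edge, trb_seqe */
		memcpy(optp, &sack_seq, sizeof(sack_seq));

		printf("encoded %zu option bytes\n", sizeof(opt));
		return (0);
	}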
-
-#ifdef DDB
-static void
-db_print_reassblocks(struct tcpcb *tp)
-{
-	struct tcp_reass_block *trb;
-
-	RB_FOREACH(trb, tcp_ra, &tp->rcv_reass) {
-		db_printf(" reass block 0x%08x - 0x%08x\n",
-		    trb->trb_seqs, trb->trb_seqe);
-	}
-}
-#endif
+/*-
+ * Copyright (c) 2007-2009
+ *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.363 2009/07/16 21:13:04 rwatson Exp $");
+
+/*
+ * Operational overview of TCP reassembly:
+ *
+ * It is the purpose of tcp reassembly to store segments that are received
+ * out of order.  This happens when packets are lost along the way due to
+ * various reasons.  The most common one is traffic overload which causes
+ * routers to stop accepting packets for brief moments.
+ *
+ * Upon arrival of the missing segment(s) the whole chain of stored segments
+ * is moved into the socket buffer.  In case of multiple missing segments
+ * the first consecutive part is moved, with the remainder being kept in
+ * store until the next missing segment arrives.
+ *
+ * While in reassembly mode *all* arriving segments are put into the reassembly
+ * queue.
+ *
+ * Instead of storing all segments on their own we build blocks of consecutive
+ * segments chained together.  We use a red-black tree to cope with arbitrary
+ * complexity.  If a segment matches the end of one block and the start of the
+ * next block the two blocks are joined together.  If no match is found a
+ * new block is created.
+ *
+ * The reassembly queue's block structure is also used to track SACK
+ * information as a data receiver.  A doubly-linked list is added
+ * that links the blocks in the reverse order of their arrival or update.
+ * This makes us fully compliant with RFC 2018 Section 4, including all
+ * optional parts marked as "SHOULD".
+ *
+ * TODO:
+ * A further improvement is to merge the content of mbufs together if the
+ * preceding one has enough space to hold the data of the new one.  When
+ * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
+ * them in place.  Only when trimming from the tail it actually frees them.
+ * Normally we don't get mbuf chains so this isn't too much of a concern
+ * right now.  Use m_collapse() to compact the mbuf chains within the
+ * blocks.
+ */
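
All sequence number comparisons in this file go through the SEQ_*
macros, which use modular 32-bit arithmetic so blocks compare correctly
across sequence space wraparound.  A minimal standalone illustration,
assuming the usual FreeBSD definition of SEQ_LT():

	#include <stdio.h>
	#include <stdint.h>

	#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)

	int
	main(void)
	{
		uint32_t a = 0xfffffff0;	/* just before wrap */
		uint32_t b = 0x00000010;	/* just after wrap */

		/* Plain "<" gets wraparound wrong; SEQ_LT() does not. */
		printf("a < b (plain):  %d\n", a < b);		/* 0 */
		printf("SEQ_LT(a, b):   %d\n", SEQ_LT(a, b));	/* 1 */
		return (0);
	}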
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/vimage.h>
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_options.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+
+VNET_DEFINE(uma_zone_t, tcp_reass_zone);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+    "TCP Segment Reassembly Queue");
+
+static int tcp_reass_enable = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_RW,
+    &tcp_reass_enable, 0,
+    "Enable/disable use of TCP reassembly queue");
+
+static int tcp_reass_maxblocks = 32;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RW,
+    &tcp_reass_maxblocks, 0,
+    "Per connection limit of TCP segment blocks in reassembly queue");
+
+static int tcp_reass_globalmaxblocks = 65535;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, globalmaxblocks, CTLFLAG_RDTUN,
+    &tcp_reass_globalmaxblocks, 0,
+    "Global limit of TCP segment blocks in reassembly queue");
+
+static int tcp_reass_timeout = 0;
+SYSCTL_PROC(_net_inet_tcp_reass, OID_AUTO, timeout, CTLTYPE_INT|CTLFLAG_RW,
+    &tcp_reass_timeout, 0, sysctl_msec_to_ticks, "I",
+    "Reassembly queue flush timeout in milliseconds");
+
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+    &tcp_reass_spacetime, 0,
+    "Reassembly queue strategy of space vs. time efficiency");
+
+static struct tcp_reass_block *
+    tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *);
+
+/* Trim empty mbufs from head of chain. */
+static struct mbuf *
+m_trimhead(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	while (m->m_len == 0) {
+		n = m;
+		m = m->m_next;
+		m_free(n);
+	}
+	return (m);
+}
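
Note that m_trimhead() assumes at least one mbuf in the chain carries
data; if every mbuf were empty, the loop would dereference a NULL m
after freeing the last one.  The callers guarantee non-empty data, but
a defensively guarded variant would look like this (sketch only):

	/* Tolerate a fully empty chain by checking m first. */
	while (m != NULL && m->m_len == 0) {
		n = m;
		m = m->m_next;
		m_free(n);
	}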
+
+#if 0
+static u_int
+m_storagesize(struct mbuf *m)
+{
+	u_int mcnt;
+
+	for (mcnt = 0; m != NULL; m = m->m_next)
+		mcnt += (m->m_flags & M_EXT) ?
+		    m->m_ext.ext_size + MSIZE : MSIZE;
+	return (mcnt);
+}
+#endif
+
+/*
+ * Initialize TCP reassembly zone on startup.
+ */
+void
+tcp_reass_init(void)
+{
+
+	TUNABLE_INT_FETCH("net.inet.tcp.reass.globalmaxblocks",
+	    &tcp_reass_globalmaxblocks);
+	tcp_reass_zone = uma_zcreate("tcpreass", sizeof(struct tcp_reass_block),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_zone_set_max(tcp_reass_zone, tcp_reass_globalmaxblocks);
+	tcp_reass_timeout = 30 * hz;
+}
+
+/*
+ * Compare function implementing the ranged lookup on the RB tree.
+ * NB: The tree must never have any overlapping elements.
+ */
+static __inline int
+tcp_reass_cmp(struct tcp_reass_block *a, struct tcp_reass_block *b)
+{
+	if (SEQ_LT(a->trb_seqe, b->trb_seqs))
+		return (-1);
+	else if (SEQ_GT(a->trb_seqs, b->trb_seqe))
+		return (1);
+	else
+		return (0);
+}
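
Because the comparison returns 0 for any overlap rather than only exact
equality, RB_FIND() with a stack search element performs a ranged
lookup: it returns whichever block at least partially overlaps the new
segment, or NULL.  This is the pattern tcp_reass() uses (names from
this file):

	struct tcp_reass_block trbs, *trb;

	trbs.trb_seqs = th_seq;			/* segment start */
	trbs.trb_seqe = th_seq + *tlenp;	/* segment end */
	/* Any block overlapping [trb_seqs, trb_seqe), or NULL. */
	trb = RB_FIND(tcp_ra, &tp->rcv_reass, &trbs);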
+
+RB_PROTOTYPE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
+RB_GENERATE_STATIC(tcp_ra, tcp_reass_block, trb_rb, tcp_reass_cmp);
+
+#ifdef INVARIANTS
+static int
+tcp_reass_verify(struct tcpcb *tp)
+{
+	int i = 0, size = 0, total = 0;
+	struct mbuf *m;
+	struct tcp_reass_block *trb, *trbn;
+
+	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+		KASSERT(SEQ_LT(trb->trb_seqs, trb->trb_seqe),
+		    ("%s: trb_seqs >= trb_seqe", __func__));
+		KASSERT(SEQ_GT(trb->trb_seqs, tp->rcv_nxt),
+		    ("%s: rcv_nxt >= trb_seqs", __func__));
+		KASSERT(trb->trb_m != NULL,
+		    ("%s: trb_m == NULL", __func__));
+		KASSERT(trb->trb_mt != NULL,
+		    ("%s: trb_mt == NULL", __func__));
+		size = SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+		KASSERT(size == m_length(trb->trb_m, &m),
+		    ("%s: seq# size != actual mbuf size", __func__));
+		KASSERT(trb->trb_mt == m,
+		    ("%s: trb_mt is not last mbuf", __func__));
+		KASSERT(trbn == NULL || SEQ_LT(trb->trb_seqe, trbn->trb_seqs),
+		    ("%s: overlaps into next block", __func__));
+		total += size;
+		i++;
+	}
+	KASSERT(tp->rcv_reass_size == total,
+	    ("%s: total not correct", __func__));
+
+	LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) {
+		i--;
+	}
+	KASSERT(i == 0,
+	    ("%s: sack list incorrect", __func__));
+
+	return (1);
+}
+#endif
+
+static void
+tcp_reass_free(struct tcpcb *tp, struct tcp_reass_block *trb)
+{
+
+	trb = RB_REMOVE(tcp_ra, &tp->rcv_reass, trb);
+	KASSERT(trb != NULL, ("%s: RB_REMOVE failed", __func__));
+	LIST_REMOVE(trb, trb_sack);
+	if (trb->trb_m != NULL)
+		m_freem(trb->trb_m);
+	tp->rcv_reass_size -= SEQ_DELTA(trb->trb_seqs, trb->trb_seqe);
+	tp->rcv_reass_blocks--;
+	uma_zfree(tcp_reass_zone, trb);
+}
+
+void
+tcp_reass_flush(struct tcpcb *tp)
+{
+	struct tcp_reass_block *trb, *trbn;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tcp_reass_verify(tp),
+	    ("%s: reassembly queue inconsistent", __func__));
+
+	RB_FOREACH_SAFE(trb, tcp_ra, &tp->rcv_reass, trbn) {
+		tcp_reass_free(tp, trb);
+	}
+	KASSERT(tp->rcv_reass_size == 0, ("%s: rcv_reass_size not zero", __func__));
+}
+
+static __inline void
+tcp_reass_sacktrack(struct tcpcb *tp, struct tcp_reass_block *trb)
+{
+
+	if (LIST_FIRST(&tp->rcv_reass_sack) != trb) {
+		if (!LIST_EMPTY(&tp->rcv_reass_sack))
+			LIST_REMOVE(trb, trb_sack);
+		LIST_INSERT_HEAD(&tp->rcv_reass_sack, trb, trb_sack);
+	}
+}
+
+/*
+ * Insert segments into the reassembly queue.
+ *
+ * NB: We must always consume the mbuf, either by appending it to
+ * the queue or by freeing it.
+ */
+int
+tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
+{
+	int thflags = 0;
+	tcp_seq th_seq;
+	struct socket *so = tp->t_inpcb->inp_socket;
+	struct tcp_reass_block *trb = NULL, *trbn;
+	struct tcp_reass_block trbs;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	/*
+	 * Call with th==NULL after becoming established to
+	 * force pre-ESTABLISHED data up to user socket.
+	 * XXX: Was used for T/TCP of which code remains.
+	 */
+	if (th == NULL) {
+		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
+		    RB_EMPTY(&tp->rcv_reass) ||
+		    ((trb = RB_MIN(tcp_ra, &tp->rcv_reass)) &&
+		     trb->trb_seqs != tp->rcv_nxt))
+			return (0);
+		trb = RB_MIN(tcp_ra, &tp->rcv_reass);
+		goto present;
+	}
+
+	KASSERT(th != NULL, ("%s: th is NULL", __func__));
+	KASSERT(tlenp != NULL, ("%s: tlenp is NULL", __func__));
+	KASSERT(m != NULL, ("%s: m is NULL", __func__));
+	KASSERT(*tlenp == m_length(m, NULL),
+	    ("%s: tlen != mbuf length", __func__));
+
+	/*
+	 * Store TCP header information in local variables as
+	 * we may lose access to it after mbuf compacting.
+	 */
+	thflags = th->th_flags;
+	th_seq = th->th_seq;
+	th = NULL;		/* Prevent further use. */
+
+	/* Check if it is really necessary to do all the work. */
+	if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass))
+		goto done;
+
+	KASSERT(SEQ_LT(tp->rcv_nxt, th_seq),
+	    ("%s: sequence number below rcv_nxt", __func__));
+	KASSERT(!(tp->rcv_nxt == th_seq) || !(RB_EMPTY(&tp->rcv_reass)),
+	    ("%s: got missing segment but queue is empty", __func__));
+	KASSERT(tcp_reass_verify(tp),
+	    ("%s: reassembly queue already inconsistent", __func__));
+
+	/*
+	 * Limit the number of segments in the reassembly queue to prevent
+	 * holding on to too many segments (and thus running out of mbufs).
+	 * Make sure to let through the missing segment that caused this
+	 * queue to be used.
+	 *
+	 * Count the gross space used by the mbufs in the reassembly queue
+	 * and limit it to the free space in the socket buffer.  This way
+	 * the reassembly queue can never consume more mbuf space than the
+	 * socket buffer got allocated anyway and it reflects the actual
+	 * amount of kernel memory used.  This effectively prevents mbuf
+	 * exhaustion due to pathological traffic (one byte segments with
+	 * a hole each time) on a single connection.
+	 *
+	 * Counting the gross mbuf space effectively sets the net data
+	 * limit lower than the socket buffer would allow.
+	 * To avoid underestimating the effective free space in the socket
+	 * buffer vs. the actual data (2k clusters carrying 1500 byte
+	 * packets), a correction factor of 11/8 is introduced.
+	 */

>>> TRUNCATED FOR MAIL (1000 lines) <<<

