PERFORCE change 166159 for review

Andre Oppermann andre at FreeBSD.org
Thu Jul 16 08:49:25 UTC 2009


http://perforce.freebsd.org/chv.cgi?CH=166159

Change 166159 by andre at andre_t61 on 2009/07/16 08:49:06

	Move queue integrity test to its own function.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#30 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#30 (text+ko) ====

@@ -1,731 +1,740 @@
-/*-
- * Copyright (c) 2007
- *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
-
-/*
- * Operational overview of TCP reassembly:
- *
- * The purpose of TCP reassembly is to store segments that are received
- * out of order.  This happens when packets are lost along the way for
- * various reasons.  The most common one is traffic overload, which causes
- * routers to stop accepting packets for brief moments.
- *
- * Upon arrival of the missing segment(s) the whole chain of stored segments
- * is moved into the socket buffer.  In case of multiple missing segments
- * only the first consecutive part is moved, with the remainder being kept
- * until the next missing segment arrives.
- *
- * While in reassembly mode *all* arriving segments are put into the
- * reassembly queue.
- *
- * Instead of storing each segment on its own we build blocks of consecutive
- * segments chained together.  We use a tailq because a new segment is most
- * likely to fit at the tail of the chain.  Failing that, it is next most
- * likely to be the missing segment that fits at the beginning of the
- * chain.  Otherwise we cycle through each consecutive block until a match
- * is found.  If a segment matches the end of one block and the start of the
- * next block the two blocks are joined together.  If no match is found a
- * new block is created.
- *
- * This scheme is very efficient and copes well with long chains
- * and many holes.
- *
- * trq_tail ----------------------------------------------\
- * trq_head --> [block] ------>	[block] ------>	[block] <-/
- *		m_next		m_next		m_next
- *		   |		   |		   |
- *		m_next		m_next		m_next
- *		   |		   |		   |
- *		m_next		m_next		m_next
- *
- *
- * The reassembly queue's block structure is also used to track SACK
- * information as a data receiver.  A doubly-linked list is added
- * that links the blocks in the reverse order of their arrival or
- * update.  This makes us fully compliant with RFC 2018 Section 4,
- * including all optional parts marked as "SHOULD".
- *
- * TODO:
- * A further improvement is to merge the content of mbufs together if the
- * preceding one has enough space to hold the data of the new one.  When
- * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
- * them in place.  Only when trimming from the tail does it actually free
- * them.  Normally we don't get mbuf chains, so this isn't too much of a
- * concern right now.  Use m_collapse() to compact the mbuf chains within
- * the blocks.
- */
-
-#include "opt_inet.h"
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/systm.h>
-#include <sys/vimage.h>
-
-#include <vm/uma.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-
-static VNET_DEFINE(int, tcp_reass_maxseg);
-VNET_DEFINE(int, tcp_reass_qsize);
-static VNET_DEFINE(int, tcp_reass_maxqlen);
-static VNET_DEFINE(int, tcp_reass_overflows);
-
-#define	V_tcp_reass_maxseg		VNET_GET(tcp_reass_maxseg)
-#define	V_tcp_reass_maxqlen		VNET_GET(tcp_reass_maxqlen)
-#define	V_tcp_reass_overflows		VNET_GET(tcp_reass_overflows)
-
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
-    "TCP Segment Reassembly Queue");
-
-static int tcp_reass_enabled = 1;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_RW,
-    &tcp_reass_enabled, 0,
-    "Enable/disable use of TCP Reassembly Queue");
-
-static int tcp_reass_maxblocks = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
-    &tcp_reass_maxblocks, 0,
-    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
-
-static int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
-    &tcp_reass_qsize, 0,
-    "Global number of TCP Segment Blocks currently in Reassembly Queue");
-
-static int tcp_reass_qtimo = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
-    &tcp_reass_qtimo, 0,
-    "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
-
-static int tcp_reass_spacetime = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
-    &tcp_reass_spacetime, 0,
-    "Reassembly Queue strategy of space vs. time efficiency");
-
-static void	tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
-
-uma_zone_t	tcp_reass_zone;
-
-static __inline void
-sack_track(struct tcpcb *tp, struct trq *tqe)
-{
-
-	if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
-		LIST_REMOVE(tqe, trq_s);
-		LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
-	}
-}
-
-/* Trim empty mbufs from head of chain. */
-static struct mbuf *
-m_trimhead(struct mbuf *m)
-{
-	struct mbuf *n;
-
-	while (m != NULL && m->m_len == 0) {
-		n = m;
-		m = m->m_next;
-		m_free(n);
-	}
-	return (m);
-}
-
-static u_int
-m_storagesize(struct mbuf *m)
-{
-	u_int mcnt;
-
-	for (mcnt = 0; m != NULL; m = m->m_next)
-		mcnt += (m->m_flags & M_EXT) ?
-		    m->m_ext.ext_size + MSIZE : MSIZE;
-	return (mcnt);
-}
-
-/*
- * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
- */
-static void
-tcp_reass_zone_change(void *tag)
-{
-
-	tcp_reass_maxblocks = nmbclusters / 16;
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-}
-
-VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-
-/*
- * Initialize TCP reassembly zone on startup.
- */
-void
-tcp_reass_init(void)
-{
-
-	/* XXX: nmbclusters may be zero. */
-	tcp_reass_maxblocks = nmbclusters / 16;
-	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
-	    &tcp_reass_maxblocks);
-	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-	EVENTHANDLER_REGISTER(nmbclusters_change,
-	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
-}
-
-/*
- * Insert segments into the reassembly queue.
- *
- * NB: We must always consume the mbuf, either by appending it to
- * the queue or by freeing it.
- */
-int
-tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
-{
-	struct trq *tqe, *tqen;
-	struct socket *so = tp->t_inpcb->inp_socket;
-	struct mbuf *n;
-	int i, thflags = 0, mcnt;
-	tcp_seq th_seq;
-	struct trq tqes;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-
-	/*
-	 * Call with th==NULL after becoming established to
-	 * force pre-ESTABLISHED data up to the user socket.
-	 * XXX: Was used for T/TCP, remnants of which remain.
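-	 *
-	 * Such a call looks like this (a sketch; NULL is fine for the
-	 * unused arguments since this path never touches them):
-	 *
-	 *	(void) tcp_reass(tp, NULL, NULL, NULL);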
-	 */
-	if (th == NULL) {
-		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
-		    TAILQ_EMPTY(&tp->t_trq) ||
-		    ((tqe = TAILQ_FIRST(&tp->t_trq)) &&
-		     tqe->trq_seq != tp->rcv_nxt))
-			return (0);
-		goto present;
-	}
-
-	/*
-	 * Store TCP header information in local variables as
-	 * we may lose access to it after mbuf compacting.
-	 */
-	thflags = th->th_flags;
-	th_seq = th->th_seq;
-	th = NULL;		/* Prevent further use. */
-
-	/* Check if it is really necessary to do all the work. */
-	if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) {
-		*tlenp = 0;
-		m_freem(m);
-		return (0);
-	}
-
-	KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq),
-	    ("%s: sequence number below rcv_nxt", __func__));
-	KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)),
-	    ("%s: got missing segment but queue is empty", __func__));
-
-#ifdef INVARIANTS
-	i = 0;
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: overlapping blocks", __func__));
-		i++;
-	}
-	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
-		i--;
-	}
-	KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
-	    "SACK list are not equally long", __func__));
-#endif
-
-	/*
-	 * Limit the number of segments in the reassembly queue to prevent
-	 * holding on to too many segments (and thus running out of mbufs).
-	 * Make sure to let through the missing segment that caused this
-	 * queue to form.
-	 *
-	 * Count the gross space used by the mbufs in the reassembly queue
-	 * and limit it to the free space in the socket buffer.  This way
-	 * the reassembly queue can never consume more mbuf space than the
-	 * socket buffer is allowed to, and it reflects the actual
-	 * amount of kernel memory used.  This effectively prevents mbuf
-	 * exhaustion due to pathological traffic (one byte segments with
-	 * a hole each time) on a single connection.
-	 *
-	 * Counting the gross mbuf space effectively sets the net data
-	 * limit lower than the socket buffer would allow.
-	 * To avoid underestimating the effective free space in the socket
-	 * buffer vs. actual real data with 2k clusters and 1500 byte
-	 * packets, a correction factor of 11/8 is applied.
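-	 *
-	 * Worked example (illustrative only, assuming MSIZE is 256):
-	 * with sbspace() reporting 65536 bytes free the queue may hold
-	 * up to 65536 / 8 * 11 = 90112 bytes of gross mbuf storage.  A
-	 * 1460 byte payload in a 2k cluster plus its mbuf header takes
-	 * 2048 + 256 = 2304 bytes gross, so roughly 39 such segments
-	 * fit, carrying about 57k of net data.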
-	 */
-	if (th_seq != tp->rcv_nxt &&
-	    tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) {
-		TCPSTAT_INC(tcps_reass_overflow);
-		TCPSTAT_INC(tcps_rcvmemdrop);
-		m_freem(m);
-		*tlenp = 0;
-		return (0);
-	}
-
-	/* Get rid of packet header and mtags. */
-	m_demote(m, 1);
-
-	/* Trim empty mbufs from head of chain. */
-	m = m_trimhead(m);
-
-	/* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
-	mcnt = m_storagesize(m);
-
-	/*
-	 * FIN handling is a bit tricky.
-	 * We cannot trust a FIN that goes into the reassembly queue.
-	 * It can be easily spoofed as it may be anywhere in the receive
-	 * window (see RST attack mitigation in tcp-secure).
-	 * For this reason (and complexity avoidance) we generally ignore
-	 * any FIN arriving at the reassembly queue, with one exception:
-	 * when it exactly matches rcv_nxt together with any data in the
-	 * same segment, we can conclude it to be genuine and proceed with
-	 * flushing any other data waiting in the reassembly queue.
-	 * A FIN is part of the sequence space and will get retransmitted
-	 * if it was genuine.
-	 * This approach is based on a discussion on TCPM mailing list.
-	 */
-	if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
-		tcp_reass_qfree(tp);
-		tqe = NULL;
-		if (m == NULL) {	/* Pure FIN, no data left. */
-			tcp_timer_activate(tp, TT_REASS, 0);
-			return (thflags);
-		}
-		goto insert;
-	} else
-		thflags &= ~TH_FIN;
-
-	/* Check if this is the first segment. */
-	if (TAILQ_EMPTY(&tp->t_trq))
-		goto insert;
-
-	/* Starting point for the following tests. */
-	tqe = TAILQ_LAST(&tp->t_trq, trq_head);
-
-	/* Check if this segment directly attaches to the end. */
-	if (tqe->trq_seq + tqe->trq_len == th_seq) {
-		tqe->trq_len += *tlenp;
-		tqe->trq_mcnt += mcnt;
-		tp->t_trqmcnt += mcnt;
-		tqe->trq_ml->m_next = m;
-		tqe->trq_ml = m_last(m);
-		if (tcp_reass_spacetime) {
-			tqe->trq_m = m_collapse(tqe->trq_m, M_DONTWAIT, 1024);
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			tqe->trq_mcnt = m_storagesize(tqe->trq_m);
-			tp->t_trqmcnt += tqe->trq_mcnt;
-		}
-		sack_track(tp, tqe);
-		/* TCP statistics. */
-		TCPSTAT_INC(tcps_rcvoopack);
-		TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-		TCPSTAT_INC(tcps_reass_tail);
-		return (0);
-	}
-
-	/* Check if beyond last block. */
-	if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		goto insert;
-
-	/* Check if this is the missing segment. */
-	if (tp->rcv_nxt == th_seq) {
-		tqe = TAILQ_FIRST(&tp->t_trq);
-		KASSERT(SEQ_GT(tqe->trq_seq, th_seq),
-		    ("%s: first block starts below missing segment", __func__));
-		/* Check if segment prepends first block. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp)) {
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			goto present;
-		}
-		goto insert;	/* No statistics, this segment is in line. */
-	}
-
-	/* TCP statistics. */
-	TCPSTAT_INC(tcps_rcvoopack);
-	TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-
-	/* See where it fits. */
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		/* Segment is after this block's coverage. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-			continue;
-		/* Segment is after the previous one but before this one. */
-		if (SEQ_GT(tqe->trq_seq, th_seq + *tlenp))
-			break;		/* Insert as new block. */
-
-		/* Segment is already fully covered. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			TCPSTAT_INC(tcps_rcvduppack);
-			TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
-			TCPSTAT_INC(tcps_reass_covered);
-			/*
-			 * XXXAO: What to SACK report when duplicate?
-			 * See RFC2883: D-SACK (Duplicate SACK)
-			 */
-			sack_track(tp, tqe);
-			m_freem(m);
-			*tlenp = 0;
-			return (0);
-		}
-
-		/* Segment covers and extends on both ends. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			/* Replace block content. */
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			m_freem(tqe->trq_m);
-			tqe->trq_len = *tlenp;
-			tqe->trq_mcnt = mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			tqe->trq_m = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges next block to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_replace);
-			return (0);
-		}
-
-		/* Segment prepends to this block. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			KASSERT(!(thflags & TH_FIN),
-			    ("%s: new segment with FIN can't prepend", __func__));
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_prepend);
-			return (0);
-		}
-
-		/* Segment appends to this block. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq)) {
-			/* Trim head of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, th_seq))) {
-				m_adj(m, i);
-				*tlenp -= i;
-				/* TCP Statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_ml->m_next = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges two blocks to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_append);
-			return (0);
-		}
-	}
-
-insert:
-	/* Prepare to insert into block queue. */
-	if (tp->rcv_nxt == th_seq) {
-		/*
-		 * Use temporary struct trq on the stack for missing
-		 * segment to prevent blocking of all reassembly queues
-		 * due to zone exhaustion.
-		 */
-		tqen = &tqes;
-	} else {
-		tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO));
-		if (tqen == NULL) {
-			TCPSTAT_INC(tcps_rcvmemdrop);
-			m_freem(m);
-			*tlenp = 0;
-			return (0);
-		}
-		TCPSTAT_INC(tcps_reass_blocks);
-	}
-	tcp_reass_qsize++;
-	if (tcp_reass_spacetime) {
-		m = m_collapse(m, M_DONTWAIT, 1024);
-		mcnt = m_storagesize(m);
-	}
-	tqen->trq_seq = th_seq;
-	tqen->trq_len = *tlenp;
-	tqen->trq_mcnt = mcnt;
-	tp->t_trqmcnt += mcnt;
-	tqen->trq_m = m;
-	tqen->trq_ml = m_last(m);
-
-	/* Where to insert. */
-	if (tqe != NULL && SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		TAILQ_INSERT_AFTER(&tp->t_trq, tqe, tqen, trq_q);
-	else if (tqe != NULL)
-		TAILQ_INSERT_BEFORE(tqe, tqen, trq_q);
-	else {
-		KASSERT(TAILQ_EMPTY(&tp->t_trq),
-		    ("%s: queue not empty for first element", __func__));
-		TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q);
-		/*
-		 * Flush the reassembly queue after x times the
-		 * current retransmit interval measured from the
-		 * arrival time of the first segment.
-		 */
-		if (tcp_reass_qtimo)
-			tcp_timer_activate(tp, TT_REASS,
-			    tp->t_rxtcur * tcp_reass_qtimo);
-	}
-	LIST_INSERT_HEAD(&tp->t_trq_sack, tqen, trq_s);
-
-	/* Missing segment? */
-	if (tp->rcv_nxt != th_seq)
-		return (0);
-present:
-	/*
-	 * Present data to user, advancing rcv_nxt through the
-	 * completed sequence space.
-	 */
-	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
-	    ("%s: queue empty at present", __func__));
-	KASSERT((TAILQ_FIRST(&tp->t_trq))->trq_seq == tp->rcv_nxt,
-	    ("%s: first block does not match rcv_nxt", __func__));
-	TCPSTAT_INC(tcps_reass_missingseg);
-
-	SOCKBUF_LOCK(&so->so_rcv);
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: block overlaps into next one", __func__));
-
-		if (tqe->trq_seq != tp->rcv_nxt)
-			break;
-		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-			m_freem(tqe->trq_m);
-		else
-			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
-		tp->rcv_nxt += tqe->trq_len;
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		LIST_REMOVE(tqe, trq_s);
-		if (tqe != &tqes)
-			uma_zfree(tcp_reass_zone, tqe);
-		tcp_reass_qsize--;
-	}
-	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
-	sorwakeup_locked(so);
-
-	/*
-	 * Restart the reassembly queue flush timer after advancing
-	 * the sequence space and if queue is not empty.  Otherwise
-	 * deactivate it.
-	 */
-	if (tcp_reass_qtimo && !TAILQ_EMPTY(&tp->t_trq))
-		tcp_timer_activate(tp, TT_REASS,
-		    tp->t_rxtcur * tcp_reass_qtimo);
-	else
-		tcp_timer_activate(tp, TT_REASS, 0);
-
-	ND6_HINT(tp);
-	return (thflags);
-}
-
-/*
- * Merge one or more consecutive blocks together.
- */
-static void
-tcp_reass_merge(struct tcpcb *tp, struct trq *tqe, struct trq *tqen)
-{
-	int i;
-
-	KASSERT(tqe != NULL && tqen != NULL,
-	    ("%s: incomplete input", __func__));
-	KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-	    ("%s: blocks do not overlap, nothing to merge", __func__));
-
-	/* Appended block may reach beyond next block. */
-	while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq + tqen->trq_len)) {
-		/* TCP Statistics. */
-		TCPSTAT_ADD(tcps_rcvpartdupbyte, tqen->trq_len);
-		TCPSTAT_INC(tcps_reass_covered);
-		tp->t_trqmcnt -= tqen->trq_mcnt;
-		m_freem(tqen->trq_m);
-		TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
-		LIST_REMOVE(tqen, trq_s);
-		uma_zfree(tcp_reass_zone, tqen);
-		tcp_reass_qsize--;
-		/* And the one after that. */
-		if ((tqen = TAILQ_NEXT(tqe, trq_q)) == NULL)
-			return;
-	}
-
-	/* Trim head of next block. */
-	if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))) {
-		m_adj(tqen->trq_m, i);
-		tqen->trq_len -= i;
-		TCPSTAT_ADD(tcps_rcvpartdupbyte, i);		/* Statistics */
-		/* Dispose of empty mbufs. */
-		if (tcp_reass_spacetime) {
-			tqen->trq_m = m_trimhead(tqen->trq_m);
-			tp->t_trqmcnt -= tqen->trq_mcnt;
-			tqen->trq_mcnt = m_storagesize(tqen->trq_m);
-			tp->t_trqmcnt += tqen->trq_mcnt;
-		}
-		KASSERT(tqen->trq_m != NULL,
-		    ("%s: no remaining mbufs in block", __func__));
-	}
-
-	/* Merge blocks together. */
-	tqe->trq_len += tqen->trq_len;
-	tqe->trq_mcnt += tqen->trq_mcnt;
-	tqe->trq_ml->m_next = tqen->trq_m;
-	tqe->trq_ml = tqen->trq_ml;
-	TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
-	LIST_REMOVE(tqen, trq_s);
-	uma_zfree(tcp_reass_zone, tqen);
-	tcp_reass_qsize--;
-	TCPSTAT_INC(tcps_reass_merge);
-}
-
-/*
- * Put the sequence numbers of the reassembly queue blocks into
- * the SACK option of an outgoing segment.
- */
-int
-tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
-{
-	struct trq *tqe;
-	tcp_seq sack_seq;
-	int nsacks = 0;
-
-	KASSERT(numsacks > 0,
-	    ("%s: zero sack blocks to add", __func__));
-	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
-	    ("%s: reassembly queue empty", __func__));
-	KASSERT(!LIST_EMPTY(&tp->t_trq_sack),
-	    ("%s: sack list empty", __func__));
-
-	/*
-	 * The most recent block must appear first (RFC 2018, Section 4).
-	 * Add the other blocks in order of most recent creation or update.
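-	 *
-	 * Each block consumes 8 bytes in the option: its left and right
-	 * edge as two 32-bit sequence numbers in network byte order.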
-	 */
-	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
-		if (numsacks < 1)
-			break;
-		sack_seq = htonl(tqe->trq_seq);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		sack_seq = htonl(tqe->trq_seq + tqe->trq_len);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		numsacks--;
-		nsacks++;
-	}
-
-	return (nsacks);
-}
-
-/*
- * Free the reassembly queue on tcpcb disposal or on general memory shortage.
- */
-void
-tcp_reass_qfree(struct tcpcb *tp)
-{
-	struct trq *tqe, *tqen;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		m_freem(tqe->trq_m);
-		KASSERT(tp->t_trqmcnt >= tqe->trq_mcnt,
-		    ("%s: t_trqmcnt incorrect", __func__));
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		LIST_REMOVE(tqe, trq_s);
-		uma_zfree(tcp_reass_zone, tqe);
-		tcp_reass_qsize--;
-	}
-	tcp_timer_activate(tp, TT_REASS, 0);
-}
+/*-
+ * Copyright (c) 2007
+ *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
+
+/*
+ * Operational overview of TCP reassembly:
+ *
+ * The purpose of TCP reassembly is to store segments that are received
+ * out of order.  This happens when packets are lost along the way for
+ * various reasons.  The most common one is traffic overload, which causes
+ * routers to stop accepting packets for brief moments.
+ *
+ * Upon arrival of the missing segment(s) the whole chain of stored segments
+ * is moved into the socket buffer.  In case of multiple missing segments
+ * only the first consecutive part is moved, with the remainder being kept
+ * until the next missing segment arrives.
+ *
+ * While in reassembly mode *all* arriving segments are put into the
+ * reassembly queue.
+ *
+ * Instead of storing each segment on its own we build blocks of consecutive
+ * segments chained together.  We use a tailq because a new segment is most
+ * likely to fit at the tail of the chain.  Failing that, it is next most
+ * likely to be the missing segment that fits at the beginning of the
+ * chain.  Otherwise we cycle through each consecutive block until a match
+ * is found.  If a segment matches the end of one block and the start of the
+ * next block the two blocks are joined together.  If no match is found a
+ * new block is created.
+ *
+ * This scheme is very efficient and copes well with long chains
+ * and many holes.
+ *
+ * trq_tail ----------------------------------------------\
+ * trq_head --> [block] ------>	[block] ------>	[block] <-/
+ *		m_next		m_next		m_next
+ *		   |		   |		   |
+ *		m_next		m_next		m_next
+ *		   |		   |		   |
+ *		m_next		m_next		m_next
+ *
+ *
+ * The reassembly queue's block structure is also used to track SACK
+ * information as a data receiver.  A doubly-linked list is added
+ * that links the blocks in the reverse order of their arrival or
+ * update.  This makes us fully compliant with RFC 2018 Section 4,
+ * including all optional parts marked as "SHOULD".
+ *
+ * TODO:
+ * A further improvement is to merge the content of mbufs together if the
+ * preceding one has enough space to hold the data of the new one.  When
+ * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
+ * them in place.  Only when trimming from the tail does it actually free
+ * them.  Normally we don't get mbuf chains, so this isn't too much of a
+ * concern right now.  Use m_collapse() to compact the mbuf chains within
+ * the blocks.
+ */
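+
+/*
+ * Illustrative sketch of the block structure operated on below, as
+ * inferred from its use in this file; the authoritative definition
+ * lives in the tcpcb headers and may differ:
+ *
+ *	struct trq {
+ *		TAILQ_ENTRY(trq) trq_q;		SEQ# ordered block list
+ *		LIST_ENTRY(trq)	 trq_s;		arrival ordered SACK list
+ *		tcp_seq		 trq_seq;	start sequence number
+ *		int		 trq_len;	net data length of block
+ *		int		 trq_mcnt;	gross mbuf storage of block
+ *		struct mbuf	*trq_m;		first mbuf in chain
+ *		struct mbuf	*trq_ml;	last mbuf in chain
+ *	};
+ */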
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/vimage.h>
+
+#include <vm/uma.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+
+static VNET_DEFINE(int, tcp_reass_maxseg);
+VNET_DEFINE(int, tcp_reass_qsize);
+static VNET_DEFINE(int, tcp_reass_maxqlen);
+static VNET_DEFINE(int, tcp_reass_overflows);
+
+VNET_DEFINE(uma_zone_t, tcp_reass_zone);
+
+#define	V_tcp_reass_maxseg		VNET_GET(tcp_reass_maxseg)
+#define	V_tcp_reass_maxqlen		VNET_GET(tcp_reass_maxqlen)
+#define	V_tcp_reass_overflows		VNET_GET(tcp_reass_overflows)
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+    "TCP Segment Reassembly Queue");
+
+static int tcp_reass_enabled = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_RW,
+    &tcp_reass_enabled, 0,
+    "Enable/disable use of TCP Reassembly Queue");
+
+static int tcp_reass_maxblocks = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
+    &tcp_reass_maxblocks, 0,
+    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
+
+static int tcp_reass_qsize = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
+    &tcp_reass_qsize, 0,
+    "Global number of TCP Segment Blocks currently in Reassembly Queue");
+
+static int tcp_reass_qtimo = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
+    &tcp_reass_qtimo, 0,
+    "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
+
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+    &tcp_reass_spacetime, 0,
+    "Reassembly Queue strategy of space vs. time efficiency");
+
+static void	tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
+
+static __inline void
+sack_track(struct tcpcb *tp, struct trq *tqe)
+{
+
+	if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
+		LIST_REMOVE(tqe, trq_s);
+		LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
+	}
+}
+
+/* Trim empty mbufs from head of chain. */
+static struct mbuf *
+m_trimhead(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	while (m != NULL && m->m_len == 0) {
+		n = m;
+		m = m->m_next;
+		m_free(n);
+	}
+	return (m);
+}
+
+static u_int
+m_storagesize(struct mbuf *m)
+{
+	u_int mcnt;
+
+	for (mcnt = 0; m != NULL; m = m->m_next)
+		mcnt += (m->m_flags & M_EXT) ?
+		    m->m_ext.ext_size + MSIZE : MSIZE;
+	return (mcnt);
+}
+
+/*
+ * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
+ */
+static void
+tcp_reass_zone_change(void *tag)
+{
+
+	tcp_reass_maxblocks = nmbclusters / 16;
+	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
+}
+
+#ifdef INVARIANTS
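+/*
+ * Verify the integrity of the reassembly queue: the SEQ# ordered
+ * tailq and the arrival ordered SACK list must cover the same set
+ * of blocks.
+ */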
+static int
+tcp_reass_verify(struct tcpcb *tp)
+{
+	struct trq *tqe, *tqen;
+	int i = 0;
+
+	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
+		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
+		    ("%s: trq_seq < rcv_nxt", __func__));
+		KASSERT(tqen == NULL ||
+		    SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
+		    ("%s: overlapping blocks", __func__));
+		i++;
+	}
+	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
+		i--;
+	}
+	KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
+	    "SACK list are not equally long", __func__));
+	return (0);
+}
+#endif
+
+/*
+ * Initialize TCP reassembly zone on startup.
+ */
+void
+tcp_reass_init(void)
+{
+
+	/* XXX: nmbclusters may be zero. */
+	tcp_reass_maxblocks = nmbclusters / 16;
+	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
+	    &tcp_reass_maxblocks);
+	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
+	EVENTHANDLER_REGISTER(nmbclusters_change,
+	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Insert segments into the reassembly queue.
+ *
+ * NB: We must always consume the mbuf, either by appending it to
+ * the queue or by freeing it.
+ */
+int
+tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
+{
+	struct trq *tqe, *tqen;
+	struct socket *so = tp->t_inpcb->inp_socket;
+	struct mbuf *n;
+	int i, thflags = 0, mcnt;
+	tcp_seq th_seq;
+	struct trq tqes;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	/*

>>> TRUNCATED FOR MAIL (1000 lines) <<<

