PERFORCE change 166159 for review
Andre Oppermann
andre at FreeBSD.org
Thu Jul 16 08:49:25 UTC 2009
http://perforce.freebsd.org/chv.cgi?CH=166159
Change 166159 by andre at andre_t61 on 2009/07/16 08:49:06
Move queue integrity test to its own function.
Affected files ...
.. //depot/projects/tcp_reass/netinet/tcp_reass.c#30 edit
Differences ...
==== //depot/projects/tcp_reass/netinet/tcp_reass.c#30 (text+ko) ====
@@ -1,731 +1,740 @@
-/*-
- * Copyright (c) 2007
- * Andre Oppermann, Internet Business Solutions AG. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
-
-/*
- * Operational overview of TCP reassembly:
- *
- * It is the purpose of tcp reassembly to store segments that are received
- * out of order. This happens when packets are lost along the way due to
- * various reasons. The most common one is traffic overload which causes
- * routers to stop accepting packets for brief moments.
- *
- * Upon arrival of the missing segment(s) the whole chain of stored segments
- * is moved into the socket buffer. In case of multiple missing segments
- * the first consecutive part is moved with the remainder being kept in
- * store until the next missing segment arrives.
- *
- * While in reassembly mode *all* arriving segments are put into the reassembly
- * queue.
- *
- * Instead of storing all segments on their own we build blocks of consecutive
- * segments chained together. We use a tailq because a new segment has the
- * highest probability to fit the tail of the chain. If not, the second
- * highest probability is the beginning of the chain for being the missing
- * segment. Otherwise we cycle through each consecutive block until a match
- * is found. If a segment matches the end of one block and the start of the
- * next block the two blocks are joined together. If no match is found a
- * new block is created.
- *
- * This system is very efficient and can deal efficiently with long chains
- * and many holes.
- *
- * trq_tail ----------------------------------------------\
- * trq_head --> [block] ------> [block] ------> [block] <-/
- * m_next m_next m_next
- * | | |
- * m_next m_next m_next
- * | | |
- * m_next m_next m_next
- *
- *
- * The reassembly queues block structure is also used to track SACK
- * information as a data receiver. A double-linked list is added
- * that links the blocks in the reverse order of their arrival or updating.
- * This makes us fully compliant to RFC2018 Section 4 including all
- * optional parts marked as "SHOULD".
- *
- * TODO:
- * A further improvement is to merge the content of mbufs together if the
- * preceding one has enough space to hold the data of the new one. When
- * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
- * them in place. Only when trimming from the tail it actually frees them.
- * Normally we don't get mbuf chains so this isn't too much of a concern
- * right now. Use m_collapse() to compact the mbuf chains within the
- * blocks.
- */
-
-#include "opt_inet.h"
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/systm.h>
-#include <sys/vimage.h>
-
-#include <vm/uma.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-
-static VNET_DEFINE(int, tcp_reass_maxseg);
-VNET_DEFINE(int, tcp_reass_qsize);
-static VNET_DEFINE(int, tcp_reass_maxqlen);
-static VNET_DEFINE(int, tcp_reass_overflows);
-
-#define V_tcp_reass_maxseg VNET_GET(tcp_reass_maxseg)
-#define V_tcp_reass_maxqlen VNET_GET(tcp_reass_maxqlen)
-#define V_tcp_reass_overflows VNET_GET(tcp_reass_overflows)
-
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
- "TCP Segment Reassembly Queue");
-
-static int tcp_reass_enabled = 1;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR,
- &tcp_reass_enabled, 0,
- "Enable/disable use of TCP Reassembly Queue");
-
-static int tcp_reass_maxblocks = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
- &tcp_reass_maxblocks, 0,
- "Global maximum number of TCP Segment Blocks in Reassembly Queue");
-
-static int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
- &tcp_reass_qsize, 0,
- "Global number of TCP Segment Blocks currently in Reassembly Queue");
-
-static int tcp_reass_qtimo = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
- &tcp_reass_qtimo, 0,
- "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
-
-static int tcp_reass_spacetime = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
- &tcp_reass_spacetime, 0,
- "Reassembly Queue strategy of space vs. time efficiency");
-
-static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
-
-uma_zone_t tcp_reass_zone;
-
-static __inline void
-sack_track(struct trq *tqe) {
- if (LIST_FIRST(&tp->t_trq_sack) != (tqe)) {
- LIST_REMOVE((tqe), trq_s);
- LIST_INSERT_HEAD(&tp->t_trq_sack, (tqe), trq_s);
- }
-}
-
-/* Trim empty mbufs from head of chain. */
-static struct mbuf *
-m_trimhead(struct mbuf *m) {
- struct mbuf *n;
- while (m->m_len == 0) {
- n = m;
- m = m->m_next;
- m_free(n);
- }
- return (m);
-}
-
-static u_int
-m_storagesize(m) {
- u_int mcnt;
- for (mcnt = 0, m; n; m = m->m_next)
- mcnt += (m->m_flags & M_EXT) ?
- m->m_ext.ext_size + MSIZE : MSIZE;
- return (mcnt);
-}
-
-/*
- * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
- */
-static void
-tcp_reass_zone_change(void *tag)
-{
-
- tcp_reass_maxblocks = nmbclusters / 16;
- uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-}
-
-VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-
-/*
- * Initialize TCP reassembly zone on startup.
- */
-void
-tcp_reass_init(void)
-{
-
- /* XXX: nmbclusters may be zero. */
- tcp_reass_maxblocks = nmbclusters / 16;
- TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
- &tcp_reass_maxblocks);
- tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
- uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
- EVENTHANDLER_REGISTER(nmbclusters_change,
- tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
-}
-
-/*
- * Insert segments into the reassembly queue.
- *
- * NB: We must always consume the mbuf. Either by appending it to
- * the queue or by freeing it.
- */
-int
-tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
-{
- struct trq *tqe, *tqen;
- struct socket *so = tp->t_inpcb->inp_socket;
- struct mbuf *n;
- int i, thflags = 0, mcnt;
- tcp_seq th_seq;
- struct trq tqes;
-
- INP_WLOCK_ASSERT(tp->t_inpcb);
-
- /*
- * Call with th==NULL after becoming established to
- * force pre-ESTABLISHED data up to user socket.
- * XXX: Was used for T/TCP of which code remains.
- */
- if (th == NULL) {
- if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
- TAILQ_EMPTY(&tp->t_trq) ||
- ((tqe = TAILQ_FIRST(&tp->t_trq)) &&
- tqe->trq_seq != tp->rcv_nxt))
- return (0);
- goto present;
- }
-
- /*
- * Store TCP header information in local variables as
- * we may lose access to it after mbuf compacting.
- */
- thflags = th->th_flags;
- th_seq = th->th_seq;
- th = NULL; /* Prevent further use. */
-
- /* Check if it is really necessary to do all the work. */
- if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) {
- *tlenp = 0;
- m_freem(m);
- return (0);
- }
-
- KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq),
- ("%s: sequence number below rcv_nxt", __func__));
- KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)),
- ("%s: got missing segment but queue is empty", __func__));
-
-#ifdef INVARIANTS
- i = 0;
- TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
- KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
- ("%s: trq_seq < rcv_nxt", __func__));
- KASSERT(tqen == NULL ||
- SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
- ("%s: overlapping blocks", __func__));
- i++;
- }
- LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
- i--;
- }
- KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
- "SACK list are not equally long", __func__));
-#endif
-
- /*
- * Limit the number of segments in the reassembly queue to prevent
- * holding on to too many segments (and thus running out of mbufs).
- * Make sure to let the missing segment through which caused this
- * queue.
- *
- * Count the gross space used by the mbufs in the reassembly queue
- * and limit it to the free space in the socket buffer. This way
- * the reassembly queue can never consume more mbuf space than the
- * socket buffer got allocated anyway and it reflects the actual
- * amount of kernel memory used. This effectively prevents mbuf
- * exhaustion due to pathological traffic (one byte segments with
- * a hole each time) on a single connection.
- *
- * Counting the gross mbuf space effectively sets the net data
- * limit lower than the socket buffer would allow.
- * Don't underestimate the effective free space in the socket
- * buffer vs. actual real data with 2k clusters and 1500 byte
- * packets by introducing a correction factor of 11/8th.
- */
- if (th_seq != tp->rcv_nxt &&
- tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) {
- TCPSTAT_INC(tcps_reass_overflow);
- TCPSTAT_INC(tcps_rcvmemdrop);
- m_freem(m);
- *tlenp = 0;
- return (0);
- }
-
- /* Get rid of packet header and mtags. */
- m_demote(m, 1);
-
- /* Trim empty mbufs from head of chain. */
- m = m_trimhead(m);
-
- /* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
- mcnt = m_storagesize(m);
-
- /*
- * FIN handling is a bit tricky.
- * We cannot trust a FIN that goes into the reassembly queue.
- * It can be easily spoofed as it may be anywhere in the receive
- * window (see RST attack mitigation in tcp-secure).
- * For this reason (and complexity avoidance) we generally ignore
- * any FIN arriving at the reassembly queue with one exception;
- * When it exactly matches rcv_nxt together with any data in the
- * same segment we can conclude it to be genuine and proceed with
- * flushing any other data waiting in the reassembly queue.
- * A FIN is part of the sequence space and will get retransmitted
- * if it was genuine.
- * This approach is based on a discussion on TCPM mailing list.
- */
- if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
- tcp_reass_qfree(tp);
- tqe = NULL;
- if (m->m_len == 0) {
- tcp_timer_activate(tp, TT_REASS, 0);
- return (thflags);
- }
- goto insert;
- } else
- thflags &= ~TH_FIN;
-
- /* Check if this is the first segment. */
- if (TAILQ_EMPTY(&tp->t_trq))
- goto insert;
-
- /* Starting point for the following tests. */
- tqe = TAILQ_LAST(&tp->t_trq, trq_head);
-
- /* Check if this segment directly attaches to the end. */
- if (tqe->trq_seq + tqe->trq_len == th_seq) {
- tqe->trq_len += *tlenp;
- tqe->trq_mcnt += mcnt;
- tp->t_trqmcnt += mcnt;
- tqe->trq_ml->m_next = m;
- tqe->trq_ml = m_last(m);
- if (tcp_reass_spacetime) {
- tqe->trq_m = m_collapse(tqe->trq_m, M_DONTWAIT, 1024);
- tp->t_trqmcnt -= tqe->trq_mcnt;
- tqe->trq_mcnt = m_storagesize(tqe->trq_m);
- tqe->trq_mcnt += tp->t_trqmcnt;
- }
- sack_track(tqe);
- /* TCP statistics. */
- TCPSTAT_INC(tcps_rcvoopack);
- TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
- TCPSTAT_INC(tcps_reass_tail);
- return (0);
- }
-
- /* Check if beyond last block. */
- if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
- goto insert;
-
- /* Check if this is the missing segment. */
- if (tp->rcv_nxt == th_seq) {
- tqe = TAILQ_FIRST(&tp->t_trq);
- KASSERT(SEQ_GT(tqe->trq_seq, th_seq),
- ("%s: first block starts below missing segment", __func__));
- /* Check if segment prepends first block. */
- if (SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp)) {
- /* Trim tail of segment. */
- if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
- m_adj(m, -i);
- *tlenp -= i;
- /* TCP statistics. */
- TCPSTAT_INC(tcps_rcvpartduppack);
- TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
- /* Update accounting. */
- mcnt = m_storagesize(m);
- }
- tqe->trq_len += *tlenp;
- tqe->trq_mcnt += mcnt;
- tp->t_trqmcnt += mcnt;
- tqe->trq_seq = th_seq;
- n = m_last(m);
- n->m_next = tqe->trq_m;
- tqe->trq_m = m;
- goto present;
- }
- goto insert; /* No statistics, this segment is in line. */
- }
-
- /* TCP statistics. */
- TCPSTAT_INC(tcps_rcvoopack);
- TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-
- /* See where it fits. */
- TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
- /* Segment is after this blocks coverage. */
- if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
- continue;
- /* Segment is after the previous one but before this one. */
- if (SEQ_GT(tqe->trq_seq, th_seq + *tlenp))
- break; /* Insert as new block. */
-
- /* Segment is already fully covered. */
- if (SEQ_LEQ(tqe->trq_seq, th_seq) &&
- SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
- TCPSTAT_INC(tcps_rcvduppack);
- TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
- TCPSTAT_INC(tcps_reass_covered);
- /*
- * XXXAO: What to SACK report when duplicate?
- * See RFC2883: D-SACK (Duplicate SACK)
- */
- sack_track(tqe);
- m_freem(m);
- *tlenp = 0;
- return (0);
- }
-
- /* Segment covers and extends on both ends. */
- if (SEQ_GT(tqe->trq_seq, th_seq) &&
- SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
- /* Replace block content. */
- tp->t_trqmcnt -= tqe->trq_mcnt;
- m_freem(tqe->trq_m);
- tqe->trq_len = *tlenp;
- tqe->trq_mcnt = mcnt;
- tp->t_trqmcnt += mcnt;
- tqe->trq_seq = th_seq;
- tqe->trq_m = m;
- tqe->trq_ml = m_last(m);
- /* Check if segment bridges next block to merge. */
- if (tqen != NULL &&
- SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
- tcp_reass_merge(tp, tqe, tqen);
- sack_track(tqe);
- TCPSTAT_INC(tcps_reass_replace);
- return (0);
- }
-
- /* Segment prepends to this block. */
- if (SEQ_GT(tqe->trq_seq, th_seq) &&
- SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp) &&
- SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
- KASSERT(!(thflags & TH_FIN),
- ("%s: new segment with FIN can't prepend", __func__));
- /* Trim tail of segment. */
- if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
- m_adj(m, -i);
- *tlenp -= i;
- /* TCP statistics. */
- TCPSTAT_INC(tcps_rcvpartduppack);
- TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
- /* Update accounting. */
- mcnt = m_storagesize(m);
- }
- tqe->trq_len += *tlenp;
- tqe->trq_mcnt += mcnt;
- tp->t_trqmcnt += mcnt;
- tqe->trq_seq = th_seq;
- n = m_last(m);
- n->m_next = tqe->trq_m;
- tqe->trq_m = m;
- sack_track(tqe);
- TCPSTAT_INC(tcps_reass_prepend);
- return (0);
- }
-
- /* Segment appends to this block. */
- if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp) &&
- SEQ_LEQ(tqe->trq_seq, th_seq) &&
- SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq)) {
- /* Trim head of segment. */
- if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, th_seq))) {
- m_adj(m, i);
- *tlenp -= i;
- /* TCP Statistics. */
- TCPSTAT_INC(tcps_rcvpartduppack);
- TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
- }
- tqe->trq_len += *tlenp;
- tqe->trq_mcnt += mcnt;
- tp->t_trqmcnt += mcnt;
- tqe->trq_ml->m_next = m;
- tqe->trq_ml = m_last(m);
- /* Check if segment bridges two blocks to merge. */
- if (tqen != NULL &&
- SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
- tcp_reass_merge(tp, tqe, tqen);
- sack_track(tqe);
- TCPSTAT_INC(tcps_reass_append);
- return (0);
- }
- }
-
-insert:
- /* Prepare to insert into block queue. */
- if (tp->rcv_nxt == th_seq) {
- /*
- * Use temporary struct trq on the stack for missing
- * segment to prevent blocking of all reassembly queues
- * due to zone exhaustion.
- */
- tqen = &tqes;
- } else {
- tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO));
- if (tqen == NULL) {
- TCPSTAT_INC(tcps_rcvmemdrop);
- m_freem(m);
- *tlenp = 0;
- return (0);
- }
- TCPSTAT_INC(tcps_reass_blocks);
- }
- tcp_reass_qsize++;
- if (tcp_reass_spacetime) {
- m = m_collapse();
- mcnt = m_storagesize(m);
- }
- tqen->trq_seq = th_seq;
- tqen->trq_len = *tlenp;
- tqen->trq_mcnt = mcnt;
- tp->t_trqmcnt += mcnt;
- tqen->trq_m = m;
- tqen->trq_ml = m_last(m);
-
- /* Where to insert. */
- if (tqe != NULL && SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
- TAILQ_INSERT_AFTER(&tp->t_trq, tqe, tqen, trq_q);
- else if (tqe != NULL)
- TAILQ_INSERT_BEFORE(tqe, tqen, trq_q);
- else {
- KASSERT(TAILQ_EMPTY(&tp->t_trq),
- ("%s: first element queue not empty", __func__));
- TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q);
- /*
- * Flush the reassembly queue after x times the
- * current retransmit interval measured from the
- * arrival time of the first segment.
- */
- if (tcp_reass_qtimo)
- tcp_timer_activate(tp, TT_REASS,
- tp->t_rxtcur * tcp_reass_qtimo);
- }
- LIST_INSERT_HEAD(&tp->t_trq_sack, tqen, trq_s);
-
- /* Missing segment? */
- if (tp->rcv_nxt != th_seq)
- return (0);
-present:
- /*
- * Present data to user, advancing rcv_nxt through the
- * completed sequence space.
- */
- KASSERT(!TAILQ_EMPTY(&tp->t_trq),
- ("%s: queue empty at present", __func__));
- KASSERT((TAILQ_FIRST(&tp->t_trq))->trq_seq == tp->rcv_nxt,
- ("%s: first block does not match rcv_nxt", __func__));
- TCPSTAT_INC(tcps_reass_missingseg);
-
- SOCKBUF_LOCK(&so->so_rcv);
- TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
- KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
- ("%s: trq_seq < rcv_nxt", __func__));
- KASSERT(tqen == NULL ||
- SEQ_LEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
- ("%s: block overlaps into next one", __func__));
-
- if (tqe->trq_seq != tp->rcv_nxt)
- break;
- if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
- m_freem(tqe->trq_m);
- else
- sbappendstream_locked(&so->so_rcv, tqe->trq_m);
- tp->rcv_nxt += tqe->trq_len;
- tp->t_trqmcnt -= tqe->trq_mcnt;
- TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
- LIST_REMOVE(tqe, trq_s);
- if (tqe != &tqes)
- uma_zfree(tcp_reass_zone, tqe);
- V_tcp_reass_qsize--;
- }
- /* NB: sorwakeup_locked() does a implicit socket buffer unlock. */
- sorwakeup_locked(so);
-
- /*
- * Restart the reassembly queue flush timer after advancing
- * the sequence space and if queue is not empty. Otherwise
- * deactivate it.
- */
- if (tcp_reass_qtimo && !TAILQ_EMPTY(&tp->t_trq))
- tcp_timer_activate(tp, TT_REASS,
- tp->t_rxtcur * tcp_reass_qtimo);
- else
- tcp_timer_activate(tp, TT_REASS, 0);
-
- ND6_HINT(tp);
- return (thflags);
-}
-
-/*
- * Merge one or more consecutive blocks together.
- */
-static void
-tcp_reass_merge(struct tcpcb *tp, struct trq *tqe, struct trq *tqen)
-{
- int i;
-
- KASSERT(tqe != NULL && tqen != NULL,
- ("%s: incomplete input", __func__));
- KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
- ("%s: blocks do not overlap, nothing to merge", __func__));
-
- /* Appended block may reach beyond next block. */
- while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq + tqen->trq_len)) {
- /* TCP Statistics. */
- TCPSTAT_ADD(tcps_rcvpartdupbyte, tqen->trq_len);
- TCPSTAT_INC(tcps_reass_covered);
- tp->t_trqmcnt -= tqe->trq_mcnt;
- m_freem(tqen->trq_m);
- TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
- LIST_REMOVE(tqen, trq_s);
- uma_zfree(tcp_reass_zone, tqen);
- tcp_reass_qsize--;
- /* And the one after that. */
- if ((tqen = TAILQ_NEXT(tqe, trq_q)) == NULL)
- return;
- }
-
- /* Trim head of next block. */
- if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))) {
- m_adj(tqen->trq_m, i);
- tqen->trq_len -= i;
- TCPSTAT_ADD(tcps_rcvpartdupbyte, i); /* Statistics */
- /* Dispose of empty mbufs. */
- if (tcp_reass_spacetime) {
- tqen->trq_m = m_trimhead(tqen->trq_m);
- tqen->trq_mcnt = m_storagesize(tqen->trq_m);
- }
- KASSERT(tqen->trq_m != NULL,
- ("%s: no remaining mbufs in block", __func__));
- }
-
- /* Merge blocks together. */
- tqe->trq_len += tqen->trq_len;
- tqe->trq_mcnt += tqen->trq_mcnt;
- tqe->trq_ml->m_next = tqen->trq_m;
- tqe->trq_ml = tqen->trq_ml;
- TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
- LIST_REMOVE(tqen, trq_s);
- uma_zfree(tcp_reass_zone, tqen);
- tcp_reass_qsize--;
- TCPSTAT_INC(tcps_reass_merge);
-}
-
-/*
- * Put the sequence number of the reassembly queue blocks into
- * the SACK options of an outgoing segment.
- */
-int
-tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
-{
- struct trq *tqe;
- tcp_seq sack_seq;
- int nsacks = 0;
-
- KASSERT(numsacks > 0,
- ("%s: zero sack blocks to add", __func__));
- KASSERT(!TAILQ_EMPTY(&tp->t_trq),
- ("%s: reassembly queue empty", __func__));
- KASSERT(!LIST_EMPTY(&tp->t_trq_sack),
- ("%s: sack list empty", __func__));
-
- /*
- * The most recent block must appear first. RFC2018, Section 4.
- * Add the other blocks in most recent created or updated order.
- */
- LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
- if (numsacks < 1)
- break;
- sack_seq = htonl(tqe->trq_seq);
- bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
- optp += sizeof(sack_seq);
- sack_seq = htonl(tqe->trq_seq + tqe->trq_len);
- bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
- optp += sizeof(sack_seq);
- numsacks--;
- nsacks++;
- }
-
- return (nsacks);
-}
-
-/*
- * Free the reassembly queue on tcpcb disposal or on general memory shortage.
- */
-void
-tcp_reass_qfree(struct tcpcb *tp)
-{
- struct trq *tqe, *tqen;
-
- INP_WLOCK_ASSERT(tp->t_inpcb);
-
- TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
- m_freem(tqe->trq_m);
- KASSERT(tp->t_trqmcnt >= tqe->trq_mcnt,
- ("%s: t_trqmcnt incorrect", __func__));
- tp->t_trqmcnt -= tqe->trq_mcnt;
- TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
- LIST_REMOVE(tqe, trq_s);
- uma_zfree(tcp_reass_zone, tqe);
- tcp_reass_qsize--;
- }
- tcp_timer_activate(tp, TT_REASS, 0);
-}
+/*-
+ * Copyright (c) 2007
+ * Andre Oppermann, Internet Business Solutions AG. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
+
+/*
+ * Operational overview of TCP reassembly:
+ *
+ * It is the purpose of tcp reassembly to store segments that are received
+ * out of order. This happens when packets are lost along the way due to
+ * various reasons. The most common one is traffic overload which causes
+ * routers to stop accepting packets for brief moments.
+ *
+ * Upon arrival of the missing segment(s) the whole chain of stored segments
+ * is moved into the socket buffer. In case of multiple missing segments
+ * the first consecutive part is moved with the remainder being kept in
+ * store until the next missing segment arrives.
+ *
+ * While in reassembly mode *all* arriving segments are put into the reassembly
+ * queue.
+ *
+ * Instead of storing all segments on their own we build blocks of consecutive
+ * segments chained together. We use a tailq because a new segment has the
+ * highest probability to fit the tail of the chain. If not, the second
+ * highest probability is the beginning of the chain for being the missing
+ * segment. Otherwise we cycle through each consecutive block until a match
+ * is found. If a segment matches the end of one block and the start of the
+ * next block the two blocks are joined together. If no match is found a
+ * new block is created.
+ *
+ * This system is very efficient and can deal efficiently with long chains
+ * and many holes.
+ *
+ * trq_tail ----------------------------------------------\
+ * trq_head --> [block] ------> [block] ------> [block] <-/
+ * m_next m_next m_next
+ * | | |
+ * m_next m_next m_next
+ * | | |
+ * m_next m_next m_next
+ *
+ *
+ * The reassembly queues block structure is also used to track SACK
+ * information as a data receiver. A double-linked list is added
+ * that links the blocks in the reverse order of their arrival or updating.
+ * This makes us fully compliant to RFC2018 Section 4 including all
+ * optional parts marked as "SHOULD".
+ *
+ * TODO:
+ * A further improvement is to merge the content of mbufs together if the
+ * preceding one has enough space to hold the data of the new one. When
+ * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
+ * them in place. Only when trimming from the tail it actually frees them.
+ * Normally we don't get mbuf chains so this isn't too much of a concern
+ * right now. Use m_collapse() to compact the mbuf chains within the
+ * blocks.
+ */
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/vimage.h>
+
+#include <vm/uma.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+
+static VNET_DEFINE(int, tcp_reass_maxseg);
+VNET_DEFINE(int, tcp_reass_qsize);
+static VNET_DEFINE(int, tcp_reass_maxqlen);
+static VNET_DEFINE(int, tcp_reass_overflows);
+
+VNET_DEFINE(uma_zone_t, tcp_reass_zone);
+
+#define V_tcp_reass_maxseg VNET_GET(tcp_reass_maxseg)
+#define V_tcp_reass_maxqlen VNET_GET(tcp_reass_maxqlen)
+#define V_tcp_reass_overflows VNET_GET(tcp_reass_overflows)
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+ "TCP Segment Reassembly Queue");
+
+static int tcp_reass_enabled = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR,
+ &tcp_reass_enabled, 0,
+ "Enable/disable use of TCP Reassembly Queue");
+
+static int tcp_reass_maxblocks = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
+ &tcp_reass_maxblocks, 0,
+ "Global maximum number of TCP Segment Blocks in Reassembly Queue");
+
+static int tcp_reass_qsize = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
+ &tcp_reass_qsize, 0,
+ "Global number of TCP Segment Blocks currently in Reassembly Queue");
+
+static int tcp_reass_qtimo = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
+ &tcp_reass_qtimo, 0,
+ "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
+
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+ &tcp_reass_spacetime, 0,
+ "Reassembly Queue strategy of space vs. time efficiency");
+
+static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
+
+static __inline void
+sack_track(struct trq *tqe) {
+ if (LIST_FIRST(&tp->t_trq_sack) != (tqe)) {
+ LIST_REMOVE((tqe), trq_s);
+ LIST_INSERT_HEAD(&tp->t_trq_sack, (tqe), trq_s);
+ }
+}
+
+/* Trim empty mbufs from head of chain. */
+static struct mbuf *
+m_trimhead(struct mbuf *m) {
+ struct mbuf *n;
+
+ while (m->m_len == 0) {
+ n = m;
+ m = m->m_next;
+ m_free(n);
+ }
+ return (m);
+}
+
+static u_int
+m_storagesize(m) {
+ u_int mcnt;
+
+ for (mcnt = 0, m; n; m = m->m_next)
+ mcnt += (m->m_flags & M_EXT) ?
+ m->m_ext.ext_size + MSIZE : MSIZE;
+ return (mcnt);
+}
+
+/*
+ * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
+ */
+static void
+tcp_reass_zone_change(void *tag)
+{
+
+ tcp_reass_maxblocks = nmbclusters / 16;
+ uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
+}
+
+#ifdef INVARIANTS
+static int
+tcp_reass_verify(struct tcpcb *tp)
+{
+ struct trq *tqe, *tqen;
+ int i = 0;
+
+ TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
+ KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
+ ("%s: trq_seq < rcv_nxt", __func__));
+ KASSERT(tqen == NULL ||
+ SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
+ ("%s: overlapping blocks", __func__));
+ i++;
+ }
+ LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
+ i--;
+ }
+ KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
+ "SACK list are not equally long", __func__));
+ return (0);
+}
+#endif
+
+/*
+ * Initialize TCP reassembly zone on startup.
+ */
+void
+tcp_reass_init(void)
+{
+
+ /* XXX: nmbclusters may be zero. */
+ tcp_reass_maxblocks = nmbclusters / 16;
+ TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
+ &tcp_reass_maxblocks);
+ tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
+ EVENTHANDLER_REGISTER(nmbclusters_change,
+ tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Insert segments into the reassembly queue.
+ *
+ * NB: We must always consume the mbuf. Either by appending it to
+ * the queue or by freeing it.
+ */
+int
+tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
+{
+ struct trq *tqe, *tqen;
+ struct socket *so = tp->t_inpcb->inp_socket;
+ struct mbuf *n;
+ int i, thflags = 0, mcnt;
+ tcp_seq th_seq;
+ struct trq tqes;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /*
>>> TRUNCATED FOR MAIL (1000 lines) <<<
More information about the p4-projects
mailing list