PERFORCE change 166205 for review

Andre Oppermann andre at FreeBSD.org
Fri Jul 17 18:22:52 UTC 2009


http://perforce.freebsd.org/chv.cgi?CH=166205

Change 166205 by andre at andre_flirtbox on 2009/07/17 18:22:49

	More fixes.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_input.c#12 edit
.. //depot/projects/tcp_reass/netinet/tcp_output.c#14 edit
.. //depot/projects/tcp_reass/netinet/tcp_reass.c#32 edit
.. //depot/projects/tcp_reass/netinet/tcp_sack.c#8 edit
.. //depot/projects/tcp_reass/netinet/tcp_subr.c#10 edit
.. //depot/projects/tcp_reass/netinet/tcp_usrreq.c#12 edit
.. //depot/projects/tcp_reass/netinet/tcp_var.h#16 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_input.c#12 (text+ko) ====

@@ -1245,7 +1245,7 @@
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
-	    LIST_EMPTY(&tp->t_segq) &&
+	    TAILQ_EMPTY(&tp->t_trq) &&
 	    ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
@@ -1386,10 +1386,6 @@
 				panic("%s: ti_locked %d on pure data "
 				    "segment", __func__, ti_locked);
 			ti_locked = TI_UNLOCKED;
-
-			/* Clean receiver SACK report if present */
-			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
-				tcp_clean_sackreport(tp);
 			TCPSTAT_INC(tcps_preddat);
 			tp->rcv_nxt += tlen;
 			/*
@@ -2560,7 +2556,6 @@
 	 */
 	if ((tlen || (thflags & TH_FIN)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
-		tcp_seq save_start = th->th_seq;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
@@ -2575,7 +2570,7 @@
 		 * fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
-		    LIST_EMPTY(&tp->t_segq) &&
+		    TAILQ_EMPTY(&tp->t_trq) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if (DELAY_ACK(tp))
 				tp->t_flags |= TF_DELACK;
@@ -2600,11 +2595,9 @@
 			 * m_adj() doesn't actually frees any mbufs
 			 * when trimming from the head.
 			 */
-			thflags = tcp_reass(tp, th, &tlen, m);
+			thflags |= tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
-		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
-			tcp_update_sack_list(tp, save_start, save_start + tlen);
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into

==== //depot/projects/tcp_reass/netinet/tcp_output.c#14 (text+ko) ====

@@ -49,6 +49,7 @@
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/vimage.h>
+#include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/route.h>
@@ -141,7 +142,7 @@
 	struct mbuf *m;
 	struct ip *ip = NULL;
 	struct ipovly *ipov = NULL;
-	struct tcphdr *th;
+	struct tcphdr *th = NULL;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 #ifdef IPSEC
@@ -152,6 +153,8 @@
 	struct sackhole *p;
 	int tso = 0;
 	struct tcpopt to;
+	char *s;
+	int ipout = 0;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
@@ -476,7 +479,7 @@
 	if (len > tp->t_maxseg) {
 		if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
 		    ((tp->t_flags & TF_SIGNATURE) == 0) &&
-		    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
+		    TAILQ_EMPTY(&tp->t_trq) && sack_rxmit == 0 &&
 		    tp->t_inpcb->inp_options == NULL &&
 		    tp->t_inpcb->in6p_options == NULL
 #ifdef IPSEC
@@ -683,10 +686,9 @@
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
-			    tp->rcv_numsacks > 0) {
+			    !TAILQ_EMPTY(&tp->t_trq)) {
 				to.to_flags |= TOF_SACK;
-				to.to_nsacks = tp->rcv_numsacks;
-				to.to_sacks = (u_char *)tp->sackblks;
+				to.to_sacks = (u_char *)tp;
 			}
 		}
 #ifdef TCP_SIGNATURE
@@ -1184,6 +1186,8 @@
 	if (V_path_mtu_discovery)
 		ip->ip_off |= IP_DF;
 
+	ipout = 1;
+
 	error = ip_output(m, tp->t_inpcb->inp_options, NULL,
 	    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 	    tp->t_inpcb);
@@ -1217,6 +1221,13 @@
 		}
 out:
 		SOCKBUF_UNLOCK_ASSERT(&so->so_snd);	/* Check gotos. */
+
+		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, NULL, NULL, NULL))) {
+			log(LOG_DEBUG, "%s; %s: error %i while sending (ip_output %i)\n",
+			    s, __func__, error, ipout);
+			free(s, M_TCPLOG);
+		}
+
 		switch (error) {
 		case EPERM:
 			tp->t_softerror = error;
@@ -1410,8 +1421,6 @@
 		case TOF_SACK:
 			{
 			int sackblks = 0;
-			struct sackblk *sack = (struct sackblk *)to->to_sacks;
-			tcp_seq sack_seq;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
@@ -1421,19 +1430,11 @@
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
-			sackblks = min(to->to_nsacks,
-					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
-			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
-			while (sackblks--) {
-				sack_seq = htonl(sack->start);
-				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-				optp += sizeof(sack_seq);
-				sack_seq = htonl(sack->end);
-				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-				optp += sizeof(sack_seq);
-				optlen += TCPOLEN_SACK;
-				sack++;
-			}
+			sackblks = tcp_reass_sack((struct tcpcb *)to->to_sacks,
+			    optp + 1, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
+			*optp++ = TCPOLEN_SACKHDR + (sackblks * TCPOLEN_SACK);
+			optlen += TCPOLEN_SACK * sackblks;
+			optp += sizeof(tcp_seq) * 2 * sackblks;
 			TCPSTAT_INC(tcps_sack_send_blocks);
 			break;
 			}

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#32 (text+ko) ====

@@ -1,740 +1,742 @@
-/*-
- * Copyright (c) 2007
- *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
-
-/*
- * Operational overview of TCP reassembly:
- *
- * It is the purpose of tcp reassembly to store segments that are received
- * out of order.  This happens when packets are lost along the way due to
- * various reasons.  The most common one is traffic overload which causes
- * routers to stop accepting packets for brief moments.
- *
- * Upon arrival of the missing segment(s) the whole chain of stored segments
- * is moved into the socket buffer.  In case of multiple missing segments
- * the first consequtive part is moved with the remainder being kept in
- * store until the next missing segment arrives.
- *
- * While in reassembly mode *all* arrving segments are put into the reassembly
- * queue.
- *
- * Instead of storing all segments on their own we build blocks of consequtive
- * segments chained together.  We use a tailq because a new segments has the
- * highest probability to fit the tail of the chain.  If not, the second
- * highest probability is the beginning of the chain for being the missing
- * segment.  Otherwise we cycle through each consequtive block until a match
- * is found.  If a segment matches the end of one block and the start of the
- * next block the two blocks are joined together.  If no match is found a
- * new block is created.
- *
- * This system is very efficient and can deal efficiently with long chains 
- * and many holes.
- *
- * trq_tail ----------------------------------------------\
- * trq_head --> [block] ------>	[block] ------>	[block] <-/
- *		m_next		m_next		m_next
- *		   |		   |		   |
- *		m_next		m_next		m_next
- *		   |		   |		   |
- *		m_next		m_next		m_next
- *
- *
- * The reassembly queues block structure is also used to track SACK
- * information as a data receiver.  A double-linked list is added
- * that links the blocks the reverse order of their arrival or updating.
- * This makes us fully compliant to RFC2018 Section 4 including all
- * optional parts marked as "SHOULD".
- *
- * TODO:
- * A further improvement is to merge the content of mbufs together if the
- * preceeding one has enough space to hold the data of the new one.  When
- * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
- * them in place.  Only when trimming from the tail it actually frees them.
- * Normally we don't get mbuf chains so this isn't too much of a concern
- * right now.  Use m_collapse() to compact the mbuf chains within the
- * blocks.
- */
-
-#include "opt_inet.h"
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/systm.h>
-#include <sys/vimage.h>
-
-#include <vm/uma.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-
-static VNET_DEFINE(int, tcp_reass_maxseg);
-VNET_DEFINE(int, tcp_reass_qsize);
-static VNET_DEFINE(int, tcp_reass_maxqlen);
-static VNET_DEFINE(int, tcp_reass_overflows);
-
-VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-
-#define	V_tcp_reass_maxseg		VNET_GET(tcp_reass_maxseg)
-#define	V_tcp_reass_maxqlen		VNET_GET(tcp_reass_maxqlen)
-#define	V_tcp_reass_overflows		VNET_GET(tcp_reass_overflows)
-
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
-    "TCP Segment Reassembly Queue");
-
-static int tcp_reass_enabled = 1;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR,
-    &tcp_reass_enabled, 0,
-    "Enable/disable use of TCP Reassembly Queue");
-
-static int tcp_reass_maxblocks = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
-    &tcp_reass_maxblocks, 0,
-    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
-
-static int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
-    &tcp_reass_qsize, 0,
-    "Global number of TCP Segment Blocks currently in Reassembly Queue");
-
-static int tcp_reass_qtimo = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
-    &tcp_reass_qtimo, 0,
-    "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
-
-static int tcp_reass_spacetime = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
-    &tcp_reass_spacetime, 0,
-    "Reassembly Queue strategy of space vs. time efficiency");
-
-static void	tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
-
-static __inline void
-sack_track(struct tcpcb *tp, struct trq *tqe) {
-	if (LIST_FIRST(&tp->t_trq_sack) != (tqe)) {
-		LIST_REMOVE((tqe), trq_s);
-		LIST_INSERT_HEAD(&tp->t_trq_sack, (tqe), trq_s);
-	}
-}
-
-/* Trim empty mbufs from head of chain. */
-static struct mbuf *
-m_trimhead(struct mbuf *m) {
-	struct mbuf *n;
-
-	while (m->m_len == 0) {
-		n = m;
-		m = m->m_next;
-		m_free(n);
-	}
-	return (m);
-}
-
-static u_int
-m_storagesize(m) {
+/*-
+ * Copyright (c) 2007
+ *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
+
+/*
+ * Operational overview of TCP reassembly:
+ *
+ * It is the purpose of tcp reassembly to store segments that are received
+ * out of order.  This happens when packets are lost along the way due to
+ * various reasons.  The most common one is traffic overload which causes
+ * routers to stop accepting packets for brief moments.
+ *
+ * Upon arrival of the missing segment(s) the whole chain of stored segments
+ * is moved into the socket buffer.  In case of multiple missing segments
+ * the first consecutive part is moved with the remainder being kept in
+ * store until the next missing segment arrives.
+ *
+ * While in reassembly mode *all* arriving segments are put into the reassembly
+ * queue.
+ *
+ * Instead of storing all segments on their own we build blocks of consecutive
+ * segments chained together.  We use a tailq because a new segment has the
+ * highest probability to fit the tail of the chain.  If not, the second
+ * highest probability is the beginning of the chain for being the missing
+ * segment.  Otherwise we cycle through each consecutive block until a match
+ * is found.  If a segment matches the end of one block and the start of the
+ * next block the two blocks are joined together.  If no match is found a
+ * new block is created.
+ *
+ * This system is very efficient and can deal with long chains
+ * and many holes.
+ *
+ * trq_tail ----------------------------------------------\
+ * trq_head --> [block] ------>	[block] ------>	[block] <-/
+ *		m_next		m_next		m_next
+ *		   |		   |		   |
+ *		m_next		m_next		m_next
+ *		   |		   |		   |
+ *		m_next		m_next		m_next
+ *
+ *
+ * The reassembly queue's block structure is also used to track SACK
+ * information as a data receiver.  A double-linked list is added
+ * that links the blocks in the reverse order of their arrival or updating.
+ * This makes us fully compliant with RFC 2018 Section 4 including all
+ * optional parts marked as "SHOULD".
+ *
+ * TODO:
+ * A further improvement is to merge the content of mbufs together if the
+ * preceding one has enough space to hold the data of the new one.  When
+ * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
+ * them in place.  Only when trimming from the tail it actually frees them.
+ * Normally we don't get mbuf chains so this isn't too much of a concern
+ * right now.  Use m_collapse() to compact the mbuf chains within the
+ * blocks.
+ */
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/vimage.h>
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_options.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+
+VNET_DEFINE(uma_zone_t, tcp_reass_zone);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+    "TCP Segment Reassembly Queue");
+
+static int tcp_reass_enabled = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR,
+    &tcp_reass_enabled, 0,
+    "Enable/disable use of TCP Reassembly Queue");
+
+static int tcp_reass_maxblocks = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
+    &tcp_reass_maxblocks, 0,
+    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
+
+int tcp_reass_qsize = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
+    &tcp_reass_qsize, 0,
+    "Global number of TCP Segment Blocks currently in Reassembly Queue");
+
+static int tcp_reass_qtimo = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
+    &tcp_reass_qtimo, 0,
+    "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
+
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+    &tcp_reass_spacetime, 0,
+    "Reassembly Queue strategy of space vs. time efficiency");
+
+static void	tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
+
+static __inline void
+sack_track(struct tcpcb *tp, struct trq *tqe)
+{
+
+	if (LIST_FIRST(&tp->t_trq_sack) != (tqe)) {
+		LIST_REMOVE((tqe), trq_s);
+		LIST_INSERT_HEAD(&tp->t_trq_sack, (tqe), trq_s);
+	}
+}
+
+/* Trim empty mbufs from head of chain. */
+static struct mbuf *
+m_trimhead(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	while (m->m_len == 0) {
+		n = m;
+		m = m->m_next;
+		m_free(n);
+	}
+	return (m);
+}
+
+static u_int
+m_storagesize(struct mbuf *m)
+{
 	u_int mcnt;
-
-	for (mcnt = 0; m != NULL; m = m->m_next)
-		mcnt += (m->m_flags & M_EXT) ?
-		    m->m_ext.ext_size + MSIZE : MSIZE;
-	return (mcnt);
-}
-
-/*
- * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
- */
-static void
-tcp_reass_zone_change(void *tag)
-{
-
-	tcp_reass_maxblocks = nmbclusters / 16;
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-}
-
-#ifdef INVARIANTS
-static int
-tcp_reass_verify(struct tcpcb *tp)
-{
-	struct trq *tqe, *tqen;
-	int i = 0;
-
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: overlapping blocks", __func__));
-		i++;
-	}
-	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
-		i--;
-	}
-	KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
-	    "SACK list are not equally long", __func__));
-	return (0);
-}
-#endif
-
-/*
- * Initialize TCP reassembly zone on startup.
- */
-void
-tcp_reass_init(void)
-{
-
-	/* XXX: nmbclusters may be zero. */
-	tcp_reass_maxblocks = nmbclusters / 16;
-	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
-	    &tcp_reass_maxblocks);
-	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-	EVENTHANDLER_REGISTER(nmbclusters_change,
-	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
-}
-
-/*
- * Insert segments into the reassembly queue.
- *
- * NB: We must always consume the mbuf.  Either by appeding it to
- * the queue or by freeing it.
- */
-int
-tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
-{
-	struct trq *tqe, *tqen;
-	struct socket *so = tp->t_inpcb->inp_socket;
-	struct mbuf *n;
-	int i, thflags = 0, mcnt;
-	tcp_seq th_seq;
-	struct trq tqes;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-
-	/*
-	 * Call with th==NULL after becoming established to
-	 * force pre-ESTABLISHED data up to user socket.
-	 * XXX: Was used for T/TCP of which code remains.
-	 */
-	if (th == NULL) {
-		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
-		    TAILQ_EMPTY(&tp->t_trq) ||
-		    ((tqe = TAILQ_FIRST(&tp->t_trq)) &&
-		     tqe->trq_seq != tp->rcv_nxt))
-			return (0);
-		goto present;
-	}
-
-	/*
-	 * Store TCP header information in local variables as
-	 * we may lose access to it after mbuf compacting.
-	 */
-	thflags = th->th_flags;
-	th_seq = th->th_seq;
-	th = NULL;		/* Prevent further use. */
-
-	/* Check if it is really neccessary to do all the work. */
-	if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) {
-		*tlenp = 0;
-		m_freem(m);
-		return (0);
-	}
-
-	KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq),
-	    ("%s: sequence number below rcv_nxt", __func__));
-	KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)),
-	    ("%s: got missing segment but queue is empty", __func__));
-	KASSERT(tcp_reass_verify(tp),
-	    ("%s: reassembly queue inconsistent", __func__));
-
-	/*
-	 * Limit the number of segments in the reassembly queue to prevent
-	 * holding on to too many segments (and thus running out of mbufs).
-	 * Make sure to let the missing segment through which caused this
-	 * queue.
-	 *
-	 * Count the gross space used by the mbufs in the reassembly queue
-	 * and limit it to the free space in the socket buffer.  This way
-	 * the reassembly queue can never consume more mbuf space than the
-	 * socket buffer got allocated anyway and it reflects the actual
-	 * amount of kernel memory used.  This effectively prevents mbuf
-	 * exhaustion due to pathological traffic (one byte segments with
-	 * a hole each time) on a single connection.
-	 *
-	 * Counting the gross mbuf space effectively sets the net data
-	 * limit lower than the socket buffer would allow.
-	 * Don't underestimates the effective free space in the socket
-	 * buffer vs. actual real data with 2k clusters and 1500 byte
-	 * packets by introducing a correction factor of 11/8th.
-	 */
-	if (th_seq != tp->rcv_nxt &&
-	    tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) {
-		TCPSTAT_INC(tcps_reass_overflow);
-		TCPSTAT_INC(tcps_rcvmemdrop);
-		m_freem(m);
-		*tlenp = 0;
-		return (0);
-	}
-
-	/* Get rid of packet header and mtags. */
-	m_demote(m, 1);
-
-	/* Trim empty mbufs from head of chain. */
-	m = m_trimhead(m);
-
-	/* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
-	mcnt = m_storagesize(m);
-
-	/*
-	 * FIN handling is a bit tricky.
-	 * We cannot trust a FIN that goes into the reassembly queue.
-	 * It can be easily spoofed as it may be anywhere in the receive
-	 * window (see RST attack mitigation in tcp-secure).
-	 * For this reason (and complexity avoidance) we generally ignore
-	 * any FIN arriving at the reassembly queue with one exception;
-	 * When it exactly matches rcv_nxt together with any data in the
-	 * same segment we can conclude it to be genuine and proceed with
-	 * flushing any other data waiting in the reassembly queue.
-	 * A FIN is part of the sequence space and will get retransmitted
-	 * if it was genuine.
-	 * This approach is based on a discussion on TCPM mailing list.
-	 */
-	if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
-		tcp_reass_qfree(tp);
-		tqe = NULL;
-		if (m->m_len == 0) {
-			tcp_timer_activate(tp, TT_REASS, 0);
-			return (thflags);
-		}
-		goto insert;
-	} else
-		thflags &= ~TH_FIN;
-
-	/* Check if this is the first segment. */
-	if (TAILQ_EMPTY(&tp->t_trq))
-		goto insert;
-
-	/* Starting point for the following tests. */
-	tqe = TAILQ_LAST(&tp->t_trq, trq_head);
-
-	/* Check if this segment directly attaches to the end. */
-	if (tqe->trq_seq + tqe->trq_len == th_seq) {
-		tqe->trq_len += *tlenp;
-		tqe->trq_mcnt += mcnt;
-		tp->t_trqmcnt += mcnt;
-		tqe->trq_ml->m_next = m;
-		tqe->trq_ml = m_last(m);
-		if (tcp_reass_spacetime) {
-			tqe->trq_m =  m_collapse(tqe->trq_m, M_DONTWAIT, 1024);
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			tqe->trq_mcnt = m_storagesize(tqe->trq_m);
-			tqe->trq_mcnt += tp->t_trqmcnt;
-		}
-		sack_track(tqe);
-		/* TCP statistics. */
-		TCPSTAT_INC(tcps_rcvoopack);
-		TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-		TCPSTAT_INC(tcps_reass_tail);
-		return (0);
-	}
-
-	/* Check if beyond last block. */
-	if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		goto insert;
-
-	/* Check if this is the missing segment. */
-	if (tp->rcv_nxt == th_seq) {
-		tqe = TAILQ_FIRST(&tp->t_trq);
-		KASSERT(SEQ_GT(tqe->trq_seq, th_seq),
-		    ("%s: first block starts below missing segment", __func__));
-		/* Check if segment prepends first block. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp)) {
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			goto present;
-		}
-		goto insert;	/* No statistics, this segment is in line. */
-	}
-
-	/* TCP statistics. */
-	TCPSTAT_INC(tcps_rcvoopack);
-	TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-
-	/* See where it fits. */
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		/* Segment is after this blocks coverage. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-			continue;
-		/* Segment is after the previous one but before this one. */
-		if (SEQ_GT(tqe->trq_seq, th_seq + *tlenp))
-			break;		/* Insert as new block. */
-
-		/* Segment is already fully covered. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			TCPSTAT_INC(tcps_rcvduppack);
-			TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
-			TCPSTAT_INC(tcps_reass_covered);
-			/*
-			 * XXXAO: What to SACK report when duplicate?
-			 * See RFC2883: D-SACK (Duplicate SACK)
-			 */
-			sack_track(tqe);
-			m_freem(m);
-			*tlenp = 0;
-			return (0);
-		}
-
-		/* Segment covers and extends on both ends. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			/* Replace block content. */
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			m_freem(tqe->trq_m);
-			tqe->trq_len = *tlenp;
-			tqe->trq_mcnt = mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			tqe->trq_m = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges next block to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tqe);
-			TCPSTAT_INC(tcps_reass_replace);
-			return (0);
-		}
-
-		/* Segment prepends to this block. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			KASSERT(!(thflags & TH_FIN),
-			    ("%s: new segment with FIN can't prepend", __func__));
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			sack_track(tqe);
-			TCPSTAT_INC(tcps_reass_prepend);
-			return (0);
-		}
-
-		/* Segment appends to this block. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq)) {
-			/* Trim head of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, th_seq))) {
-				m_adj(m, i);
-				*tlenp -= i;
-				/* TCP Statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_ml->m_next = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges two blocks to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tqe);
-			TCPSTAT_INC(tcps_reass_append);
-			return (0);
-		}
-	}
-
-insert:
-	/* Prepare to insert into block queue. */
-	if (tp->rcv_nxt == th_seq) {
-		/*
-		 * Use temporary struct trq on the stack for missing
-		 * segment to prevent blocking of all reassembly queues
-		 * due to zone exhaustion.
-		 */
-		tqen = &tqes;
-	} else {
-		tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO));
-		if (tqen == NULL) {
-			TCPSTAT_INC(tcps_rcvmemdrop);
-			m_freem(m);
-			*tlenp = 0;
-			return (0);
-		}
-		TCPSTAT_INC(tcps_reass_blocks);
-	}
-	tcp_reass_qsize++;
-	if (tcp_reass_spacetime) {
-		m = m_collapse();
-		mcnt = m_storagesize(m);
-	}
-	tqen->trq_seq = th_seq;
-	tqen->trq_len = *tlenp;
-	tqen->trq_mcnt = mcnt;
-	tp->t_trqmcnt += mcnt;
-	tqen->trq_m = m;
-	tqen->trq_ml = m_last(m);
-
-	/* Where to insert. */
-	if (tqe != NULL && SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		TAILQ_INSERT_AFTER(&tp->t_trq, tqe, tqen, trq_q);
-	else if (tqe != NULL)
-		TAILQ_INSERT_BEFORE(tqe, tqen, trq_q);
-	else {
-		KASSERT(TAILQ_EMPTY(&tp->t_trq),
-		    ("%s: first element queue not empty", __func__));
-		TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q);
-		/*
-		 * Flush the reassembly queue after x times the
-		 * current retransmit interval measured from the
-		 * arrival time of the first segment.
-		 */
-		if (tcp_reass_qtimo)
-			tcp_timer_activate(tp, TT_REASS,
-			    tp->t_rxtcur * tcp_reass_qtimo);
-	}
-	LIST_INSERT_HEAD(&tp->t_trq_sack, tqen, trq_s);
-
-	/* Missing segment? */
-	if (tp->rcv_nxt != th_seq)
-		return (0);
-present:
-	/*
-	 * Present data to user, advancing rcv_nxt through the
-	 * completed sequence space.
-	 */
-	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
-	    ("%s: queue empty at present", __func__));
-	KASSERT((TAILQ_FIRST(&tp->t_trq))->trq_seq == tp->rcv_nxt,
-	    ("%s: first block does not match rcv_nxt", __func__));
-	TCPSTAT_INC(tcps_reass_missingseg);
-
-	SOCKBUF_LOCK(&so->so_rcv);
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: block overlaps into next one", __func__));
-
-		if (tqe->trq_seq != tp->rcv_nxt)
-			break;
-		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-			m_freem(tqe->trq_m);
-		else
-			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
-		tp->rcv_nxt += tqe->trq_len;
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		LIST_REMOVE(tqe, trq_s);
-		if (tqe != &tqes)
-			uma_zfree(tcp_reass_zone, tqe);
-		V_tcp_reass_qsize--;
-	}
-	/* NB: sorwakeup_locked() does a implicit socket buffer unlock. */
-	sorwakeup_locked(so);
-
-	/*
-	 * Restart the reassembly queue flush timer after advancing
-	 * the sequence space and if queue is not empty.  Otherwise
-	 * deactivate it.
-	 */
-	if (tcp_reass_qtimo && !TAILQ_EMPTY(&tp->t_trq))
-		tcp_timer_activate(tp, TT_REASS,
-		    tp->t_rxtcur * tcp_reass_qtimo);
-	else
-		tcp_timer_activate(tp, TT_REASS, 0);
-
-	ND6_HINT(tp);
-	return (thflags);
-}
-
-/*
- * Merge one or more consecutive blocks together.
- */
-static void
-tcp_reass_merge(struct tcpcb *tp, struct trq *tqe, struct trq *tqen)
-{
-	int i;
-
-	KASSERT(tqe != NULL && tqen != NULL,
-	    ("%s: incomplete input", __func__));
-	KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-	    ("%s: blocks do not overlap, nothing to merge", __func__));
-
-	/* Appended block may reach beyond next block. */
-	while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq + tqen->trq_len)) {
-		/* TCP Statistics. */
-		TCPSTAT_ADD(tcps_rcvpartdupbyte, tqen->trq_len);

>>> TRUNCATED FOR MAIL (1000 lines) <<<


More information about the p4-projects mailing list