PERFORCE change 133280 for review

Andre Oppermann andre at FreeBSD.org
Mon Jan 14 15:09:48 PST 2008


http://perforce.freebsd.org/chv.cgi?CH=133280

Change 133280 by andre at andre_flirtbox on 2008/01/14 23:09:09

	Change reassembly queue size limiting from the number of mbufs to the gross
	mbuf space used in the queue.  This is more natural as it reflects the actual
	amount of kernel memory used, and it properly accounts for the mbuf storage
	size, be it a normal mbuf, a cluster, or a jumbo mbuf.  The upper limit is the
	remaining free space in the socket buffer.  This effectively prevents mbuf
	exhaustion due to pathological traffic (one-byte segments with a hole each
	time) on a single connection.  The maximum possible mbuf usage is now
	equivalent to that of a completely filled but unserviced socket buffer.
	Accounting for the gross mbuf space underestimates the effective free space
	in the socket buffer relative to the actual data when 1500-byte packets are
	stored in 2k clusters: each such packet is charged the full 2048 bytes.
	This shouldn't be too much of a problem, though.  The edge case will be
	addressed later when mbuf chain compacting is implemented.
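
	Condensed from the tcp_reass.c diff below, the core of the new limit check
	first compares the queue's gross mbuf usage against the free space in the
	receive socket buffer, then charges the new segment's chain by its gross
	storage size:

		/*
		 * Drop the segment if the queue already consumes more mbuf
		 * space than is left in the receive socket buffer.  The
		 * missing segment itself (th_seq == rcv_nxt) is always let
		 * through.
		 */
		if (th->th_seq != tp->rcv_nxt &&
		    tp->t_trqmcnt > sbspace(&so->so_rcv)) {
			tcpstat.tcps_rcvreassoverflow++;
			tcpstat.tcps_rcvmemdrop++;
			m_freem(m);
			*tlenp = 0;
			return (0);
		}

		/*
		 * Gross storage of the new chain: the external buffer size
		 * for clusters and jumbo mbufs, MSIZE otherwise.  The total
		 * is added to tp->t_trqmcnt once the segment is queued.
		 */
		for (n = m; n; n = n->m_next)
			mcnt += (n->m_flags & M_EXT) ? n->m_ext.ext_size : MSIZE;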
	
	The reassembly-specific sysctls net.inet.tcp.reass.maxqlen and
	net.inet.tcp.reass.overflows are now unused.
	
	Update all TCP statistics.  Add a new statistic, tcps_rcvreassoverflow, for
	reassembly queue overflows.
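
	For reference, the per-block and per-connection accounting fields and the
	new counter, condensed from the tcp_var.h diff below (unrelated fields
	elided):

		struct trq {
			...
			int	trq_mcnt;	/* gross mbuf size of block */
			...
		};

		struct tcpcb {
			...
			int	t_trqmcnt;	/* reassembly queue gross mbuf usage */
			...
		};

		struct tcpstat {
			...
			u_long	tcps_rcvreassoverflow;	/* reassembly queue overflows */
			...
		};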

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#4 edit
.. //depot/projects/tcp_reass/netinet/tcp_usrreq.c#3 edit
.. //depot/projects/tcp_reass/netinet/tcp_var.h#3 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#4 (text+ko) ====

@@ -103,18 +103,20 @@
 static int tcp_reass_maxseg = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
     &tcp_reass_maxseg, 0,
-    "Global maximum number of TCP Segments in Reassembly Queue");
+    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
 
 int tcp_reass_qsize = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD,
     &tcp_reass_qsize, 0,
-    "Global number of TCP Segments currently in Reassembly Queue");
+    "Global number of TCP Segment Blocks currently in Reassembly Queue");
 
+/* XXX: unused. */
 static int tcp_reass_maxqlen = 48;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxqlen, CTLFLAG_RW,
     &tcp_reass_maxqlen, 0,
     "Maximum number of TCP Segments per individual Reassembly Queue");
 
+/* XXX: unused, moved to tcpstat. */
 static int tcp_reass_overflows = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD,
     &tcp_reass_overflows, 0,
@@ -135,6 +137,7 @@
 tcp_reass_init(void)
 {
 
+	/* XXX: nmbclusters may be zero. */
 	tcp_reass_maxseg = nmbclusters / 16;
 	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
 	    &tcp_reass_maxseg);
@@ -151,7 +154,7 @@
 	struct trq *tqe, *tqen;
 	struct socket *so = tp->t_inpcb->inp_socket;
 	struct mbuf *n;
-	int i, flags = 0, segs = 0;
+	int i, flags = 0, mcnt = 0;
 
 	INP_LOCK_ASSERT(tp->t_inpcb);
 
@@ -162,31 +165,39 @@
 	if (th == NULL)
 		goto present;
 
+	KASSERT(SEQ_LEQ(tp->rcv_nxt, th->th_seq),
+	    ("%s: sequence number below rcv_nxt", __func__));
+
 	/*
 	 * Limit the number of segments in the reassembly queue to prevent
 	 * holding on to too many segments (and thus running out of mbufs).
 	 * Make sure to let the missing segment through which caused this
-	 * queue.  Always keep one global queue entry spare to be able to
-	 * process the missing segment.
+	 * queue.
+	 * Count the gross space used by the mbufs in the reassembly queue
+	 * and limit it to the free space in the socket buffer.  This way
+	 * the reassembly queue can never consume more mbuf space than the
+	 * socket buffer got allocated anyway and it reflects the actual
+	 * amount of kernel memory used.  This effectively prevents mbuf
+	 * exhaustion due to pathological traffic (one byte segments with
+	 * a hole each time) on a single connection.
+	 * Counting the gross mbuf space effectively sets the net data
+	 * limit lower than the socket buffer would allow.
+	 * It underestimates the effective free space in the socket buffer
+	 * vs. actual real data with 2k clusters and 1500 byte packets.
+	 * This shouldn't be too much of a problem though.
 	 */
 	if (th->th_seq != tp->rcv_nxt &&
-	    (tcp_reass_qsize + 1 >= tcp_reass_maxseg ||
-	     tp->t_trqlen >= tcp_reass_maxqlen)) {
-		tcp_reass_overflows++;
+	    tp->t_trqmcnt > sbspace(&so->so_rcv)) {
+		tcpstat.tcps_rcvreassoverflow++;
 		tcpstat.tcps_rcvmemdrop++;
 		m_freem(m);
 		*tlenp = 0;
 		return (0);
 	}
 
-	/* Accounting. */
-	tcpstat.tcps_rcvoopack++;
-	tcpstat.tcps_rcvoobyte += *tlenp;
 	/* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
 	for (n = m; n; n = n->m_next)
-		segs++;
-	tp->t_trqlen += segs;
-	tcp_reass_qsize += segs;
+		mcnt += (n->m_flags & M_EXT) ? n->m_ext.ext_size : MSIZE;
 
 	/* Get rid of packet header and mtags. */
 	m_demote(m, 1);
@@ -195,9 +206,13 @@
 	tqe = TAILQ_LAST(&tp->t_trq, trq_head);
 	if (tqe && tqe->trq_seq + tqe->trq_len == th->th_seq) {
 		tqe->trq_len += *tlenp;
-		tqe->trq_segs += segs;
+		tqe->trq_mcnt += mcnt;
+		tp->t_trqmcnt += mcnt;
 		tqe->trq_ml->m_next = m;
 		tqe->trq_ml = m_last(m);
+		/* TCP statistics. */
+		tcpstat.tcps_rcvoopack++;
+		tcpstat.tcps_rcvoobyte += *tlenp;
 		return (0);
 	}
 
@@ -207,24 +222,24 @@
 		KASSERT(tqe != NULL,
 		    ("%s: missing segment but nothing in queue", __func__));
 		KASSERT(SEQ_GT(tqe->trq_seq, th->th_seq),
-		    ("%s: first block already contains missing segment", __func__));
+		    ("%s: first block starts below missing segment", __func__));
 		if (SEQ_LT(tqe->trq_seq, th->th_seq + *tlenp)) {
 			/* Trim tail of segment. */
 			if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) {
 				m_adj(m, i);
-				*tlenp -= i;
-				/* tcpstat */
+				*tlenp += i;		/* NB: i is negative */
+				/* TCP statistics. */
+				tcpstat.tcps_rcvpartduppack++;
+				tcpstat.tcps_rcvpartdupbyte -= i;
 				/* Update accounting. */
-				if (segs > 1) {
-					for (n = m; n; n = n->m_next)
-						segs--;
-					tp->t_trqlen -= segs;
-					tcp_reass_qsize -= segs;
-				}
+				for (n = m; n; n = n->m_next)
+					mcnt += (n->m_flags & M_EXT) ?
+					    n->m_ext.ext_size : MSIZE;
 			}
 			/* Segment prepends first block. */
 			tqe->trq_len += *tlenp;
-			tqe->trq_segs += segs;
+			tqe->trq_mcnt += mcnt;
+			tp->t_trqmcnt += mcnt;
 			tqe->trq_seq = th->th_seq;
 			n = m_last(m);
 			n->m_next = tqe->trq_m;
@@ -234,6 +249,10 @@
 		goto insert;
 	}
 
+	/* TCP statistics. */
+	tcpstat.tcps_rcvoopack++;
+	tcpstat.tcps_rcvoobyte += *tlenp;
+
 	/* See where it fits. */
 	TAILQ_FOREACH(tqe, &tp->t_trq, trq_q) {
 		/* Segment is after this blocks coverage. */
@@ -248,8 +267,6 @@
 			tcpstat.tcps_rcvduppack++;
 			tcpstat.tcps_rcvdupbyte += *tlenp;
 			m_freem(m);
-			tp->t_trqlen -= segs;
-			tcp_reass_qsize -= segs;
 			*tlenp = 0;
 			return (0);
 		}
@@ -259,10 +276,14 @@
 			if ((i = tqe->trq_seq + tqe->trq_len - th->th_seq)) {
 				m_adj(m, i);
 				*tlenp -= i;
-				/* tcpstat */
+				/* TCP Statistics. */
+				tcpstat.tcps_rcvpartduppack++;
+				tcpstat.tcps_rcvpartdupbyte += i;
+				/* XXX dupes */
 			}
 			tqe->trq_len += *tlenp;
-			tqe->trq_segs += segs;
+			tqe->trq_mcnt += mcnt;
+			tp->t_trqmcnt += mcnt;
 			tqe->trq_ml->m_next = m;
 			tqe->trq_ml = m_last(m);
 			/* Check if segment bridges two blocks to merge. */
@@ -274,14 +295,17 @@
 				    tqen->trq_seq)) {
 					m_adj(tqen->trq_m, i);
 					tqen->trq_len -= i;
-					/* tcpstat */
+					/* TCP statistics. */
+					tcpstat.tcps_rcvpartduppack++;
+					tcpstat.tcps_rcvpartdupbyte += i;
 				}
 				tqe->trq_len += tqen->trq_len;
-				tqe->trq_segs += tqen->trq_segs;
+				tqe->trq_mcnt += tqen->trq_mcnt;
 				tqe->trq_ml->m_next = tqen->trq_m;
 				tqe->trq_ml = tqen->trq_ml;
 				TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
 				uma_zfree(tcp_reass_zone, tqen);
+				tcp_reass_qsize--;
 			}
 			return (0);
 		}
@@ -290,17 +314,18 @@
 			/* Trim tail of segment. */
 			if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) {
 				m_adj(m, i);
-				*tlenp -= i;
+				*tlenp += i;		/* NB: i is negative */
+				/* TCP statistics. */
+				tcpstat.tcps_rcvpartduppack++;
+				tcpstat.tcps_rcvpartdupbyte -= i;
 				/* Update accounting. */
-				if (segs > 1) {
-					for (n = m; n; n = n->m_next)
-						segs--;
-					tp->t_trqlen -= segs;
-					tcp_reass_qsize -= segs;
-				}
+				for (n = m; n; n = n->m_next)
+					mcnt += (n->m_flags & M_EXT) ?
+					    n->m_ext.ext_size : MSIZE;
 			}
 			tqe->trq_len += *tlenp;
-			tqe->trq_segs += segs;
+			tqe->trq_mcnt += mcnt;
+			tp->t_trqmcnt += mcnt;
 			tqe->trq_seq = th->th_seq;
 			n = m_last(m);
 			n->m_next = tqe->trq_m;
@@ -318,9 +343,11 @@
 		*tlenp = 0;
 		return (0);
 	}
+	tcp_reass_qsize++;
 	tqen->trq_seq = th->th_seq;
 	tqen->trq_len = *tlenp;
-	tqen->trq_segs = segs;
+	tqen->trq_mcnt = mcnt;
+	tp->t_trqmcnt += mcnt;
 	tqen->trq_m = m;
 	tqen->trq_ml = m_last(m);
 
@@ -345,6 +372,7 @@
 		return (0);
 	SOCKBUF_LOCK(&so->so_rcv);
 	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
+		/* We can never go more than one round. */
 		if (tqe->trq_seq != tp->rcv_nxt)
 			break;
 #if 1
@@ -356,12 +384,10 @@
 			m_freem(tqe->trq_m);
 		else
 			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
-		KASSERT(tp->t_trqlen >= tqe->trq_segs,
-		    ("%s: t_trqlen incorrect", __func__));
-		tp->t_trqlen -= tqe->trq_segs;
-		tcp_reass_qsize -= tqe->trq_segs;
+		tp->t_trqmcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
 		uma_zfree(tcp_reass_zone, tqe);
+		tcp_reass_qsize--;
 	}
 	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
 	sorwakeup_locked(so);
@@ -384,11 +410,11 @@
 
 	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
 		m_freem(tqe->trq_m);
-		KASSERT(tp->t_trqlen >= tqe->trq_segs,
-		    ("%s: t_trqlen incorrect", __func__));
-		tp->t_trqlen -= tqe->trq_segs;
-		tcp_reass_qsize -= tqe->trq_segs;
+		KASSERT(tp->t_trqmcnt >= tqe->trq_mcnt,
+		    ("%s: t_trqmcnt incorrect", __func__));
+		tp->t_trqmcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
 		uma_zfree(tcp_reass_zone, tqe);
+		tcp_reass_qsize--;
 	}
 }

==== //depot/projects/tcp_reass/netinet/tcp_usrreq.c#3 (text+ko) ====

@@ -1744,8 +1744,8 @@
 	indent += 2;
 
 	db_print_indent(indent);
-	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
-	   TAILQ_FIRST(&tp->t_trq), tp->t_trqlen, tp->t_dupacks);
+	db_printf("t_trq first: %p   t_trqmcnt: %d   t_dupacks: %d\n",
+	   TAILQ_FIRST(&tp->t_trq), tp->t_trqmcnt, tp->t_dupacks);
 
 	db_print_indent(indent);
 	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",

==== //depot/projects/tcp_reass/netinet/tcp_var.h#3 (text+ko) ====

@@ -40,12 +40,12 @@
  */
 extern int	tcp_do_rfc1323;
 
-/* TCP reassembly queue segment entry. */
+/* TCP reassembly queue segment block entry. */
 struct trq {
 	TAILQ_ENTRY(trq) trq_q;
-	tcp_seq		trq_seq;	/* start of segment */
-	int		trq_len;	/* length of segment */
-	int		trq_segs;	/* number of mbufs */
+	tcp_seq		trq_seq;	/* start of block */
+	int		trq_len;	/* length of block */
+	int		trq_mcnt;	/* gross mbuf size of block */
 	int		trq_flags;	/* flags for segment chain */
 #define TRQ_FIN		0x01		/* FIN was on last segment */
 	struct mbuf	*trq_m;		/* mbuf chain of data */
@@ -97,7 +97,7 @@
  */
 struct tcpcb {
 	struct	trq_head t_trq;		/* segment reassembly queue */
-	int	t_trqlen;		/* segment reassembly queue length */
+	int	t_trqmcnt;		/* segment reassembly queue gross usage */
 	int	t_dupacks;		/* consecutive dup acks recd */
 
 	struct tcp_timer *t_timers;	/* All the TCP timers in one struct */
@@ -384,6 +384,7 @@
 	u_long	tcps_rcvpartdupbyte;	/* dup. bytes in part-dup. packets */
 	u_long	tcps_rcvoopack;		/* out-of-order packets received */
 	u_long	tcps_rcvoobyte;		/* out-of-order bytes received */
+	u_long	tcps_rcvreassoverflow;	/* reassembly queue overflows */
 	u_long	tcps_rcvpackafterwin;	/* packets with data after window */
 	u_long	tcps_rcvbyteafterwin;	/* bytes rcvd after window */
 	u_long	tcps_rcvafterclose;	/* packets rcvd after "close" */

