PERFORCE change 133751 for review

Andre Oppermann andre at FreeBSD.org
Sun Jan 20 14:49:10 PST 2008


http://perforce.freebsd.org/chv.cgi?CH=133751

Change 133751 by andre at andre_flirtbox on 2008/01/20 22:48:27

	
	Add more detailed statistics for reassembly queue behavior tracking.
	
	Add sysctl to optionally disable the reassembly queue for testing.
	
	Add sysctl tracking the global current amount of mbuf memory stored
	in all reassembly queues.
	
	Use struct trq on stack for the missing segment to prevent blocking
	of all reassembly queues due to zone exhaustion.
	
	Add a tcp timer to flush the reassembly queue after inactivity of
	more than four times the retransmit interval.  This prevents the
	reassembly queues from tying up memory for a long time, especially
	with large socket buffers.
	
	Remove separate tracking of out-of-order blocks in the SACK code.
	Use the block information of the reassembly queue instead.  Track
	which block was updated last.  It must be the first block to appear
	in the SACK list of an outgoing ACK segment.  There is one subtle
	change here: The RFC says the other blocks in the SACK list SHOULD
	appear in order of their arrival.  Instead the list ascends from
	the closest block to rcv_nxt now.  This is how the reassembly queue
	tracks its blocks.  It also makes sense as the information close
	to rcv_nxt is the most valuable to fill the holes.  In practice
	the SACK list is ordered like this anyway.  Except for heavy
	reordering.
	
	Will discuss the last two changes (flush timer and ordering of SACK
	list) on the tcpm mailing list.
	
	Checkpointing.  Code compiles and is theoretically complete but not
	yet tested.
	
	TODO: Testing of latest changes.
	TODO: KTR tracing of reassembly queue behavior.
	TODO: Discussion on tcpm.
	TODO: "ipfw tcptruncate" option to test reassembly code with wildly
	      cut segments (think chainsaw massacre).
	TODO: Use m_collapse() to keep down mbuf usage of blocks.
	TODO: ddb function to examine reassembly queue.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_input.c#3 edit
.. //depot/projects/tcp_reass/netinet/tcp_output.c#2 edit
.. //depot/projects/tcp_reass/netinet/tcp_reass.c#11 edit
.. //depot/projects/tcp_reass/netinet/tcp_sack.c#2 edit
.. //depot/projects/tcp_reass/netinet/tcp_subr.c#3 edit
.. //depot/projects/tcp_reass/netinet/tcp_timer.c#2 edit
.. //depot/projects/tcp_reass/netinet/tcp_timer.h#2 edit
.. //depot/projects/tcp_reass/netinet/tcp_usrreq.c#4 edit
.. //depot/projects/tcp_reass/netinet/tcp_var.h#5 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_input.c#3 (text+ko) ====

@@ -1110,9 +1110,6 @@
 			 * with nothing on the reassembly queue and
 			 * we have enough buffer space to take it.
 			 */
-			/* Clean receiver SACK report if present */
-			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
-				tcp_clean_sackreport(tp);
 			++tcpstat.tcps_preddat;
 			tp->rcv_nxt += tlen;
 			/*
@@ -2218,7 +2215,6 @@
 	 */
 	if ((tlen || (thflags & TH_FIN)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
-		tcp_seq save_start = th->th_seq;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
@@ -2261,8 +2257,6 @@
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
-		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
-			tcp_update_sack_list(tp, save_start, save_start + tlen);
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into

==== //depot/projects/tcp_reass/netinet/tcp_output.c#2 (text+ko) ====

@@ -466,7 +466,7 @@
 	if (len > tp->t_maxseg) {
 		if ((tp->t_flags & TF_TSO) && tcp_do_tso &&
 		    ((tp->t_flags & TF_SIGNATURE) == 0) &&
-		    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
+		    TAILQ_EMPTY(&tp->t_trq) && sack_rxmit == 0 &&
 		    tp->t_inpcb->inp_options == NULL &&
 		    tp->t_inpcb->in6p_options == NULL
 #ifdef IPSEC
@@ -673,10 +673,9 @@
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->t_flags & TF_SACK_PERMIT) &&
-			    tp->rcv_numsacks > 0) {
+			    !TAILQ_EMPTY(&tp->t_trq)) {
 				to.to_flags |= TOF_SACK;
-				to.to_nsacks = tp->rcv_numsacks;
-				to.to_sacks = (u_char *)tp->sackblks;
+				to.to_sacks = (u_char *)tp;
 			}
 		}
 #ifdef TCP_SIGNATURE
@@ -1348,8 +1347,6 @@
 		case TOF_SACK:
 			{
 			int sackblks = 0;
-			struct sackblk *sack = (struct sackblk *)to->to_sacks;
-			tcp_seq sack_seq;
 
 			while (!optlen || optlen % 4 != 2) {
 				optlen += TCPOLEN_NOP;
@@ -1359,19 +1356,11 @@
 				continue;
 			optlen += TCPOLEN_SACKHDR;
 			*optp++ = TCPOPT_SACK;
-			sackblks = min(to->to_nsacks,
-					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
-			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
-			while (sackblks--) {
-				sack_seq = htonl(sack->start);
-				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-				optp += sizeof(sack_seq);
-				sack_seq = htonl(sack->end);
-				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-				optp += sizeof(sack_seq);
-				optlen += TCPOLEN_SACK;
-				sack++;
-			}
+			sackblks = tcp_reass_sack((struct tcpcb *)to->to_sacks,
+			    optp + 1, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
+			*optp++ = TCPOLEN_SACKHDR + (sackblks * TCPOLEN_SACK);
+			optlen += TCPOLEN_SACK * sackblks;
+			optp += sizeof(tcp_seq) * 2 * sackblks;
 			tcpstat.tcps_sack_send_blocks++;
 			break;
 			}

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#11 (text+ko) ====

@@ -96,22 +96,33 @@
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
+static int tcp_reass_enabled = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR,
+    &tcp_reass_enabled, 0,
+    "Use of TCP Reassembly Queue");
+
 static int tcp_reass_maxblocks = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
     &tcp_reass_maxblocks, 0,
     "Global maximum number of TCP Segment Blocks in Reassembly Queue");
 
-int tcp_reass_qsize = 0;
+static int tcp_reass_qsize = 0;
 SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
     &tcp_reass_qsize, 0,
     "Global number of TCP Segment Blocks currently in Reassembly Queue");
 
-static void	tcp_reass_merge(struct tcpcb *, int *, struct trq *, struct trq *);
+static int tcp_reass_mcnt = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, mbufbytes, CTLFLAG_RD,
+    &tcp_reass_mcnt, 0,
+    "Global gross memory size of all mbufs currently in Reassembly Queue");
+
+static void	tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
 
 /* Initialize TCP reassembly queue */
 static void
@@ -163,9 +174,17 @@
 		goto present;
 	}
 
+	/* Check if it is really necessary to do all the work. */
+	if (!tcp_reass_enabled) {
+		*tlenp = 0;
+		m_freem(m);
+		return (0);
+	}
+
 	/* XXX: should not happen, but does for some reason. */
 	if (*tlenp == 0)
 		return (0);
+
 	KASSERT(*tlenp > 0,
 	    ("%s: segment doesn't contain any data", __func__));
 	KASSERT(SEQ_LEQ(tp->rcv_nxt, th->th_seq),
@@ -203,7 +222,7 @@
 	 */
 	if (th->th_seq != tp->rcv_nxt &&
 	    tp->t_trqmcnt > sbspace(&so->so_rcv)) {
-		tcpstat.tcps_rcvreassoverflow++;
+		tcpstat.tcps_reass_overflow++;
 		tcpstat.tcps_rcvmemdrop++;
 		m_freem(m);
 		*tlenp = 0;
@@ -232,11 +251,14 @@
 		tqe->trq_len += *tlenp;
 		tqe->trq_mcnt += mcnt;
 		tp->t_trqmcnt += mcnt;
+		tcp_reass_mcnt += mcnt;
 		tqe->trq_ml->m_next = m;
 		tqe->trq_ml = m_last(m);
+		tp->t_trq_last = tqe;
 		/* TCP statistics. */
 		tcpstat.tcps_rcvoopack++;
 		tcpstat.tcps_rcvoobyte += *tlenp;
+		tcpstat.tcps_reass_tail++;
 		return (0);
 	}
 
@@ -270,6 +292,7 @@
 			tqe->trq_len += *tlenp;
 			tqe->trq_mcnt += mcnt;
 			tp->t_trqmcnt += mcnt;
+			tcp_reass_mcnt += mcnt;
 			tqe->trq_seq = th->th_seq;
 			n = m_last(m);
 			n->m_next = tqe->trq_m;
@@ -297,6 +320,8 @@
 		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th->th_seq + *tlenp)) {
 			tcpstat.tcps_rcvduppack++;
 			tcpstat.tcps_rcvdupbyte += *tlenp;
+			tcpstat.tcps_reass_covered++;
+			tp->t_trq_last = tqe;
 			m_freem(m);
 			*tlenp = 0;
 			return (0);
@@ -311,13 +336,16 @@
 			tqe->trq_len = *tlenp;
 			tqe->trq_mcnt = mcnt;
 			tp->t_trqmcnt += mcnt;
+			tcp_reass_mcnt += mcnt;
 			tqe->trq_seq = th->th_seq;
 			tqe->trq_m = m;
 			tqe->trq_ml = m_last(m);
 			/* Check if segment bridges next block to merge. */
 			if (tqen != NULL &&
 			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tlenp, tqe, tqen);
+				tcp_reass_merge(tp, tqe, tqen);
+			tp->t_trq_last = tqe;
+			tcpstat.tcps_reass_replace++;
 			return (0);
 		}
 
@@ -340,10 +368,13 @@
 			tqe->trq_len += *tlenp;
 			tqe->trq_mcnt += mcnt;
 			tp->t_trqmcnt += mcnt;
+			tcp_reass_mcnt += mcnt;
 			tqe->trq_seq = th->th_seq;
 			n = m_last(m);
 			n->m_next = tqe->trq_m;
 			tqe->trq_m = m;
+			tp->t_trq_last = tqe;
+			tcpstat.tcps_reass_prepend++;
 			return (0);
 		}
 
@@ -362,12 +393,15 @@
 			tqe->trq_len += *tlenp;
 			tqe->trq_mcnt += mcnt;
 			tp->t_trqmcnt += mcnt;
+			tcp_reass_mcnt += mcnt;
 			tqe->trq_ml->m_next = m;
 			tqe->trq_ml = m_last(m);
 			/* Check if segment bridges two blocks to merge. */
 			if (tqen != NULL &&
 			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tlenp, tqe, tqen);
+				tcp_reass_merge(tp, tqe, tqen);
+			tp->t_trq_last = tqe;
+			tcpstat.tcps_reass_append++;
 			return (0);
 		}
 	}
@@ -376,19 +410,23 @@
 	/* Prepare to insert into block queue. */
 	if (tp->rcv_nxt == th->th_seq)
 		tqen = &tqes;
-	else
+	else {
 		tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO));
-	if (tqen == NULL) {
-		tcpstat.tcps_rcvmemdrop++;
-		m_freem(m);
-		*tlenp = 0;
-		return (0);
+		if (tqen == NULL) {
+			tcpstat.tcps_rcvmemdrop++;
+			m_freem(m);
+			*tlenp = 0;
+			return (0);
+		}
+		tp->t_trq_last = tqe;
+		tcpstat.tcps_reass_blocks++;
 	}
 	tcp_reass_qsize++;
 	tqen->trq_seq = th->th_seq;
 	tqen->trq_len = *tlenp;
 	tqen->trq_mcnt = mcnt;
 	tp->t_trqmcnt += mcnt;
+	tcp_reass_mcnt += mcnt;
 	tqen->trq_m = m;
 	tqen->trq_ml = m_last(m);
 
@@ -401,6 +439,11 @@
 		KASSERT(TAILQ_EMPTY(&tp->t_trq),
 		    ("%s: queue not empty", __func__));
 		TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q);
+		/*
+		 * Flush the reassembly queue after four times the
+		 * current retransmit interval.
+		 */
+		tcp_timer_activate(tp, TT_REASS, tp->t_rxtcur * 4);
 	}
 
 	/* Missing segment? */
@@ -413,6 +456,9 @@
 	 */
 	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
 	    ("%s: queue empty at present", __func__));
+	KASSERT((TAILQ_FIRST(&tp->t_trq))->trq_seq == tp->rcv_nxt,
+	    ("%s: first block does not match rcv_nxt", __func__));
+	tcpstat.tcps_reass_missingseg++;
 	SOCKBUF_LOCK(&so->so_rcv);
 	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
 		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
@@ -432,13 +478,23 @@
 			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
 		tp->rcv_nxt += tqe->trq_len;
 		tp->t_trqmcnt -= tqe->trq_mcnt;
+		tcp_reass_mcnt -= tqe->trq_mcnt;
 		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
+		if (tp->t_trq_last == tqe)
+			tp->t_trq_last = NULL;
 		if (tqe != &tqes)
 			uma_zfree(tcp_reass_zone, tqe);
 		tcp_reass_qsize--;
 	}
 	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
 	sorwakeup_locked(so);
+
+	/* Reset the flush timer if queue is not empty. */
+	if (!TAILQ_EMPTY(&tp->t_trq))
+		tcp_timer_activate(tp, TT_REASS, tp->t_rxtcur * 4);
+	else
+		tcp_timer_activate(tp, TT_REASS, 0);
+
 	ND6_HINT(tp);
 #if 1
 	return (flags);
@@ -448,20 +504,25 @@
 }
 
 static void
-tcp_reass_merge(struct tcpcb *tp, int *tlenp, struct trq *tqe, struct trq *tqen)
+tcp_reass_merge(struct tcpcb *tp, struct trq *tqe, struct trq *tqen)
 {
 #if 0
 	struct mbuf *m;
 #endif
 	int i;
 
+	KASSERT(tqe != NULL && tqen != NULL,
+	    ("%s: ", __func__));
 	KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
 	    ("%s: blocks do not overlap, nothing to merge", __func__));
 
 	/* Appended block may reach beyond next block. */
 	while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq + tqen->trq_len)) {
-		tcpstat.tcps_rcvpartdupbyte += tqen->trq_len;	/* Statistics */
+		/* TCP Statistics. */
+		tcpstat.tcps_rcvpartdupbyte += tqen->trq_len;
+		tcpstat.tcps_reass_covered++;
 		tp->t_trqmcnt -= tqe->trq_mcnt;
+		tcp_reass_mcnt -= tqe->trq_mcnt;
 		m_freem(tqen->trq_m);
 		TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
 		uma_zfree(tcp_reass_zone, tqen);
@@ -497,6 +558,54 @@
 	TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
 	uma_zfree(tcp_reass_zone, tqen);
 	tcp_reass_qsize--;
+	tcpstat.tcps_reass_merge++;
+}
+
+/*
+ * Put the sequence number of the reassembly queue blocks into
+ * the SACK options of an outgoing segment.
+ */
+int
+tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
+{
+	struct trq *tqe;
+	tcp_seq sack_seq;
+	int nsacks = 0;
+
+	KASSERT(numsacks > 0,
+	    ("%s: ", __func__));
+	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
+	    ("%s: ", __func__));
+
+	/* The most recent block must appear first. */
+	if (tp->t_trq_last != NULL) {
+		sack_seq = htonl(tp->t_trq_last->trq_seq);
+		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+		optp += sizeof(sack_seq);
+		sack_seq = htonl(tp->t_trq_last->trq_seq + tp->t_trq_last->trq_len);
+		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+		optp += sizeof(sack_seq);
+		numsacks--;
+		nsacks++;
+	}
+
+	/* Add the other less recent blocks. */
+	TAILQ_FOREACH(tqe, &tp->t_trq, trq_q) {
+		if (numsacks < 1)
+			break;
+		if (tp->t_trq_last == tqe)
+			continue;
+		sack_seq = htonl(tqe->trq_seq);
+		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+		optp += sizeof(sack_seq);
+		sack_seq = htonl(tqe->trq_seq + tqe->trq_len);
+		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
+		optp += sizeof(sack_seq);
+		numsacks--;
+		nsacks++;
+	}
+
+	return (nsacks);
 }
 
 /*
@@ -518,4 +627,5 @@
 		uma_zfree(tcp_reass_zone, tqe);
 		tcp_reass_qsize--;
 	}
+	tcp_timer_activate(tp, TT_REASS, 0);
 }

==== //depot/projects/tcp_reass/netinet/tcp_sack.c#2 (text+ko) ====

@@ -145,108 +145,6 @@
     "Global number of TCP SACK holes currently allocated");
 
 /*
- * This function is called upon receipt of new valid data (while not in
- * header prediction mode), and it updates the ordered list of sacks.
- */
-void
-tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
-{
-	/*
-	 * First reported block MUST be the most recent one.  Subsequent
-	 * blocks SHOULD be in the order in which they arrived at the
-	 * receiver.  These two conditions make the implementation fully
-	 * compliant with RFC 2018.
-	 */
-	struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
-	int num_head, num_saved, i;
-
-	INP_LOCK_ASSERT(tp->t_inpcb);
-
-	/* Check arguments. */
-	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
-
-	/* SACK block for the received segment. */
-	head_blk.start = rcv_start;
-	head_blk.end = rcv_end;
-
-	/*
-	 * Merge updated SACK blocks into head_blk, and save unchanged SACK
-	 * blocks into saved_blks[].  num_saved will have the number of the
-	 * saved SACK blocks.
-	 */
-	num_saved = 0;
-	for (i = 0; i < tp->rcv_numsacks; i++) {
-		tcp_seq start = tp->sackblks[i].start;
-		tcp_seq end = tp->sackblks[i].end;
-		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
-			/*
-			 * Discard this SACK block.
-			 */
-		} else if (SEQ_LEQ(head_blk.start, end) &&
-			   SEQ_GEQ(head_blk.end, start)) {
-			/*
-			 * Merge this SACK block into head_blk.  This SACK
-			 * block itself will be discarded.
-			 */
-			if (SEQ_GT(head_blk.start, start))
-				head_blk.start = start;
-			if (SEQ_LT(head_blk.end, end))
-				head_blk.end = end;
-		} else {
-			/*
-			 * Save this SACK block.
-			 */
-			saved_blks[num_saved].start = start;
-			saved_blks[num_saved].end = end;
-			num_saved++;
-		}
-	}
-
-	/*
-	 * Update SACK list in tp->sackblks[].
-	 */
-	num_head = 0;
-	if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
-		/*
-		 * The received data segment is an out-of-order segment.  Put
-		 * head_blk at the top of SACK list.
-		 */
-		tp->sackblks[0] = head_blk;
-		num_head = 1;
-		/*
-		 * If the number of saved SACK blocks exceeds its limit,
-		 * discard the last SACK block.
-		 */
-		if (num_saved >= MAX_SACK_BLKS)
-			num_saved--;
-	}
-	if (num_saved > 0) {
-		/*
-		 * Copy the saved SACK blocks back.
-		 */
-		bcopy(saved_blks, &tp->sackblks[num_head],
-		      sizeof(struct sackblk) * num_saved);
-	}
-
-	/* Save the number of SACK blocks. */
-	tp->rcv_numsacks = num_head + num_saved;
-}
-
-/*
- * Delete all receiver-side SACK information.
- */
-void
-tcp_clean_sackreport(struct tcpcb *tp)
-{
-	int i;
-
-	INP_LOCK_ASSERT(tp->t_inpcb);
-	tp->rcv_numsacks = 0;
-	for (i = 0; i < MAX_SACK_BLKS; i++)
-		tp->sackblks[i].start = tp->sackblks[i].end=0;
-}
-
-/*
  * Allocate struct sackhole.
  */
 static struct sackhole *

==== //depot/projects/tcp_reass/netinet/tcp_subr.c#3 (text+ko) ====

@@ -605,6 +605,7 @@
 	callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
 	callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
+	callout_init(&tp->t_timers->tt_reass, CALLOUT_MPSAFE);
 
 	if (tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
@@ -682,6 +683,7 @@
 	callout_stop(&tp->t_timers->tt_keep);
 	callout_stop(&tp->t_timers->tt_2msl);
 	callout_stop(&tp->t_timers->tt_delack);
+	callout_stop(&tp->t_timers->tt_reass);
 
 	/*
 	 * If we got enough samples through the srtt filter,
@@ -807,10 +809,8 @@
 			if (inpb->inp_vflag & INP_TIMEWAIT)
 				continue;
 			INP_LOCK(inpb);
-			if ((tcpb = intotcpcb(inpb)) != NULL) {
+			if ((tcpb = intotcpcb(inpb)) != NULL)
 				tcp_reass_qfree(tcpb);
-				tcp_clean_sackreport(tcpb);
-			}
 			INP_UNLOCK(inpb);
 		}
 		INP_INFO_RUNLOCK(&tcbinfo);

==== //depot/projects/tcp_reass/netinet/tcp_timer.c#2 (text+ko) ====

@@ -566,6 +566,42 @@
 }
 
 void
+tcp_timer_reass(void *xtp)
+{
+	struct tcpcb *tp = xtp;
+	struct inpcb *inp;
+
+	INP_INFO_RLOCK(&tcbinfo);
+	inp = tp->t_inpcb;
+	/*
+	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
+	 * tear-down mean we need it as a work-around for races between
+	 * timers and tcp_discardcb().
+	 *
+	 * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
+	 */
+	if (inp == NULL) {
+		tcp_timer_race++;
+		INP_INFO_RUNLOCK(&tcbinfo);
+		return;
+	}
+	INP_LOCK(inp);
+	INP_INFO_RUNLOCK(&tcbinfo);
+	if ((inp->inp_vflag & INP_DROPPED) || callout_pending(&tp->t_timers->tt_reass)
+	    || !callout_active(&tp->t_timers->tt_reass)) {
+		INP_UNLOCK(inp);
+		return;
+	}
+	callout_deactivate(&tp->t_timers->tt_reass);
+
+	tcpstat.tcps_reass_flush++;
+	tcp_reass_qfree(tp);
+	tp->t_flags |= TF_ACKNOW;
+	(void) tcp_output(tp);
+	INP_UNLOCK(inp);
+}
+
+void
 tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
 {
 	struct callout *t_callout;
@@ -592,6 +628,10 @@
 			t_callout = &tp->t_timers->tt_2msl;
 			f_callout = tcp_timer_2msl;
 			break;
+		case TT_REASS:
+			t_callout = &tp->t_timers->tt_reass;
+			f_callout = tcp_timer_reass;
+			break;
 		default:
 			panic("bad timer_type");
 		}
@@ -623,6 +663,8 @@
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			break;
+		case TT_REASS:
+			t_callout = &tp->t_timers->tt_reass;
 		default:
 			panic("bad timer_type");
 		}

==== //depot/projects/tcp_reass/netinet/tcp_timer.h#2 (text+ko) ====

@@ -125,7 +125,7 @@
 
 #ifdef	TCPTIMERS
 static const char *tcptimers[] =
-    { "REXMT", "PERSIST", "KEEP", "2MSL" };
+    { "REXMT", "PERSIST", "KEEP", "2MSL", "REASS" };
 #endif
 
 /*
@@ -147,18 +147,21 @@
 	struct	callout tt_keep;	/* keepalive */
 	struct	callout tt_2msl;	/* 2*msl TIME_WAIT timer */
 	struct	callout tt_delack;	/* delayed ACK timer */
+	struct	callout tt_reass;	/* flush reassembly queue */
 };
 #define TT_DELACK	0x01
 #define TT_REXMT	0x02
 #define TT_PERSIST	0x04
 #define TT_KEEP		0x08
 #define TT_2MSL		0x10
+#define TT_REASS	0x20
 
 extern int tcp_keepinit;		/* time to establish connection */
 extern int tcp_keepidle;		/* time before keepalive probes begin */
 extern int tcp_keepintvl;		/* time between keepalive probes */
 extern int tcp_maxidle;			/* time to drop after starting probes */
 extern int tcp_delacktime;		/* time before sending a delayed ACK */
+extern int tcp_reassflush;		/* time before flushing the reassembly queue */
 extern int tcp_maxpersistidle;
 extern int tcp_rexmit_min;
 extern int tcp_rexmit_slop;
@@ -177,6 +180,7 @@
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);
 void	tcp_timer_delack(void *xtp);
+void	tcp_timer_reass(void *xtp);
 
 #endif /* _KERNEL */
 

==== //depot/projects/tcp_reass/netinet/tcp_usrreq.c#4 (text+ko) ====

@@ -1839,8 +1839,8 @@
 	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
 
 	db_print_indent(indent);
-	db_printf("snd_fack: 0x%08x   rcv_numsacks: %d   sack_newdata: "
-	    "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata);
+	db_printf("snd_fack: 0x%08x   sack_newdata: 0x%08x\n",
+	    tp->snd_fack, tp->sack_newdata);
 
 	/* Skip sackblks, sackhint. */
 

==== //depot/projects/tcp_reass/netinet/tcp_var.h#5 (text+ko) ====

@@ -52,7 +52,6 @@
 	struct mbuf	*trq_ml;	/* last mbuf in chain of data */
 };
 TAILQ_HEAD(trq_head, trq);
-extern	int		tcp_reass_qsize;
 extern	struct uma_zone	*tcp_reass_zone;
 
 struct sackblk {
@@ -97,6 +96,7 @@
  */
 struct tcpcb {
 	struct	trq_head t_trq;		/* segment reassembly queue */
+	struct	trq *t_trq_last;	/* last addition to reassembly queue */
 	int	t_trqmcnt;		/* segment reassembly queue gross usage */
 	int	t_dupacks;		/* consecutive dup acks recd */
 
@@ -203,8 +203,6 @@
 	TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
 					/* SACK scoreboard (sorted) */
 	tcp_seq	snd_fack;		/* last seq number(+1) sack'd by rcv'r*/
-	int	rcv_numsacks;		/* # distinct sack blks present */
-	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
 	tcp_seq sack_newdata;		/* New data xmitted in this recovery
 					   episode starts at this seq number */
 	struct sackhint	sackhint;	/* SACK scoreboard hint */
@@ -384,7 +382,6 @@
 	u_long	tcps_rcvpartdupbyte;	/* dup. bytes in part-dup. packets */
 	u_long	tcps_rcvoopack;		/* out-of-order packets received */
 	u_long	tcps_rcvoobyte;		/* out-of-order bytes received */
-	u_long	tcps_rcvreassoverflow;	/* reassembly queue overflows */
 	u_long	tcps_rcvpackafterwin;	/* packets with data after window */
 	u_long	tcps_rcvbyteafterwin;	/* bytes rcvd after window */
 	u_long	tcps_rcvafterclose;	/* packets rcvd after "close" */
@@ -410,6 +407,17 @@
 	u_long	tcps_listendrop;	/* listen queue overflows */
 	u_long	tcps_badrst;		/* ignored RSTs in the window */
 
+	u_long	tcps_reass_blocks;	/* reassembly blocks created */
+	u_long	tcps_reass_missingseg;	/* missing segments received */
+	u_long	tcps_reass_overflow;	/* reassembly queue overflows */
+	u_long	tcps_reass_tail;	/* packet appends to tail block */
+	u_long	tcps_reass_merge;	/* reassembly block merges */
+	u_long	tcps_reass_prepend;	/* packet prepends block */
+	u_long	tcps_reass_append;	/* packet appends block */
+	u_long	tcps_reass_covered;	/* block covered and removed */
+	u_long	tcps_reass_replace;	/* block replaced */
+	u_long	tcps_reass_flush;	/* queue flushes due to timeout */
+
 	u_long	tcps_sc_added;		/* entry added to syncache */
 	u_long	tcps_sc_retransmitted;	/* syncache entry was retransmitted */
 	u_long	tcps_sc_dupsyn;		/* duplicate SYN packet */
@@ -536,6 +544,7 @@
 	    const void *);
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
 void	 tcp_reass_init(void);
+int	 tcp_reass_sack(struct tcpcb *, u_char *, int);
 void	 tcp_reass_qfree(struct tcpcb *);
 void	 tcp_input(struct mbuf *, int);
 u_long	 tcp_maxmtu(struct in_conninfo *, int *);


More information about the p4-projects mailing list