svn commit: r265338 - in head/sys: netinet sys

Gleb Smirnoff glebius at FreeBSD.org
Sun May 4 23:25:34 UTC 2014


Author: glebius
Date: Sun May  4 23:25:32 2014
New Revision: 265338
URL: http://svnweb.freebsd.org/changeset/base/265338

Log:
  The FreeBSD-SA-14:08.tcp was a lesson on not doing acrobatics with
  mixing on stack memory and UMA memory in one linked list.
  
  Thus, rewrite TCP reassembly code in terms of memory usage. The
  algorithm remains unchanged.
  
  We actually do not need extra memory to build a reassembly queue.
  Arriving mbufs are always packet header mbufs. So we got the length
  of data as pkthdr.len. We got m_nextpkt for linkage. And we need
  only one pointer to point at the tcphdr, use PH_loc for that.
  
  In tcpcb the t_segq fields becomes mbuf pointer. The t_segqlen
  field now counts not packets, but bytes in the queue. This gives
  us more precision when comparing to socket buffer limits.
  
  Sponsored by:	Netflix
  Sponsored by:	Nginx, Inc.

Modified:
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_reass.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/sys/mbuf.h

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c	Sun May  4 20:21:41 2014	(r265337)
+++ head/sys/netinet/tcp_input.c	Sun May  4 23:25:32 2014	(r265338)
@@ -1639,8 +1639,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
-	    LIST_EMPTY(&tp->t_segq) &&
-	    ((to.to_flags & TOF_TS) == 0 ||
+	    tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
 		/*
@@ -2908,8 +2907,7 @@ dodata:							/* XXX */
 		 * immediately when segments are out of order (so
 		 * fast retransmit can work).
 		 */
-		if (th->th_seq == tp->rcv_nxt &&
-		    LIST_EMPTY(&tp->t_segq) &&
+		if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if (DELAY_ACK(tp, tlen))
 				tp->t_flags |= TF_DELACK;

Modified: head/sys/netinet/tcp_reass.c
==============================================================================
--- head/sys/netinet/tcp_reass.c	Sun May  4 20:21:41 2014	(r265337)
+++ head/sys/netinet/tcp_reass.c	Sun May  4 23:25:32 2014	(r265338)
@@ -71,19 +71,10 @@ __FBSDID("$FreeBSD$");
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
-#ifdef TCPDEBUG
-#include <netinet/tcp_debug.h>
-#endif /* TCPDEBUG */
 
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
     "TCP Segment Reassembly Queue");
 
-static VNET_DEFINE(int, tcp_reass_maxseg) = 0;
-#define	V_tcp_reass_maxseg		VNET(tcp_reass_maxseg)
-SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
-    &VNET_NAME(tcp_reass_maxseg), 0,
-    "Global maximum number of TCP Segments in Reassembly Queue");
-
 static VNET_DEFINE(int, tcp_reass_overflows) = 0;
 #define	V_tcp_reass_overflows		VNET(tcp_reass_overflows)
 SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
@@ -91,78 +82,32 @@ SYSCTL_VNET_INT(_net_inet_tcp_reass, OID
     &VNET_NAME(tcp_reass_overflows), 0,
     "Global number of TCP Segment Reassembly Queue Overflows");
 
-static VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-#define	V_tcp_reass_zone		VNET(tcp_reass_zone)
-SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET,
-    &VNET_NAME(tcp_reass_zone),
-    "Global number of TCP Segments currently in Reassembly Queue");
-
-/* Initialize TCP reassembly queue */
-static void
-tcp_reass_zone_change(void *tag)
-{
-
-	/* Set the zone limit and read back the effective value. */
-	V_tcp_reass_maxseg = nmbclusters / 16;
-	V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
-	    V_tcp_reass_maxseg);
-}
-
-void
-tcp_reass_init(void)
-{
-
-	V_tcp_reass_maxseg = nmbclusters / 16;
-	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
-	    &V_tcp_reass_maxseg);
-	V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	/* Set the zone limit and read back the effective value. */
-	V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone,
-	    V_tcp_reass_maxseg);
-	EVENTHANDLER_REGISTER(nmbclusters_change,
-	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
-}
-
-#ifdef VIMAGE
-void
-tcp_reass_destroy(void)
-{
-
-	uma_zdestroy(V_tcp_reass_zone);
-}
-#endif
-
 void
 tcp_reass_flush(struct tcpcb *tp)
 {
-	struct tseg_qent *qe;
+	struct mbuf *m;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
-	while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
-		LIST_REMOVE(qe, tqe_q);
-		m_freem(qe->tqe_m);
-		uma_zfree(V_tcp_reass_zone, qe);
-		tp->t_segqlen--;
+	while ((m = tp->t_segq) != NULL) {
+		tp->t_segq = m->m_nextpkt;
+		tp->t_segqlen -= m->m_pkthdr.len;
+		m_freem(m);
 	}
 
 	KASSERT((tp->t_segqlen == 0),
-	    ("TCP reass queue %p segment count is %d instead of 0 after flush.",
+	    ("TCP reass queue %p length is %d instead of 0 after flush.",
 	    tp, tp->t_segqlen));
 }
 
+#define	M_TCPHDR(m)	((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr))
+
 int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
-	struct tseg_qent *q;
-	struct tseg_qent *p = NULL;
-	struct tseg_qent *nq;
-	struct tseg_qent *te = NULL;
 	struct socket *so = tp->t_inpcb->inp_socket;
-	char *s = NULL;
-	int flags;
-	struct tseg_qent tqs;
+	struct mbuf *mq, *mp;
+	int flags, wakeup;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
@@ -178,6 +123,10 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	if (th == NULL)
 		goto present;
 
+	M_ASSERTPKTHDR(m);
+	KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
+	    *tlenp, m->m_pkthdr.len));
+
 	/*
 	 * Limit the number of segments that can be queued to reduce the
 	 * potential for mbuf exhaustion. For best performance, we want to be
@@ -188,19 +137,17 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	 * Always let the missing segment through which caused this queue.
 	 * NB: Access to the socket buffer is left intentionally unlocked as we
 	 * can tolerate stale information here.
-	 *
-	 * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
-	 * should work but causes packets to be dropped when they shouldn't.
-	 * Investigate why and re-evaluate the below limit after the behaviour
-	 * is understood.
 	 */
 	if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
-	    tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
+	    tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
+		char *s;
+
 		V_tcp_reass_overflows++;
 		TCPSTAT_INC(tcps_rcvmemdrop);
 		m_freem(m);
 		*tlenp = 0;
-		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
+		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+		    NULL))) {
 			log(LOG_DEBUG, "%s; %s: queue limit reached, "
 			    "segment dropped\n", s, __func__);
 			free(s, M_TCPLOG);
@@ -209,46 +156,13 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	}
 
 	/*
-	 * Allocate a new queue entry. If we can't, or hit the zone limit
-	 * just drop the pkt.
-	 *
-	 * Use a temporary structure on the stack for the missing segment
-	 * when the zone is exhausted. Otherwise we may get stuck.
-	 */
-	te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT);
-	if (te == NULL) {
-		if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) {
-			TCPSTAT_INC(tcps_rcvmemdrop);
-			m_freem(m);
-			*tlenp = 0;
-			if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
-			    NULL))) {
-				log(LOG_DEBUG, "%s; %s: global zone limit "
-				    "reached, segment dropped\n", s, __func__);
-				free(s, M_TCPLOG);
-			}
-			return (0);
-		} else {
-			bzero(&tqs, sizeof(struct tseg_qent));
-			te = &tqs;
-			if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
-			    NULL))) {
-				log(LOG_DEBUG,
-				    "%s; %s: global zone limit reached, using "
-				    "stack for missing segment\n", s, __func__);
-				free(s, M_TCPLOG);
-			}
-		}
-	}
-	tp->t_segqlen++;
-
-	/*
 	 * Find a segment which begins after this one does.
 	 */
-	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
-		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
+	mp = NULL;
+	for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
+		if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
 			break;
-		p = q;
+		mp = mq;
 	}
 
 	/*
@@ -256,18 +170,16 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	 * our data already.  If so, drop the data from the incoming
 	 * segment.  If it provides all of our data, drop us.
 	 */
-	if (p != NULL) {
+	if (mp != NULL) {
 		int i;
+
 		/* conversion to int (in i) handles seq wraparound */
-		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
+		i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
 		if (i > 0) {
 			if (i >= *tlenp) {
 				TCPSTAT_INC(tcps_rcvduppack);
 				TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
 				m_freem(m);
-				if (te != &tqs)
-					uma_zfree(V_tcp_reass_zone, te);
-				tp->t_segqlen--;
 				/*
 				 * Try to present any queued data
 				 * at the left window edge to the user.
@@ -289,37 +201,40 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	 * While we overlap succeeding segments trim them or,
 	 * if they are completely covered, dequeue them.
 	 */
-	while (q) {
-		int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
+	while (mq) {
+		struct mbuf *nq;
+		int i;
+
+		i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
 		if (i <= 0)
 			break;
-		if (i < q->tqe_len) {
-			q->tqe_th->th_seq += i;
-			q->tqe_len -= i;
-			m_adj(q->tqe_m, i);
+		if (i < mq->m_pkthdr.len) {
+			M_TCPHDR(mq)->th_seq += i;
+			m_adj(mq, i);
+			tp->t_segqlen -= i;
 			break;
 		}
 
-		nq = LIST_NEXT(q, tqe_q);
-		LIST_REMOVE(q, tqe_q);
-		m_freem(q->tqe_m);
-		uma_zfree(V_tcp_reass_zone, q);
-		tp->t_segqlen--;
-		q = nq;
+		nq = mq->m_nextpkt;
+		tp->t_segqlen -= mq->m_pkthdr.len;
+		m_freem(mq);
+		if (mp)
+			mp->m_nextpkt = nq;
+		else
+			tp->t_segq = nq;
+		mq = nq;
 	}
 
 	/* Insert the new segment queue entry into place. */
-	te->tqe_m = m;
-	te->tqe_th = th;
-	te->tqe_len = *tlenp;
-
-	if (p == NULL) {
-		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
+	if (mp) {
+		m->m_nextpkt = mp->m_nextpkt;
+		mp->m_nextpkt = m;
 	} else {
-		KASSERT(te != &tqs, ("%s: temporary stack based entry not "
-		    "first element in queue", __func__));
-		LIST_INSERT_AFTER(p, te, tqe_q);
+		m->m_nextpkt = tp->t_segq;
+		tp->t_segq = m ;
 	}
+	m->m_pkthdr.pkt_tcphdr = th;
+	tp->t_segqlen += m->m_pkthdr.len;
 
 present:
 	/*
@@ -328,25 +243,30 @@ present:
 	 */
 	if (!TCPS_HAVEESTABLISHED(tp->t_state))
 		return (0);
-	q = LIST_FIRST(&tp->t_segq);
-	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
-		return (0);
+
+	flags = 0;
+	wakeup = 0;
 	SOCKBUF_LOCK(&so->so_rcv);
-	do {
-		tp->rcv_nxt += q->tqe_len;
-		flags = q->tqe_th->th_flags & TH_FIN;
-		nq = LIST_NEXT(q, tqe_q);
-		LIST_REMOVE(q, tqe_q);
+	while ((mq = tp->t_segq) != NULL &&
+	    M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
+		tp->t_segq = mq->m_nextpkt;
+
+		tp->rcv_nxt += mq->m_pkthdr.len;
+		tp->t_segqlen -= mq->m_pkthdr.len;
+		flags = M_TCPHDR(mq)->th_flags & TH_FIN;
+
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-			m_freem(q->tqe_m);
-		else
-			sbappendstream_locked(&so->so_rcv, q->tqe_m);
-		if (q != &tqs)
-			uma_zfree(V_tcp_reass_zone, q);
-		tp->t_segqlen--;
-		q = nq;
-	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
+			m_freem(mq);
+		else {
+			mq->m_nextpkt = NULL;
+			sbappendstream_locked(&so->so_rcv, mq);
+			wakeup = 1;
+		}
+	}
 	ND6_HINT(tp);
-	sorwakeup_locked(so);
+	if (wakeup)
+		sorwakeup_locked(so);
+	else
+		SOCKBUF_UNLOCK(&so->so_rcv);
 	return (flags);
 }

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c	Sun May  4 20:21:41 2014	(r265337)
+++ head/sys/netinet/tcp_subr.c	Sun May  4 23:25:32 2014	(r265338)
@@ -375,7 +375,6 @@ tcp_init(void)
 	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
-	tcp_reass_init();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
@@ -433,7 +432,6 @@ tcp_destroy(void)
 {
 	int error;
 
-	tcp_reass_destroy();
 	tcp_hc_destroy();
 	syncache_destroy();
 	tcp_tw_destroy();

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c	Sun May  4 20:21:41 2014	(r265337)
+++ head/sys/netinet/tcp_usrreq.c	Sun May  4 23:25:32 2014	(r265338)
@@ -1949,7 +1949,7 @@ db_print_tcpcb(struct tcpcb *tp, const c
 
 	db_print_indent(indent);
 	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
-	   LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
+	   tp->t_segq, tp->t_segqlen, tp->t_dupacks);
 
 	db_print_indent(indent);
 	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h	Sun May  4 20:21:41 2014	(r265337)
+++ head/sys/netinet/tcp_var.h	Sun May  4 23:25:32 2014	(r265338)
@@ -46,15 +46,6 @@ VNET_DECLARE(int, tcp_do_rfc1323);
 
 #endif /* _KERNEL */
 
-/* TCP segment queue entry */
-struct tseg_qent {
-	LIST_ENTRY(tseg_qent) tqe_q;
-	int	tqe_len;		/* TCP segment data length */
-	struct	tcphdr *tqe_th;		/* a pointer to tcp header */
-	struct	mbuf	*tqe_m;		/* mbuf contains packet */
-};
-LIST_HEAD(tsegqe_head, tseg_qent);
-
 struct sackblk {
 	tcp_seq start;		/* start seq no. of sack block */
 	tcp_seq end;		/* end seq no. */
@@ -100,7 +91,7 @@ do {								\
  * Organized for 16 byte cacheline efficiency.
  */
 struct tcpcb {
-	struct	tsegqe_head t_segq;	/* segment reassembly queue */
+	struct	mbuf *t_segq;		/* segment reassembly queue */
 	void	*t_pspare[2];		/* new reassembly queue */
 	int	t_segqlen;		/* segment reassembly queue length */
 	int	t_dupacks;		/* consecutive dup acks recd */
@@ -663,11 +654,7 @@ char	*tcp_log_addrs(struct in_conninfo *
 char	*tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
 	    const void *);
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
-void	 tcp_reass_init(void);
 void	 tcp_reass_flush(struct tcpcb *);
-#ifdef VIMAGE
-void	 tcp_reass_destroy(void);
-#endif
 void	 tcp_input(struct mbuf *, int);
 u_long	 tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
 u_long	 tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h	Sun May  4 20:21:41 2014	(r265337)
+++ head/sys/sys/mbuf.h	Sun May  4 23:25:32 2014	(r265338)
@@ -159,6 +159,7 @@ struct pkthdr {
 #define	tso_segsz	PH_per.sixteen[1]
 #define	csum_phsum	PH_per.sixteen[2]
 #define	csum_data	PH_per.thirtytwo[1]
+#define	pkt_tcphdr	PH_loc.ptr
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is


More information about the svn-src-head mailing list