PERFORCE change 40035 for review

Sam Leffler sam at FreeBSD.org
Mon Oct 20 16:28:09 PDT 2003


http://perforce.freebsd.org/chv.cgi?CH=40035

Change 40035 by sam at sam_ebb on 2003/10/20 16:27:57

	jason thorpe's tail optimizations for stream sockets, as
	passed on by ps/jayanth and fixed by me
	
	add global SOCKBUF_DEBUG option to control debugging code

Affected files ...

.. //depot/projects/netperf/sys/conf/NOTES#12 edit
.. //depot/projects/netperf/sys/conf/options#11 edit
.. //depot/projects/netperf/sys/kern/uipc_socket.c#3 edit
.. //depot/projects/netperf/sys/kern/uipc_socket2.c#3 edit
.. //depot/projects/netperf/sys/netinet/tcp_input.c#5 edit
.. //depot/projects/netperf/sys/netinet/tcp_usrreq.c#2 edit
.. //depot/projects/netperf/sys/sys/socketvar.h#3 edit

Differences ...

==== //depot/projects/netperf/sys/conf/NOTES#12 (text+ko) ====

@@ -2282,6 +2282,7 @@
 # Debug options
 options 	BUS_DEBUG	# enable newbus debugging
 options 	DEBUG_VFS_LOCKS	# enable vfs lock debugging
+options		SOCKBUF_DEBUG	# enable sockbuf last record/mb tail checking
 
 #####################################################################
 # SYSV IPC KERNEL PARAMETERS

==== //depot/projects/netperf/sys/conf/options#11 (text+ko) ====

@@ -618,6 +618,8 @@
 # XXX bogusly global.
 DEVICE_POLLING		opt_global.h
 
+SOCKBUF_DEBUG		opt_global.h
+
 # options for ubsec driver
 UBSEC_DEBUG		opt_ubsec.h
 UBSEC_RNDTEST		opt_ubsec.h

==== //depot/projects/netperf/sys/kern/uipc_socket.c#3 (text+ko) ====

@@ -887,6 +887,8 @@
 			error = EWOULDBLOCK;
 			goto release;
 		}
+		SBLASTRECORDCHK(&so->so_rcv);
+		SBLASTMBUFCHK(&so->so_rcv);
 		sbunlock(&so->so_rcv);
 		error = sbwait(&so->so_rcv);
 		splx(s);
@@ -897,6 +899,8 @@
 dontblock:
 	if (uio->uio_td)
 		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
+	SBLASTRECORDCHK(&so->so_rcv);
+	SBLASTMBUFCHK(&so->so_rcv);
 	nextrecord = m->m_nextpkt;
 	if (pr->pr_flags & PR_ADDR) {
 		KASSERT(m->m_type == MT_SONAME,
@@ -938,12 +942,32 @@
 		}
 	}
 	if (m) {
-		if ((flags & MSG_PEEK) == 0)
+		if ((flags & MSG_PEEK) == 0) {
 			m->m_nextpkt = nextrecord;
+			/*
+			 * If nextrecord == NULL (this is a single chain),
+			 * then sb_lastrecord may not be valid here if m
+			 * was changed earlier.
+			 */
+			if (nextrecord == NULL) {
+				KASSERT(so->so_rcv.sb_mb == m,
+					("receive tailq 1"));
+				so->so_rcv.sb_lastrecord = m;
+			}
+		}
 		type = m->m_type;
 		if (type == MT_OOBDATA)
 			flags |= MSG_OOB;
+	} else {
+		if ((flags & MSG_PEEK) == 0) {
+			KASSERT(so->so_rcv.sb_mb == m,("receive tailq 2"));
+			so->so_rcv.sb_mb = nextrecord;
+			SB_EMPTY_FIXUP(&so->so_rcv);
+		}
 	}
+	SBLASTRECORDCHK(&so->so_rcv);
+	SBLASTMBUFCHK(&so->so_rcv);
+
 	moff = 0;
 	offset = 0;
 	while (m && uio->uio_resid > 0 && error == 0) {
@@ -970,6 +994,8 @@
 		 * block interrupts again.
 		 */
 		if (mp == 0) {
+			SBLASTRECORDCHK(&so->so_rcv);
+			SBLASTMBUFCHK(&so->so_rcv);
 			splx(s);
 #ifdef ZERO_COPY_SOCKETS
 			if (so_zero_copy_receive) {
@@ -1017,8 +1043,16 @@
 					so->so_rcv.sb_mb = m_free(m);
 					m = so->so_rcv.sb_mb;
 				}
-				if (m)
+				if (m) {
 					m->m_nextpkt = nextrecord;
+					if (nextrecord == NULL)
+						so->so_rcv.sb_lastrecord = m;
+				} else {
+					so->so_rcv.sb_mb = nextrecord;
+					SB_EMPTY_FIXUP(&so->so_rcv);
+				}
+				SBLASTRECORDCHK(&so->so_rcv);
+				SBLASTMBUFCHK(&so->so_rcv);
 			}
 		} else {
 			if (flags & MSG_PEEK)
@@ -1063,6 +1097,8 @@
 			 */
 			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
+			SBLASTRECORDCHK(&so->so_rcv);
+			SBLASTMBUFCHK(&so->so_rcv);
 			error = sbwait(&so->so_rcv);
 			if (error) {
 				sbunlock(&so->so_rcv);
@@ -1081,8 +1117,21 @@
 			(void) sbdroprecord(&so->so_rcv);
 	}
 	if ((flags & MSG_PEEK) == 0) {
-		if (m == 0)
+		if (m == 0) {
+			/*
+			 * First part is an inline SB_EMPTY_FIXUP().  Second
+			 * part makes sure sb_lastrecord is up-to-date if
+			 * there is still data in the socket buffer.
+			 */
 			so->so_rcv.sb_mb = nextrecord;
+			if (so->so_rcv.sb_mb == NULL) {
+				so->so_rcv.sb_mbtail = NULL;
+				so->so_rcv.sb_lastrecord = NULL;
+			} else if (nextrecord->m_nextpkt == NULL)
+				so->so_rcv.sb_lastrecord = nextrecord;
+		}
+		SBLASTRECORDCHK(&so->so_rcv);
+		SBLASTMBUFCHK(&so->so_rcv);
 		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
 	}

==== //depot/projects/netperf/sys/kern/uipc_socket2.c#3 (text+ko) ====

@@ -468,6 +468,60 @@
  * or sbdroprecord() when the data is acknowledged by the peer.
  */
 
+#ifdef SOCKBUF_DEBUG
+void
+sblastrecordchk(struct sockbuf *sb, const char *file, int line)
+{
+	struct mbuf *m = sb->sb_mb;
+
+	while (m && m->m_nextpkt)
+		m = m->m_nextpkt;
+
+	if (m != sb->sb_lastrecord) {
+		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
+			__func__, sb->sb_mb, sb->sb_lastrecord, m);
+		printf("packet chain:\n");
+		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
+			printf("\t%p\n", m);
+		panic("%s from %s:%u", __func__, file, line);
+	}
+}
+
+void
+sblastmbufchk(struct sockbuf *sb, const char *file, int line)
+{
+	struct mbuf *m = sb->sb_mb;
+	struct mbuf *n;
+
+	while (m && m->m_nextpkt)
+		m = m->m_nextpkt;
+
+	while (m && m->m_next)
+		m = m->m_next;
+
+	if (m != sb->sb_mbtail) {
+		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
+			__func__, sb->sb_mb, sb->sb_mbtail, m);
+		printf("packet tree:\n");
+		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
+			printf("\t");
+			for (n = m; n != NULL; n = n->m_next)
+				printf("%p ", n);
+			printf("\n");
+		}
+		panic("%s from %s:%u", __func__, file, line);
+	}
+}
+#endif /* SOCKBUF_DEBUG */
+
+#define SBLINKRECORD(sb, m0) do {					\
+	if ((sb)->sb_lastrecord != NULL)				\
+		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
+	else								\
+		(sb)->sb_mb = (m0);					\
+	(sb)->sb_lastrecord = (m0);					\
+} while (/*CONSTCOND*/0)
+
 /*
  * Append mbuf chain m to the last record in the
  * socket buffer sb.  The additional space associated
@@ -483,6 +537,7 @@
 
 	if (m == 0)
 		return;
+	SBLASTRECORDCHK(sb);
 	n = sb->sb_mb;
 	if (n) {
 		while (n->m_nextpkt)
@@ -493,8 +548,53 @@
 				return;
 			}
 		} while (n->m_next && (n = n->m_next));
+	} else {
+		/*
+		 * XXX Would like to simply use sb_mbtail here, but
+		 * XXX I need to verify that I won't miss an EOR that
+		 * XXX way.
+		 */
+		if ((n = sb->sb_lastrecord) != NULL) {
+			do {
+				if (n->m_flags & M_EOR) {
+					sbappendrecord(sb, m); /* XXXXXX!!!! */
+					return;
+				}
+			} while (n->m_next && (n = n->m_next));
+		} else {
+			/*
+			 * If this is the first record in the socket buffer,
+			 * it's also the last record.
+			 */
+			sb->sb_lastrecord = m;
+		}
 	}
 	sbcompress(sb, m, n);
+	SBLASTRECORDCHK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller
+ * absolutely knows that there will never be more than one record
+ * in the socket buffer, that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream(struct sockbuf *sb, struct mbuf *m)
+{
+
+	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
+	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
+
+	SBLASTMBUFCHK(sb);
+
+#ifdef MBUFTRACE
+	m_claim(m, sb->sb_mowner);
+#endif
+
+	sbcompress(sb, m, sb->sb_mbtail);
+
+	sb->sb_lastrecord = sb->sb_mb;
+	SBLASTRECORDCHK(sb);
 }
 
 #ifdef SOCKBUF_DEBUG
@@ -516,7 +616,7 @@
 	    }
 	}
 	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
-		printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
+		printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
 		    mbcnt, sb->sb_mbcnt);
 		panic("sbcheck");
 	}
@@ -545,6 +645,8 @@
 	 * Note this permits zero length records.
 	 */
 	sballoc(sb, m0);
+	SBLASTRECORDCHK(sb);
+	SBLINKRECORD(sb, m0);
 	if (m)
 		m->m_nextpkt = m0;
 	else
@@ -616,7 +718,7 @@
 	struct sockaddr *asa;
 	struct mbuf *m0, *control;
 {
-	struct mbuf *m, *n;
+	struct mbuf *m, *n, *nlast;
 	int space = asa->sa_len;
 
 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
@@ -640,15 +742,16 @@
 	else
 		control = m0;
 	m->m_next = control;
-	for (n = m; n; n = n->m_next)
+	for (n = m; n->m_next != NULL; n = n->m_next)
 		sballoc(sb, n);
-	n = sb->sb_mb;
-	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
-		n->m_nextpkt = m;
-	} else
-		sb->sb_mb = m;
+	sballoc(sb, n);
+	nlast = n;
+	SBLINKRECORD(sb, m);
+
+	sb->sb_mbtail = nlast;
+	SBLASTMBUFCHK(sb);
+
+	SBLASTRECORDCHK(sb);
 	return (1);
 }
 
@@ -657,7 +760,7 @@
 	struct sockbuf *sb;
 	struct mbuf *control, *m0;
 {
-	struct mbuf *m, *n;
+	struct mbuf *m, *n, *mlast;
 	int space;
 
 	if (control == 0)
@@ -666,15 +769,19 @@
 	if (space > sbspace(sb))
 		return (0);
 	n->m_next = m0;			/* concatenate data to control */
-	for (m = control; m; m = m->m_next)
+
+	SBLASTRECORDCHK(sb);
+
+	for (m = control; m->m_next; m = m->m_next)
 		sballoc(sb, m);
-	n = sb->sb_mb;
-	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
-		n->m_nextpkt = control;
-	} else
-		sb->sb_mb = control;
+	sballoc(sb, m);
+	mlast = m;
+	SBLINKRECORD(sb, control);
+
+	sb->sb_mbtail = mlast;
+	SBLASTMBUFCHK(sb);
+
+	SBLASTRECORDCHK(sb);
 	return (1);
 }
 
@@ -697,6 +804,8 @@
 		    (eor == 0 ||
 		     (((o = m->m_next) || (o = n)) &&
 		      o->m_type == m->m_type))) {
+			if (sb->sb_lastrecord == m)
+				sb->sb_lastrecord = m->m_next;
 			m = m_free(m);
 			continue;
 		}
@@ -720,6 +829,7 @@
 			n->m_next = m;
 		else
 			sb->sb_mb = m;
+		sb->sb_mbtail = m;
 		sballoc(sb, m);
 		n = m;
 		m->m_flags &= ~M_EOR;
@@ -732,6 +842,7 @@
 		else
 			printf("semi-panic: sbcompress\n");
 	}
+	SBLASTMBUFCHK(sb);
 }
 
 /*
@@ -800,6 +911,18 @@
 		m->m_nextpkt = next;
 	} else
 		sb->sb_mb = next;
+	/*
+	 * First part is an inline SB_EMPTY_FIXUP().  Second part
+	 * makes sure sb_lastrecord is up-to-date if we dropped
+	 * part of the last record.
+	 */
+	m = sb->sb_mb;
+	if (m == NULL) {
+		sb->sb_mbtail = NULL;
+		sb->sb_lastrecord = NULL;
+	} else if (m->m_nextpkt == NULL) {
+		sb->sb_lastrecord = m;
+	}
 }
 
 /*
@@ -820,6 +943,7 @@
 			m = m_free(m);
 		} while (m);
 	}
+	SB_EMPTY_FIXUP(sb);
 }
 
 /*

==== //depot/projects/netperf/sys/netinet/tcp_input.c#5 (text+ko) ====

@@ -296,7 +296,7 @@
 		if (so->so_state & SS_CANTRCVMORE)
 			m_freem(q->tqe_m);
 		else
-			sbappend(&so->so_rcv, q->tqe_m);
+			sbappendstream(&so->so_rcv, q->tqe_m);
 		FREE(q, M_TSEGQ);
 		q = nq;
 	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
@@ -1124,7 +1124,7 @@
 				m_freem(m);
 			} else {
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
-				sbappend(&so->so_rcv, m);
+				sbappendstream(&so->so_rcv, m);
 			}
 			sorwakeup(so);
 			if (DELAY_ACK(tp)) {
@@ -2178,7 +2178,7 @@
 			if (so->so_state & SS_CANTRCVMORE)
 				m_freem(m);
 			else
-				sbappend(&so->so_rcv, m);
+				sbappendstream(&so->so_rcv, m);
 			sorwakeup(so);
 		} else {
 			thflags = tcp_reass(tp, th, &tlen, m);

==== //depot/projects/netperf/sys/netinet/tcp_usrreq.c#2 (text+ko) ====

@@ -685,7 +685,7 @@
 		m_freem(control);	/* empty control, just free it */
 	}
 	if (!(flags & PRUS_OOB)) {
-		sbappend(&so->so_snd, m);
+		sbappendstream(&so->so_snd, m);
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected,
@@ -734,7 +734,7 @@
 		 * of data past the urgent section.
 		 * Otherwise, snd_up should be one lower.
 		 */
-		sbappend(&so->so_snd, m);
+		sbappendstream(&so->so_snd, m);
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected,

==== //depot/projects/netperf/sys/sys/socketvar.h#3 (text+ko) ====

@@ -102,6 +102,9 @@
 		struct	selinfo sb_sel;	/* process selecting read/write */
 #define	sb_startzero	sb_mb
 		struct	mbuf *sb_mb;	/* the mbuf chain */
+		struct	mbuf *sb_mbtail; /* the last mbuf in the chain */
+		struct	mbuf *sb_lastrecord;	/* first mbuf of last record in
+						 * socket buffer */
 		u_int	sb_cc;		/* actual chars in buffer */
 		u_int	sb_hiwat;	/* max actual char count */
 		u_int	sb_mbcnt;	/* chars of mbufs used */
@@ -137,6 +140,13 @@
 	} *so_accf;
 };
 
+#define SB_EMPTY_FIXUP(sb) do {						\
+	if ((sb)->sb_mb == NULL) {					\
+		(sb)->sb_mbtail = NULL;					\
+		(sb)->sb_lastrecord = NULL;				\
+	}								\
+} while (/*CONSTCOND*/0)
+
 /*
  * Socket state bits.
  */
@@ -353,6 +363,7 @@
 int	sockargs(struct mbuf **mp, caddr_t buf, int buflen, int type);
 int	getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len);
 void	sbappend(struct sockbuf *sb, struct mbuf *m);
+void	sbappendstream(struct sockbuf *sb, struct mbuf *m);
 int	sbappendaddr(struct sockbuf *sb, struct sockaddr *asa,
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
@@ -422,6 +433,17 @@
 void	sotoxsocket(struct socket *so, struct xsocket *xso);
 void	sowakeup(struct socket *so, struct sockbuf *sb);
 
+#ifdef SOCKBUF_DEBUG
+void	sblastrecordchk(struct sockbuf *, const char *, int);
+#define	SBLASTRECORDCHK(sb)	sblastrecordchk((sb), __FILE__, __LINE__)
+
+void	sblastmbufchk(struct sockbuf *, const char *, int);
+#define	SBLASTMBUFCHK(sb)	sblastmbufchk((sb), __FILE__, __LINE__)
+#else
+#define	SBLASTRECORDCHK(sb)      /* nothing */
+#define	SBLASTMBUFCHK(sb)        /* nothing */
+#endif /* SOCKBUF_DEBUG */
+
 /*
  * Accept filter functions (duh).
  */


More information about the p4-projects mailing list