kern/181741: [PATCH] Packet loss when 'control' messages are present with large data (sendmsg(2))

Yuri yuri at rawbw.com
Mon Sep 2 04:50:00 UTC 2013


>Number:         181741
>Category:       kern
>Synopsis:       [PATCH] Packet loss when 'control' messages are present with large data (sendmsg(2))
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Mon Sep 02 04:50:00 UTC 2013
>Closed-Date:
>Last-Modified:
>Originator:     Yuri
>Release:        10 and 9.1
>Organization:
n/a
>Environment:
>Description:
There is the case when sendmsg(2) silently loses packets for AF_LOCAL domain when large packets with control part in them are sent.

* Problem Description
There is the watermark limit on sockbuf determined by net.local.stream.sendspace, default is 8192 bytes (field sockbuf.sb_hiwat).
When sendmsg(2) sends large enough data (8K+ that hits this 8192 limit) with control message, sosend_generic will be cutting the message data into separate mbufs based on 'sbspace' (derived from the above-mentioned sb_hiwat limit) with adjustment for control message size as it sees it. This way it tries to make sure this sb_hiwat limit is enforced.

However, down on uipc level control message is being further modified in two ways: unp_internalize modifies it into some 'internal' form, also unp_addsockcred function adds another control message when LOCAL_CREDS are requested by client. Both functions only increase control message size beyond its original size (seen by sosend_generic). So that the first final mbuf sent (concatenation of control and data) will always be larger than 'sbspace' limit that sosend_generic was cutting data for.

There is also the function sbappendcontrol_locked. It checks the 'sbspace' limit again, and discards the packet when sbspace llimit is exceeded. Its result code is essentially ignored in uipc_send. I believe, sbappendcontrol_locked shouldn't be checking space at all, since packets are expected to be properly sized to begin with. But this won't be the right fix, since sizes would be exceeding the sbspace limit anyway.

sosend_default is one level up over uipc level, and it doesn't know what uipc will do with control message. Therefore it can't know what the real adjustment for control message is needed (to properly cut data). It wrongly takes the original control size and this makes the first packet too large and discarded by sbappendcontrol_locked.

* Patch synopsys
- Added the new function into struct pr_usrreqs:
int     (*pru_finalizecontrol)(struct socket *so, int flags, struct mbuf **pcontrol);
It is called by sosend_generic to update control message to its final form.

- Removed 'sbspace' check from sbappendcontrol_locked. The only context it is called from is uipc_send, and all packet sizes are already conforming.

- Fixed few wrong error codes relevant to this situation

>How-To-Repeat:

>Fix:


Patch attached with submission follows:

Index: kern/uipc_sockbuf.c
===================================================================
--- kern/uipc_sockbuf.c	(revision 255139)
+++ kern/uipc_sockbuf.c	(working copy)
@@ -688,23 +688,16 @@
 	return (retval);
 }
 
-int
+void
 sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
     struct mbuf *control)
 {
-	struct mbuf *m, *n, *mlast;
-	int space;
+	struct mbuf *m, *mlast;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
-	if (control == 0)
-		panic("sbappendcontrol_locked");
-	space = m_length(control, &n) + m_length(m0, NULL);
+	m_last(control)->m_next = m0;		/* concatenate data to control */
 
-	if (space > sbspace(sb))
-		return (0);
-	n->m_next = m0;			/* concatenate data to control */
-
 	SBLASTRECORDCHK(sb);
 
 	for (m = control; m->m_next; m = m->m_next)
@@ -717,18 +710,14 @@
 	SBLASTMBUFCHK(sb);
 
 	SBLASTRECORDCHK(sb);
-	return (1);
 }
 
-int
+void
 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
 {
-	int retval;
-
 	SOCKBUF_LOCK(sb);
-	retval = sbappendcontrol_locked(sb, m0, control);
+	sbappendcontrol_locked(sb, m0, control);
 	SOCKBUF_UNLOCK(sb);
-	return (retval);
 }
 
 /*
Index: kern/uipc_socket.c
===================================================================
--- kern/uipc_socket.c	(revision 255139)
+++ kern/uipc_socket.c	(working copy)
@@ -1102,6 +1102,11 @@
 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
 	    ("sosend_dgram: !PR_ATOMIC"));
 
+	if (so->so_proto->pr_usrreqs->pru_finalizecontrol &&
+	    (error = (*so->so_proto->pr_usrreqs->pru_finalizecontrol)(so,
+	        flags, &control, td)))
+		goto out;
+
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else
@@ -1168,6 +1173,10 @@
 	space = sbspace(&so->so_snd);
 	if (flags & MSG_OOB)
 		space += 1024;
+	if (clen > space) {
+		error = EMSGSIZE;
+		goto out;
+	}
 	space -= clen;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	if (resid > space) {
@@ -1269,6 +1278,11 @@
 	int clen = 0, error, dontroute;
 	int atomic = sosendallatonce(so) || top;
 
+	if (so->so_proto->pr_usrreqs->pru_finalizecontrol &&
+	    (error = (*so->so_proto->pr_usrreqs->pru_finalizecontrol)(so,
+	        flags, &control, td)))
+		goto out;
+
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else
@@ -1361,6 +1375,10 @@
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
+		if (clen > space) {
+			error = EMSGSIZE;
+			goto release;
+		}
 		space -= clen;
 		do {
 			if (uio == NULL) {
@@ -1426,8 +1444,11 @@
 				so->so_options &= ~SO_DONTROUTE;
 				SOCK_UNLOCK(so);
 			}
-			clen = 0;
-			control = NULL;
+			if (control) {
+				control = NULL;
+				space += clen;
+				clen = 0;
+			}
 			top = NULL;
 			if (error)
 				goto release;
Index: kern/uipc_usrreq.c
===================================================================
--- kern/uipc_usrreq.c	(revision 255139)
+++ kern/uipc_usrreq.c	(working copy)
@@ -290,7 +290,7 @@
 static void	unp_internalize_fp(struct file *);
 static int	unp_externalize(struct mbuf *, struct mbuf **, int);
 static int	unp_externalize_fp(struct file *);
-static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
+static int	unp_addsockcred(struct mbuf **, struct thread *);
 static void	unp_process_defers(void * __unused, int);
 
 /*
@@ -782,6 +782,47 @@
 }
 
 static int
+uipc_finalizecontrol(struct socket *so, int flags, struct mbuf **pcontrol,
+    struct thread *td)
+{
+	struct unpcb *unp, *unp2;
+	int error = 0;
+
+	unp = sotounpcb(so);
+	KASSERT(unp != NULL, ("uipc_finalizecontrol: unp == NULL"));
+
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+
+	if (*pcontrol != NULL && (error = unp_internalize(pcontrol, td))) {
+		UNP_PCB_UNLOCK(unp);
+		return (error);
+	}
+
+	/* Lockless read. */
+	if (unp2->unp_flags & UNP_WANTCRED) {
+		switch (so->so_type) {
+		case SOCK_SEQPACKET:
+		case SOCK_STREAM:
+			/* Credentials are passed only once on SOCK_STREAM. */
+			UNP_PCB_LOCK(unp2);
+			if (unp2->unp_flags & UNP_WANTCRED) {
+				unp2->unp_flags &= ~UNP_WANTCRED;
+				error = unp_addsockcred(pcontrol, td);
+			}
+			UNP_PCB_UNLOCK(unp2);
+			break;
+		case SOCK_DGRAM:
+			error = unp_addsockcred(pcontrol, td);
+			break;
+		}
+	}
+
+	UNP_PCB_UNLOCK(unp);
+	return (error);
+}
+
+static int
 uipc_rcvd(struct socket *so, int flags)
 {
 	struct unpcb *unp, *unp2;
@@ -845,8 +886,6 @@
 		error = EOPNOTSUPP;
 		goto release;
 	}
-	if (control != NULL && (error = unp_internalize(&control, td)))
-		goto release;
 	if ((nam != NULL) || (flags & PRUS_EOF))
 		UNP_LINK_WLOCK();
 	else
@@ -880,9 +919,6 @@
 			error = ENOTCONN;
 			break;
 		}
-		/* Lockless read. */
-		if (unp2->unp_flags & UNP_WANTCRED)
-			control = unp_addsockcred(td, control);
 		UNP_PCB_LOCK(unp);
 		if (unp->unp_addr != NULL)
 			from = (struct sockaddr *)unp->unp_addr;
@@ -949,14 +985,6 @@
 		so2 = unp2->unp_socket;
 		UNP_PCB_LOCK(unp2);
 		SOCKBUF_LOCK(&so2->so_rcv);
-		if (unp2->unp_flags & UNP_WANTCRED) {
-			/*
-			 * Credentials are passed only once on SOCK_STREAM
-			 * and SOCK_SEQPACKET.
-			 */
-			unp2->unp_flags &= ~UNP_WANTCRED;
-			control = unp_addsockcred(td, control);
-		}
 		/*
 		 * Send to paired receive port, and then reduce send buffer
 		 * hiwater marks to maintain backpressure.  Wake up readers.
@@ -964,9 +992,8 @@
 		switch (so->so_type) {
 		case SOCK_STREAM:
 			if (control != NULL) {
-				if (sbappendcontrol_locked(&so2->so_rcv, m,
-				    control))
-					control = NULL;
+				sbappendcontrol_locked(&so2->so_rcv, m, control);
+				control = NULL;
 			} else
 				sbappend_locked(&so2->so_rcv, m);
 			break;
@@ -1114,6 +1141,7 @@
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
+	.pru_finalizecontrol =	uipc_finalizecontrol,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_sense =		uipc_sense,
@@ -1136,6 +1164,7 @@
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
+	.pru_finalizecontrol =	uipc_finalizecontrol,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_sense =		uipc_sense,
@@ -1158,6 +1187,7 @@
 	.pru_disconnect =	uipc_disconnect,
 	.pru_listen =		uipc_listen,
 	.pru_peeraddr =		uipc_peeraddr,
+	.pru_finalizecontrol =	uipc_finalizecontrol,
 	.pru_rcvd =		uipc_rcvd,
 	.pru_send =		uipc_send,
 	.pru_sense =		uipc_sense,
@@ -1747,7 +1777,7 @@
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				FILEDESC_XUNLOCK(fdesc);
-				error = E2BIG;
+				error = ENOBUFS;
 				unp_freerights(fdep, newfds);
 				goto next;
 			}
@@ -1928,7 +1958,7 @@
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
 				FILEDESC_SUNLOCK(fdesc);
-				error = E2BIG;
+				error = ENOBUFS;
 				goto out;
 			}
 			fdp = data;
@@ -1992,9 +2022,10 @@
 	return (error);
 }
 
-static struct mbuf *
-unp_addsockcred(struct thread *td, struct mbuf *control)
+static int
+unp_addsockcred(struct mbuf **pcontrol, struct thread *td)
 {
+	struct mbuf *control = *pcontrol;
 	struct mbuf *m, *n, *n_prev;
 	struct sockcred *sc;
 	const struct cmsghdr *cm;
@@ -2004,7 +2035,7 @@
 	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
 	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
 	if (m == NULL)
-		return (control);
+		return (ENOBUFS);
 
 	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
 	sc->sc_uid = td->td_ucred->cr_ruid;
@@ -2038,7 +2069,8 @@
 
 	/* Prepend it to the head. */
 	m->m_next = control;
-	return (m);
+	*pcontrol = m;
+	return (0);
 }
 
 static struct unpcb *
Index: sys/protosw.h
===================================================================
--- sys/protosw.h	(revision 255139)
+++ sys/protosw.h	(working copy)
@@ -201,6 +201,8 @@
 	int	(*pru_listen)(struct socket *so, int backlog,
 		    struct thread *td);
 	int	(*pru_peeraddr)(struct socket *so, struct sockaddr **nam);
+	int	(*pru_finalizecontrol)(struct socket *so, int flags, struct mbuf **pcontrol,
+		    struct thread *td);
 	int	(*pru_rcvd)(struct socket *so, int flags);
 	int	(*pru_rcvoob)(struct socket *so, struct mbuf *m, int flags);
 	int	(*pru_send)(struct socket *so, int flags, struct mbuf *m,
Index: sys/sockbuf.h
===================================================================
--- sys/sockbuf.h	(revision 255139)
+++ sys/sockbuf.h	(working copy)
@@ -127,9 +127,9 @@
 	    struct mbuf *m0, struct mbuf *control);
 int	sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
 	    struct mbuf *m0, struct mbuf *control);
-int	sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
+void	sbappendcontrol(struct sockbuf *sb, struct mbuf *m0,
 	    struct mbuf *control);
-int	sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
+void	sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
 	    struct mbuf *control);
 void	sbappendrecord(struct sockbuf *sb, struct mbuf *m0);
 void	sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0);


>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list