unp gc vs socket close/shutdown race

Mateusz Guzik mjguzik at gmail.com
Fri Jul 10 00:17:46 UTC 2015


On Wed, Jul 08, 2015 at 02:15:38AM +0200, Mateusz Guzik wrote:
> First off note the patch below is a total hack with the easiest solution
> possible so that it can be MFCed for 10.2.
> 
> The issue:
> 
> Closing the socket involves:
> 
> if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
> 	(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
> if (pr->pr_usrreqs->pru_detach != NULL)
> 	(*pr->pr_usrreqs->pru_detach)(so);
> 
> unp_dispose which gets rid of file descriptors stored in mbufs attached
> to the socket. It leaves the mbuf chain in place.
> 
> uipc_detach actually unlinks the socket from global unp list.
> 
> In particular, this means there is a socket with unusable mbufs visible
> to unp garbage collector (unp_gc). Also there is no synchronisation of
> any form performed here, so it can inspect mbufs as fds are getting
> freed (or afterwards) leading to panics. Note that uipc_detach waits for
> unp_gc to finish due to UNP_LIST_LOCK, it's the dispose func which
> causes trouble.
> 
> Given that stuff should not be accessed after unp_dispose, and the
> socket is about to die I figured it would be best to mark the socket so
> that unp_gc can ignore it.
> 
> Note that unp_dispose only gets a pointer to mbuf. I have not found any
> way to obtain a socket from this, which in turn results in the hack
> below.
> 
> I added a new func - unp_dispose2, which is not a part of struct domain.
> dom_dispose consumers check for PR_DISPOSE2 flag and call the function
> passing the socket as an argument.
> 
> unp_dispose2(struct socket *so)
> {
>           struct unpcb *unp;
>   
>           unp = sotounpcb(so);
>           UNP_LIST_LOCK();
>           unp->unp_gcflag |= UNPGC_IGNORE;
>           UNP_LIST_UNLOCK();
>           unp_dispose(so->so_rcv.sb_mb);
> }
> 
> The UNP_LIST_LOCK + UNLOCK synchronizes against unp_gc  - either it sees the
> flag and ignores the socket, or gets to inspect it and unp_dispose2 waits for
> it to finish. AFAICT it is completely harmless to proceed with freeing after
> unp_gc had a look.
> 
> There is a similar problem with shutdown(), but the race has a smaller window
> due to it clearing mbufs just after dispose call.
> 
> In general, it feels like something else is also broken, but I don't see what.
> 
> The issue can be reproduced by running this program in a loop:
> https://people.freebsd.org/~mjg/reproducers/unp-gc-panic.c
> 
> With the patch below the issue seems to be fixed:
> 


Deuglified the patch a little. PR_DISPOSE2 flag is introduced. When set
on a protocol, unp_dispose2 is called insted of dom_dispose.

Also fixed a copy-pasto in sorflush case. It used to partially construct
a sockbuf and clear stuff in the original. It is changed to partialy
construct the socket instead, as that's what we obtain in unp_dispose2.
sockbuf is reconstructed to the same extent, socket also gets pcb so
that unp_dispose2 can botain relevant unp pointer.

diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index a431b4b..e78c81d 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -804,8 +804,13 @@ sofree(struct socket *so)
 	ACCEPT_UNLOCK();
 
 	VNET_SO_ASSERT(so);
-	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
-		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+	if (pr->pr_flags & PR_RIGHTS) {
+		if (pr->pr_flags & PR_DISPOSE2) {
+			unp_dispose2(so);
+		} else if (pr->pr_domain->dom_dispose != NULL) {
+			(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+		}
+	}
 	if (pr->pr_usrreqs->pru_detach != NULL)
 		(*pr->pr_usrreqs->pru_detach)(so);
 
@@ -2356,7 +2361,7 @@ sorflush(struct socket *so)
 {
 	struct sockbuf *sb = &so->so_rcv;
 	struct protosw *pr = so->so_proto;
-	struct sockbuf asb;
+	struct socket aso;
 
 	VNET_SO_ASSERT(so);
 
@@ -2381,8 +2386,9 @@ sorflush(struct socket *so)
 	 * and mutex data unchanged.
 	 */
 	SOCKBUF_LOCK(sb);
-	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
-	bcopy(&sb->sb_startzero, &asb.sb_startzero,
+	bzero(&aso, sizeof(aso));
+	aso.so_pcb = so->so_pcb;
+	bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 	bzero(&sb->sb_startzero,
 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
@@ -2393,9 +2399,14 @@ sorflush(struct socket *so)
 	 * Dispose of special rights and flush the socket buffer.  Don't call
 	 * any unsafe routines (that rely on locks being initialized) on asb.
 	 */
-	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
-		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
-	sbrelease_internal(&asb, so);
+	if (pr->pr_flags & PR_RIGHTS) {
+		if (pr->pr_flags & PR_DISPOSE2) {
+			unp_dispose2(&aso);
+		} else if (pr->pr_domain->dom_dispose != NULL) {
+			(*pr->pr_domain->dom_dispose)(aso.so_rcv.sb_mb);
+		}
+	}
+	sbrelease_internal(&aso.so_rcv, so);
 }
 
 /*
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index acf9fe9..98ef6dc 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -302,14 +302,15 @@ static struct protosw localsw[] = {
 {
 	.pr_type =		SOCK_STREAM,
 	.pr_domain =		&localdomain,
-	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS|
+				    PR_DISPOSE2,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_stream
 },
 {
 	.pr_type =		SOCK_DGRAM,
 	.pr_domain =		&localdomain,
-	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS|PR_DISPOSE2,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_dgram
 },
@@ -323,7 +324,7 @@ static struct protosw localsw[] = {
 	 * that supports both atomic record writes and control data.
 	 */
 	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
-				    PR_RIGHTS,
+				    PR_RIGHTS|PR_DISPOSE2,
 	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
 },
@@ -2193,8 +2194,7 @@ unp_gc_process(struct unpcb *unp)
 	struct socket *so;
 	struct file *fp;
 
-	/* Already processed. */
-	if (unp->unp_gcflag & UNPGC_SCANNED)
+	if (unp->unp_gcflag & (UNPGC_SCANNED | UNPGC_IGNORE))
 		return;
 	fp = unp->unp_file;
 
@@ -2252,11 +2252,11 @@ unp_gc(__unused void *arg, int pending)
 	unp_taskcount++;
 	UNP_LIST_LOCK();
 	/*
-	 * First clear all gc flags from previous runs.
+	 * First clear all gc flags from previous runs, apart from UNPGC_IGNORE.
 	 */
 	for (head = heads; *head != NULL; head++)
 		LIST_FOREACH(unp, *head, unp_link)
-			unp->unp_gcflag = 0;
+			unp->unp_gcflag = unp->unp_gcflag & UNPGC_IGNORE;
 
 	/*
 	 * Scan marking all reachable sockets with UNPGC_REF.  Once a socket
@@ -2333,6 +2333,24 @@ unp_dispose(struct mbuf *m)
 		unp_scan(m, unp_freerights);
 }
 
+/*
+ * XXX A hack working around a difenciency in domain API.
+ * dom_dispose handler does not get the socket it is supposed to operate on,
+ * which makes it very problematic to synchronize against unp_gc, which in turn
+ * can trip over data as we are freeing it.
+ */
+void
+unp_dispose2(struct socket *so)
+{
+	struct unpcb *unp;
+
+	unp = sotounpcb(so);
+	UNP_LIST_LOCK();
+	unp->unp_gcflag |= UNPGC_IGNORE;
+	UNP_LIST_UNLOCK();
+	unp_dispose(so->so_rcv.sb_mb);
+}
+
 static void
 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
 {
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h
index 55db0e3..af45532 100644
--- a/sys/sys/protosw.h
+++ b/sys/sys/protosw.h
@@ -120,6 +120,7 @@ struct protosw {
 #define	PR_RIGHTS	0x10		/* passes capabilities */
 #define PR_IMPLOPCL	0x20		/* implied open/close */
 #define	PR_LASTHDR	0x40		/* enforce ipsec policy; last header */
+#define	PR_DISPOSE2	0x80		/* call unp_dispose2 */
 
 /*
  * In earlier BSD network stacks, a single pr_usrreq() function pointer was
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 18e2de1..e927cdc 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -666,6 +666,8 @@ void so_unlock(struct socket *so);
 
 void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg);
 
+void unp_dispose2(struct socket *so);
+
 #endif
 
 
diff --git a/sys/sys/unpcb.h b/sys/sys/unpcb.h
index ba63f30..ead9f0a 100644
--- a/sys/sys/unpcb.h
+++ b/sys/sys/unpcb.h
@@ -106,6 +106,7 @@ struct unpcb {
 #define	UNPGC_REF			0x1	/* unpcb has external ref. */
 #define	UNPGC_DEAD			0x2	/* unpcb might be dead. */
 #define	UNPGC_SCANNED			0x4	/* Has been scanned. */
+#define	UNPGC_IGNORE			0x4	/* Someone will clear it. */
 
 /*
  * These flags are used to handle non-atomicity in connect() and bind()

-- 
Mateusz Guzik <mjguzik gmail.com>


More information about the freebsd-current mailing list