svn commit: r333744 - in head/sys: kern sys

Matt Macy mmacy at FreeBSD.org
Thu May 17 17:59:36 UTC 2018


Author: mmacy
Date: Thu May 17 17:59:35 2018
New Revision: 333744
URL: https://svnweb.freebsd.org/changeset/base/333744

Log:
  AF_UNIX: make unix socket locking finer grained
  
  This change moves to using a reference count across lock drop / reacquire
  to guarantee liveness.
  
  Currently sends on unix sockets contend heavily on read locking the list lock.
  unix1_processes in will-it-scale peaks at 6 processes and then declines.
  
  With this change I get a substantial improvement in number of operations per second
  with 96 processes:
  
  x before
  + after
      N           Min           Max        Median           Avg        Stddev
  x  11       1688420       1696389       1693578     1692766.3     2971.1702
  +  10      63417955      71030114      70662504      69576423     2374684.6
  Difference at 95.0% confidence
          6.78837e+07 +/- 1.49463e+06
          4010.22% +/- 88.4246%
          (Student's t, pooled s = 1.63437e+06)
  
  And even for 2 processes shows a ~18% improvement.
  "Small" iron changes (1, 2, and 4 processes):
  
  x before1
  + after1.2
  +------------------------------------------------------------------------+
  |                                                                  +     |
  |                                                           x      +     |
  |                                                           x      +     |
  |                                                           x      +     |
  |                                                           x     ++     |
  |                                                          xx     ++     |
  |x                                                       x xx     ++     |
  |                                  |__________________A_____M_____AM____||
  +------------------------------------------------------------------------+
      N           Min           Max        Median           Avg        Stddev
  x  10       1131648       1197750     1197138.5     1190369.3     20651.839
  +  10       1203840       1205056       1204919     1204827.9     353.27404
  Difference at 95.0% confidence
          14458.6 +/- 13723
          1.21463% +/- 1.16683%
          (Student's t, pooled s = 14605.2)
  
  x before2
  + after2.2
  +------------------------------------------------------------------------+
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |                                                                       +|
  |           x                                                           +|
  |           x                                                           +|
  |         x xx                                                          +|
  |x        xxxx                                                          +|
  |      |___AM_|                                                         A|
  +------------------------------------------------------------------------+
      N           Min           Max        Median           Avg        Stddev
  x  10       1972843       2045866     2038186.5     2030443.8     21367.694
  +  10       2400853       2402196     2401043.5     2401172.7     385.40024
  Difference at 95.0% confidence
          370729 +/- 14198.9
          18.2585% +/- 0.826943%
          (Student's t, pooled s = 15111.7)
  
  x before4
  + after4.2
      N           Min           Max        Median           Avg        Stddev
  x  10       3986994       3991728     3990137.5     3989985.2     1300.0164
  +  10       4799990       4806664     4806116.5       4805194     1990.6625
  Difference at 95.0% confidence
          815209 +/- 1579.64
          20.4314% +/- 0.0421713%
          (Student's t, pooled s = 1681.19)
  
  Tested by: pho
  Reported by:	mjg
  Approved by:	sbruno
  Sponsored by:	Limelight Networks
  Differential Revision:	https://reviews.freebsd.org/D15430

Modified:
  head/sys/kern/uipc_usrreq.c
  head/sys/sys/unpcb.h

Modified: head/sys/kern/uipc_usrreq.c
==============================================================================
--- head/sys/kern/uipc_usrreq.c	Thu May 17 17:57:41 2018	(r333743)
+++ head/sys/kern/uipc_usrreq.c	Thu May 17 17:59:35 2018	(r333744)
@@ -191,13 +191,41 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
 /*
  * Locking and synchronization:
  *
- * Two types of locks exist in the local domain socket implementation: a
- * a global linkage rwlock and per-unpcb mutexes.  The linkage lock protects
- * the socket count, global generation number, stream/datagram global lists and
- * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
- * held exclusively over the acquisition of multiple unpcb locks to prevent
- * deadlock.
+ * Three types of locks exist in the local domain socket implementation: a
+ * a global linkage rwlock, the mtxpool lock, and per-unpcb mutexes.
+ * The linkage lock protects the socket count, global generation number,
+ * and stream/datagram global lists.
  *
+ * The mtxpool lock protects the vnode from being modified while referenced.
+ * Lock ordering requires that it be acquired before any unpcb locks.
+ *
+ * The unpcb lock (unp_mtx) protects all fields in the unpcb. Of particular
+ * note is that this includes the unp_conn field. So long as the unpcb lock
+ * is held the reference to the unpcb pointed to by unp_conn is valid. If we
+ * require that the unpcb pointed to by unp_conn remain live in cases where
+ * we need to drop the unp_mtx as when we need to acquire the lock for a
+ * second unpcb the caller must first acquire an additional reference on the
+ * second unpcb and then revalidate any state (typically check that unp_conn
+ * is non-NULL) upon requiring the initial unpcb lock. The lock ordering
+ * between unpcbs is the conventional ascending address order. Two helper
+ * routines exist for this:
+ *
+ *   - unp_pcb_lock2(unp, unp2) - which just acquires the two locks in the
+ *     safe ordering.
+ *
+ *   - unp_pcb_owned_lock2(unp, unp2, freed) - the lock for unp is held
+ *     when called. If unp is unlocked and unp2 is subsequently freed
+ *     freed will be set to 1.
+ *
+ * The helper routines for references are:
+ *
+ *   - unp_pcb_hold(unp): Can be called any time we currently hold a valid
+ *     reference to unp.
+ *
+ *    - unp_pcb_rele(unp): The caller must hold the unp lock. If we are
+ *      releasing the last reference, detach must have been called thus
+ *      unp->unp_socket be NULL.
+ *
  * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
  * allocated in pru_attach() and freed in pru_detach().  The validity of that
  * pointer is an invariant, so no lock is required to dereference the so_pcb
@@ -210,16 +238,9 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
  * to the unpcb is held.  Typically, this reference will be from the socket,
  * or from another unpcb when the referring unpcb's lock is held (in order
  * that the reference not be invalidated during use).  For example, to follow
- * unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn,
- * as unp_socket remains valid as long as the reference to unp_conn is valid.
+ * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
+ * that detach is not run clearing unp_socket.
  *
- * Fields of unpcbss are locked using a per-unpcb lock, unp_mtx.  Individual
- * atomic reads without the lock may be performed "lockless", but more
- * complex reads and read-modify-writes require the mutex to be held.  No
- * lock order is defined between unpcb locks -- multiple unpcb locks may be
- * acquired at the same time only when holding the linkage rwlock
- * exclusively, which prevents deadlocks.
- *
  * Blocking with UNIX domain sockets is a tricky issue: unlike most network
  * protocols, bind() is a non-atomic operation, and connect() requires
  * potential sleeping in the protocol, due to potentially waiting on local or
@@ -257,13 +278,19 @@ static struct mtx	unp_defers_lock;
 #define	UNP_DEFERRED_LOCK()		mtx_lock(&unp_defers_lock)
 #define	UNP_DEFERRED_UNLOCK()		mtx_unlock(&unp_defers_lock)
 
+#define UNP_REF_LIST_LOCK()		UNP_DEFERRED_LOCK();
+#define UNP_REF_LIST_UNLOCK()		UNP_DEFERRED_UNLOCK();
+
 #define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
 					    "unp_mtx", "unp_mtx",	\
-					    MTX_DUPOK|MTX_DEF|MTX_RECURSE)
+					    MTX_DUPOK|MTX_DEF)
 #define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
+#define	UNP_PCB_TRYLOCK(unp)		mtx_trylock(&(unp)->unp_mtx)
 #define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
+#define	UNP_PCB_OWNED(unp)		mtx_owned(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
+#define	UNP_PCB_UNLOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
 
 static int	uipc_connect2(struct socket *, struct socket *);
 static int	uipc_ctloutput(struct socket *, struct sockopt *);
@@ -289,6 +316,75 @@ static int	unp_externalize_fp(struct file *);
 static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
 static void	unp_process_defers(void * __unused, int);
 
+
+static void
+unp_pcb_hold(struct unpcb *unp)
+{
+	MPASS(unp->unp_refcount);
+	refcount_acquire(&unp->unp_refcount);
+}
+
+static int
+unp_pcb_rele(struct unpcb *unp)
+{
+	int freed;
+
+	UNP_PCB_LOCK_ASSERT(unp);
+	MPASS(unp->unp_refcount);
+	if ((freed = refcount_release(&unp->unp_refcount))) {
+		/* we got here with having detached? */
+		MPASS(unp->unp_socket == NULL);
+		UNP_PCB_UNLOCK(unp);
+		UNP_PCB_LOCK_DESTROY(unp);
+		uma_zfree(unp_zone, unp);
+	}
+	return (freed);
+}
+
+static void
+unp_pcb_lock2(struct unpcb *unp, struct unpcb *unp2)
+{
+	UNP_PCB_UNLOCK_ASSERT(unp);
+	UNP_PCB_UNLOCK_ASSERT(unp2);
+	if ((uintptr_t)unp2 > (uintptr_t)unp) {
+		UNP_PCB_LOCK(unp);
+		UNP_PCB_LOCK(unp2);
+	} else {
+		UNP_PCB_LOCK(unp2);
+		UNP_PCB_LOCK(unp);
+	}
+}
+
+static __noinline void
+unp_pcb_owned_lock2_slowpath(struct unpcb *unp, struct unpcb **unp2p, int *freed)
+
+{
+	struct unpcb *unp2;
+
+	unp2 = *unp2p;
+	unp_pcb_hold((unp2));
+	UNP_PCB_UNLOCK((unp));
+	UNP_PCB_LOCK((unp2));
+	UNP_PCB_LOCK((unp));
+	*freed = unp_pcb_rele((unp2));
+	if (*freed)
+		*unp2p = NULL;
+}
+
+#define unp_pcb_owned_lock2(unp, unp2, freed) do {					\
+		freed = 0;													\
+		UNP_PCB_LOCK_ASSERT((unp));									\
+		UNP_PCB_UNLOCK_ASSERT((unp2));								\
+		if (__predict_true(UNP_PCB_TRYLOCK((unp2))))				\
+			break;													\
+		else if ((uintptr_t)(unp2) > (uintptr_t)(unp))				\
+			UNP_PCB_LOCK((unp2));									\
+		else {														\
+			unp_pcb_owned_lock2_slowpath((unp), &(unp2), &freed);	\
+		}															\
+} while (0)
+
+
 /*
  * Definitions of protocols supported in the LOCAL domain.
  */
@@ -344,17 +440,16 @@ uipc_abort(struct socket *so)
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
+	UNP_PCB_UNLOCK_ASSERT(unp);
 
-	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
-		UNP_PCB_LOCK(unp2);
+		unp_pcb_hold(unp2);
+		UNP_PCB_UNLOCK(unp);
 		unp_drop(unp2);
-		UNP_PCB_UNLOCK(unp2);
-	}
-	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_WUNLOCK();
+	} else
+		UNP_PCB_UNLOCK(unp);
 }
 
 static int
@@ -551,14 +646,12 @@ restart:
 	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
 	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
 
-	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	VOP_UNP_BIND(vp, unp);
 	unp->unp_vnode = vp;
 	unp->unp_addr = soun;
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_WUNLOCK();
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	free(buf, M_TEMP);
@@ -585,9 +678,7 @@ uipc_connect(struct socket *so, struct sockaddr *nam, 
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
-	UNP_LINK_WLOCK();
 	error = unp_connect(so, nam, td);
-	UNP_LINK_WUNLOCK();
 	return (error);
 }
 
@@ -598,9 +689,7 @@ uipc_connectat(int fd, struct socket *so, struct socka
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
-	UNP_LINK_WLOCK();
 	error = unp_connectat(fd, so, nam, td);
-	UNP_LINK_WUNLOCK();
 	return (error);
 }
 
@@ -609,26 +698,41 @@ uipc_close(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 	struct vnode *vp = NULL;
-
+	struct mtx *vplock;
+	int freed;
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
 
-	UNP_LINK_WLOCK();
+
+	vplock = NULL;
+	if ((vp = unp->unp_vnode) != NULL) {
+		vplock = mtx_pool_find(mtxpool_sleep, vp);
+		mtx_lock(vplock);
+	}
 	UNP_PCB_LOCK(unp);
-	unp2 = unp->unp_conn;
-	if (unp2 != NULL) {
-		UNP_PCB_LOCK(unp2);
-		unp_disconnect(unp, unp2);
-		UNP_PCB_UNLOCK(unp2);
+	if (vp && unp->unp_vnode == NULL) {
+		mtx_unlock(vplock);
+		vp = NULL;
 	}
-	if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) {
+	if (vp != NULL) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
-	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_WUNLOCK();
-	if (vp)
+	unp2 = unp->unp_conn;
+	unp_pcb_hold(unp);
+	if (unp2 != NULL) {
+		unp_pcb_hold(unp2);
+		unp_pcb_owned_lock2(unp, unp2, freed);
+		unp_disconnect(unp, unp2);
+		if (unp_pcb_rele(unp2) == 0)
+			UNP_PCB_UNLOCK(unp2);
+	}
+	if (unp_pcb_rele(unp) == 0)
+		UNP_PCB_UNLOCK(unp);
+	if (vp) {
+		mtx_unlock(vplock);
 		vrele(vp);
+	}
 }
 
 static int
@@ -637,17 +741,14 @@ uipc_connect2(struct socket *so1, struct socket *so2)
 	struct unpcb *unp, *unp2;
 	int error;
 
-	UNP_LINK_WLOCK();
 	unp = so1->so_pcb;
 	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
-	UNP_PCB_LOCK(unp);
 	unp2 = so2->so_pcb;
 	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
-	UNP_PCB_LOCK(unp2);
+	unp_pcb_lock2(unp, unp2);
 	error = unp_connect2(so1, so2, PRU_CONNECT2);
 	UNP_PCB_UNLOCK(unp2);
 	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_WUNLOCK();
 	return (error);
 }
 
@@ -655,6 +756,7 @@ static void
 uipc_detach(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
+	struct mtx *vplock;
 	struct sockaddr_un *saved_unp_addr;
 	struct vnode *vp;
 	int freeunp, local_unp_rights;
@@ -669,49 +771,77 @@ uipc_detach(struct socket *so)
 	LIST_REMOVE(unp, unp_link);
 	unp->unp_gencnt = ++unp_gencnt;
 	--unp_count;
+	UNP_LINK_WUNLOCK();
+
+	UNP_PCB_UNLOCK_ASSERT(unp);
+ restart:
+	if ((vp = unp->unp_vnode) != NULL) {
+		vplock = mtx_pool_find(mtxpool_sleep, vp);
+		mtx_lock(vplock);
+	}
 	UNP_PCB_LOCK(unp);
-	if ((unp->unp_flags & UNP_NASCENT) != 0)
+	if ((unp2 = unp->unp_conn) != NULL) {
+		unp_pcb_owned_lock2(unp, unp2, freeunp);
+		if (freeunp)
+			unp2 = NULL;
+	}
+	if (unp->unp_vnode != vp &&
+		unp->unp_vnode != NULL) {
+		mtx_unlock(vplock);
+		UNP_PCB_UNLOCK(unp);
+		if (unp2)
+			UNP_PCB_UNLOCK(unp2);
+		goto restart;
+	}
+	if ((unp->unp_flags & UNP_NASCENT) != 0) {
+		if (unp2)
+			UNP_PCB_UNLOCK(unp2);
 		goto teardown;
-
+	}
 	if ((vp = unp->unp_vnode) != NULL) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
-	unp2 = unp->unp_conn;
+	unp_pcb_hold(unp);
 	if (unp2 != NULL) {
-		UNP_PCB_LOCK(unp2);
+		unp_pcb_hold(unp2);
 		unp_disconnect(unp, unp2);
-		UNP_PCB_UNLOCK(unp2);
+		if (unp_pcb_rele(unp2) == 0)
+			UNP_PCB_UNLOCK(unp2);
 	}
-
-	/*
-	 * We hold the linkage lock exclusively, so it's OK to acquire
-	 * multiple pcb locks at a time.
-	 */
+	UNP_PCB_UNLOCK(unp);
+	UNP_REF_LIST_LOCK();
 	while (!LIST_EMPTY(&unp->unp_refs)) {
 		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
 
-		UNP_PCB_LOCK(ref);
+		unp_pcb_hold(ref);
+		UNP_REF_LIST_UNLOCK();
+
+		MPASS(ref != unp);
+		UNP_PCB_UNLOCK_ASSERT(ref);
 		unp_drop(ref);
-		UNP_PCB_UNLOCK(ref);
+		UNP_REF_LIST_LOCK();
 	}
+
+	UNP_REF_LIST_UNLOCK();
+	UNP_PCB_LOCK(unp);
+	freeunp = unp_pcb_rele(unp);
+	MPASS(freeunp == 0);
 	local_unp_rights = unp_rights;
 teardown:
-	UNP_LINK_WUNLOCK();
 	unp->unp_socket->so_pcb = NULL;
 	saved_unp_addr = unp->unp_addr;
 	unp->unp_addr = NULL;
-	unp->unp_refcount--;
-	freeunp = (unp->unp_refcount == 0);
+	unp->unp_socket = NULL;
+	freeunp = unp_pcb_rele(unp);
 	if (saved_unp_addr != NULL)
 		free(saved_unp_addr, M_SONAME);
-	if (freeunp) {
-		UNP_PCB_LOCK_DESTROY(unp);
-		uma_zfree(unp_zone, unp);
-	} else
+	if (!freeunp)
 		UNP_PCB_UNLOCK(unp);
-	if (vp)
+	if (vp) {
+		mtx_unlock(vplock);
 		vrele(vp);
+	}
 	if (local_unp_rights)
 		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
 }
@@ -720,20 +850,28 @@ static int
 uipc_disconnect(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
+	int freed;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
 
-	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
-	unp2 = unp->unp_conn;
-	if (unp2 != NULL) {
-		UNP_PCB_LOCK(unp2);
-		unp_disconnect(unp, unp2);
-		UNP_PCB_UNLOCK(unp2);
+	if ((unp2 = unp->unp_conn) == NULL) {
+		UNP_PCB_UNLOCK(unp);
+		return (0);
 	}
-	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_WUNLOCK();
+	unp_pcb_owned_lock2(unp, unp2, freed);
+	if (__predict_false(freed)) {
+		UNP_PCB_UNLOCK(unp);
+		return (0);
+	}
+	unp_pcb_hold(unp2);
+	unp_pcb_hold(unp);
+	unp_disconnect(unp, unp2);
+	if (unp_pcb_rele(unp) == 0)
+		UNP_PCB_UNLOCK(unp);
+	if (unp_pcb_rele(unp2) == 0)
+		UNP_PCB_UNLOCK(unp2);
 	return (0);
 }
 
@@ -852,13 +990,35 @@ uipc_rcvd(struct socket *so, int flags)
 }
 
 static int
+connect_internal(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	int error;
+	struct unpcb *unp;
+
+	unp = so->so_pcb;
+	if (unp->unp_conn != NULL)
+		return (EISCONN);
+	error = unp_connect(so, nam, td);
+	if (error)
+		return (error);
+	UNP_PCB_LOCK(unp);
+	if (unp->unp_conn == NULL) {
+		UNP_PCB_UNLOCK(unp);
+		if (error == 0)
+			error = ENOTCONN;
+	}
+	return (error);
+}
+
+
+static int
 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
-	int error = 0;
+	int freed, error;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
@@ -866,49 +1026,66 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
 	    so->so_type == SOCK_SEQPACKET,
 	    ("%s: socktype %d", __func__, so->so_type));
 
+	freed = error = 0;
 	if (flags & PRUS_OOB) {
 		error = EOPNOTSUPP;
 		goto release;
 	}
 	if (control != NULL && (error = unp_internalize(&control, td)))
 		goto release;
-	if ((nam != NULL) || (flags & PRUS_EOF))
-		UNP_LINK_WLOCK();
-	else
-		UNP_LINK_RLOCK();
+
+	unp2 = NULL;
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 	{
 		const struct sockaddr *from;
 
-		unp2 = unp->unp_conn;
 		if (nam != NULL) {
-			UNP_LINK_WLOCK_ASSERT();
-			if (unp2 != NULL) {
-				error = EISCONN;
+			/*
+			 * We return with UNP_PCB_LOCK_HELD so we know that
+			 * the reference is live if the pointer is valid.
+			 */
+			if ((error = connect_internal(so, nam, td)))
 				break;
-			}
-			error = unp_connect(so, nam, td);
-			if (error)
-				break;
+			MPASS(unp->unp_conn != NULL);
 			unp2 = unp->unp_conn;
-		}
+		} else  {
+			UNP_PCB_LOCK(unp);
 
+			/*
+			 * Because connect() and send() are non-atomic in a sendto()
+			 * with a target address, it's possible that the socket will
+			 * have disconnected before the send() can run.  In that case
+			 * return the slightly counter-intuitive but otherwise
+			 * correct error that the socket is not connected.
+			 */
+			if ((unp2 = unp->unp_conn)  == NULL) {
+				UNP_PCB_UNLOCK(unp);
+				error = ENOTCONN;
+				break;
+			}
+		}
+		unp_pcb_owned_lock2(unp, unp2, freed);
+		if (__predict_false(freed)) {
+			UNP_PCB_UNLOCK(unp);
+			error = ENOTCONN;
+			break;
+		}
 		/*
-		 * Because connect() and send() are non-atomic in a sendto()
-		 * with a target address, it's possible that the socket will
-		 * have disconnected before the send() can run.  In that case
-		 * return the slightly counter-intuitive but otherwise
-		 * correct error that the socket is not connected.
+		 * The socket referencing unp2 may have been closed
+		 * or unp may have been disconnected if the unp lock
+		 * was dropped to acquire unp2.
 		 */
-		if (unp2 == NULL) {
+		if (__predict_false(unp->unp_conn == NULL) ||
+			unp2->unp_socket == NULL) {
+			UNP_PCB_UNLOCK(unp);
+			if (unp_pcb_rele(unp2) == 0)
+				UNP_PCB_UNLOCK(unp2);
 			error = ENOTCONN;
 			break;
 		}
-		/* Lockless read. */
 		if (unp2->unp_flags & UNP_WANTCRED)
 			control = unp_addsockcred(td, control);
-		UNP_PCB_LOCK(unp);
 		if (unp->unp_addr != NULL)
 			from = (struct sockaddr *)unp->unp_addr;
 		else
@@ -924,12 +1101,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 			error = ENOBUFS;
 		}
-		if (nam != NULL) {
-			UNP_LINK_WLOCK_ASSERT();
-			UNP_PCB_LOCK(unp2);
+		if (nam != NULL)
 			unp_disconnect(unp, unp2);
-			UNP_PCB_UNLOCK(unp2);
-		}
+		UNP_PCB_UNLOCK(unp2);
 		UNP_PCB_UNLOCK(unp);
 		break;
 	}
@@ -938,42 +1112,37 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
 	case SOCK_STREAM:
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			if (nam != NULL) {
-				UNP_LINK_WLOCK_ASSERT();
-				error = unp_connect(so, nam, td);
-				if (error)
-					break;	/* XXX */
-			} else {
+				if ((error = connect_internal(so, nam, td)))
+					break;
+			} else  {
 				error = ENOTCONN;
 				break;
 			}
-		}
-
-		/* Lockless read. */
-		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+		} else if ((unp2 = unp->unp_conn) == NULL) {
+			error = ENOTCONN;
+			break;
+		} else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			break;
+		} else {
+			UNP_PCB_LOCK(unp);
+			if ((unp2 = unp->unp_conn) == NULL) {
+				UNP_PCB_UNLOCK(unp);
+				error = ENOTCONN;
+				break;
+			}
 		}
-
-		/*
-		 * Because connect() and send() are non-atomic in a sendto()
-		 * with a target address, it's possible that the socket will
-		 * have disconnected before the send() can run.  In that case
-		 * return the slightly counter-intuitive but otherwise
-		 * correct error that the socket is not connected.
-		 *
-		 * Locking here must be done carefully: the linkage lock
-		 * prevents interconnections between unpcbs from changing, so
-		 * we can traverse from unp to unp2 without acquiring unp's
-		 * lock.  Socket buffer locks follow unpcb locks, so we can
-		 * acquire both remote and lock socket buffer locks.
-		 */
-		unp2 = unp->unp_conn;
-		if (unp2 == NULL) {
+		unp_pcb_owned_lock2(unp, unp2, freed);
+		UNP_PCB_UNLOCK(unp);
+		if (__predict_false(freed)) {
 			error = ENOTCONN;
 			break;
 		}
-		so2 = unp2->unp_socket;
-		UNP_PCB_LOCK(unp2);
+		if ((so2 = unp2->unp_socket) == NULL) {
+			UNP_PCB_UNLOCK(unp2);
+			error = ENOTCONN;
+			break;
+		}
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (unp2->unp_flags & UNP_WANTCRED) {
 			/*
@@ -1046,12 +1215,6 @@ uipc_send(struct socket *so, int flags, struct mbuf *m
 		unp_shutdown(unp);
 		UNP_PCB_UNLOCK(unp);
 	}
-
-	if ((nam != NULL) || (flags & PRUS_EOF))
-		UNP_LINK_WUNLOCK();
-	else
-		UNP_LINK_RUNLOCK();
-
 	if (control != NULL && error != 0)
 		unp_dispose_mbuf(control);
 
@@ -1124,12 +1287,10 @@ uipc_shutdown(struct socket *so)
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
 
-	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	socantsendmore(so);
 	unp_shutdown(unp);
 	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_WUNLOCK();
 	return (0);
 }
 
@@ -1333,16 +1494,11 @@ unp_connectat(int fd, struct socket *so, struct sockad
 	char buf[SOCK_MAXADDRLEN];
 	struct sockaddr *sa;
 	cap_rights_t rights;
-	int error, len;
+	int error, len, freed;
+	struct mtx *vplock;
 
 	if (nam->sa_family != AF_UNIX)
 		return (EAFNOSUPPORT);
-
-	UNP_LINK_WLOCK_ASSERT();
-
-	unp = sotounpcb(so);
-	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
-
 	if (nam->sa_len > sizeof(struct sockaddr_un))
 		return (EINVAL);
 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
@@ -1351,12 +1507,12 @@ unp_connectat(int fd, struct socket *so, struct sockad
 	bcopy(soun->sun_path, buf, len);
 	buf[len] = 0;
 
+	unp = sotounpcb(so);
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_flags & UNP_CONNECTING) {
 		UNP_PCB_UNLOCK(unp);
 		return (EALREADY);
 	}
-	UNP_LINK_WUNLOCK();
 	unp->unp_flags |= UNP_CONNECTING;
 	UNP_PCB_UNLOCK(unp);
 
@@ -1389,11 +1545,8 @@ unp_connectat(int fd, struct socket *so, struct sockad
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
 
-	/*
-	 * Lock linkage lock for two reasons: make sure v_socket is stable,
-	 * and to protect simultaneous locking of multiple pcbs.
-	 */
-	UNP_LINK_WLOCK();
+	vplock = mtx_pool_find(mtxpool_sleep, vp);
+	mtx_lock(vplock);
 	VOP_UNP_CONNECT(vp, &unp2);
 	if (unp2 == NULL) {
 		error = ECONNREFUSED;
@@ -1404,8 +1557,7 @@ unp_connectat(int fd, struct socket *so, struct sockad
 		error = EPROTOTYPE;
 		goto bad2;
 	}
-	UNP_PCB_LOCK(unp);
-	UNP_PCB_LOCK(unp2);
+	unp_pcb_lock2(unp, unp2);
 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
 		if (so2->so_options & SO_ACCEPTCONN) {
 			CURVNET_SET(so2->so_vnet);
@@ -1418,7 +1570,9 @@ unp_connectat(int fd, struct socket *so, struct sockad
 			goto bad3;
 		}
 		unp3 = sotounpcb(so2);
-		UNP_PCB_LOCK(unp3);
+		UNP_PCB_UNLOCK(unp);
+		unp_pcb_owned_lock2(unp2, unp3, freed);
+		MPASS(!freed);
 		if (unp2->unp_addr != NULL) {
 			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
 			unp3->unp_addr = (struct sockaddr_un *) sa;
@@ -1445,6 +1599,8 @@ unp_connectat(int fd, struct socket *so, struct sockad
 			unp3->unp_flags |= UNP_WANTCRED;
 		UNP_PCB_UNLOCK(unp2);
 		unp2 = unp3;
+		unp_pcb_owned_lock2(unp2, unp, freed);
+		MPASS(!freed);
 #ifdef MAC
 		mac_socketpeer_set_from_socket(so, so2);
 		mac_socketpeer_set_from_socket(so2, so);
@@ -1459,12 +1615,12 @@ bad3:
 	UNP_PCB_UNLOCK(unp2);
 	UNP_PCB_UNLOCK(unp);
 bad2:
-	UNP_LINK_WUNLOCK();
+	mtx_unlock(vplock);
 bad:
-	if (vp != NULL)
+	if (vp != NULL) {
 		vput(vp);
+	}
 	free(sa, M_SONAME);
-	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
 	unp->unp_flags &= ~UNP_CONNECTING;
 	UNP_PCB_UNLOCK(unp);
@@ -1482,7 +1638,6 @@ unp_connect2(struct socket *so, struct socket *so2, in
 	unp2 = sotounpcb(so2);
 	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
 
-	UNP_LINK_WLOCK_ASSERT();
 	UNP_PCB_LOCK_ASSERT(unp);
 	UNP_PCB_LOCK_ASSERT(unp2);
 
@@ -1490,10 +1645,13 @@ unp_connect2(struct socket *so, struct socket *so2, in
 		return (EPROTOTYPE);
 	unp2->unp_flags &= ~UNP_NASCENT;
 	unp->unp_conn = unp2;
-
+	unp_pcb_hold(unp2);
+	unp_pcb_hold(unp);
 	switch (so->so_type) {
 	case SOCK_DGRAM:
+		UNP_REF_LIST_LOCK();
 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
+		UNP_REF_LIST_UNLOCK();
 		soisconnected(so);
 		break;
 
@@ -1517,31 +1675,48 @@ unp_connect2(struct socket *so, struct socket *so2, in
 static void
 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
 {
-	struct socket *so;
+	struct socket *so, *so2;
+	int rele, freed;
 
 	KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
 
-	UNP_LINK_WLOCK_ASSERT();
 	UNP_PCB_LOCK_ASSERT(unp);
 	UNP_PCB_LOCK_ASSERT(unp2);
 
+	if (unp->unp_conn == NULL && unp2->unp_conn == NULL)
+		return;
+
+	MPASS(unp->unp_conn == unp2);
 	unp->unp_conn = NULL;
+	rele = 0;
+	so = unp->unp_socket;
+	so2 = unp2->unp_socket;
 	switch (unp->unp_socket->so_type) {
 	case SOCK_DGRAM:
+		UNP_REF_LIST_LOCK();
 		LIST_REMOVE(unp, unp_reflink);
-		so = unp->unp_socket;
-		SOCK_LOCK(so);
-		so->so_state &= ~SS_ISCONNECTED;
-		SOCK_UNLOCK(so);
+		UNP_REF_LIST_UNLOCK();
+		if (so) {
+			SOCK_LOCK(so);
+			so->so_state &= ~SS_ISCONNECTED;
+			SOCK_UNLOCK(so);
+		}
 		break;
 
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
-		soisdisconnected(unp->unp_socket);
+		if (so)
+			soisdisconnected(so);
+		MPASS(unp2->unp_conn == unp);
 		unp2->unp_conn = NULL;
-		soisdisconnected(unp2->unp_socket);
+		if (so2)
+			soisdisconnected(so2);
 		break;
 	}
+	freed = unp_pcb_rele(unp);
+	MPASS(freed == 0);
+	freed = unp_pcb_rele(unp2);
+	MPASS(freed == 0);
 }
 
 /*
@@ -1625,7 +1800,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
 				continue;
 			}
 			unp_list[i++] = unp;
-			unp->unp_refcount++;
+			unp_pcb_hold(unp);
 		}
 		UNP_PCB_UNLOCK(unp);
 	}
@@ -1637,8 +1812,9 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
 	for (i = 0; i < n; i++) {
 		unp = unp_list[i];
 		UNP_PCB_LOCK(unp);
-		unp->unp_refcount--;
-	        if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
+		freeunp = unp_pcb_rele(unp);
+
+		if (freeunp == 0 && unp->unp_gencnt <= gencnt) {
 			xu->xu_len = sizeof *xu;
 			xu->xu_unpp = unp;
 			/*
@@ -1665,14 +1841,8 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
 			sotoxsocket(unp->unp_socket, &xu->xu_socket);
 			UNP_PCB_UNLOCK(unp);
 			error = SYSCTL_OUT(req, xu, sizeof *xu);
-		} else {
-			freeunp = (unp->unp_refcount == 0);
+		} else  if (freeunp == 0)
 			UNP_PCB_UNLOCK(unp);
-			if (freeunp) {
-				UNP_PCB_LOCK_DESTROY(unp);
-				uma_zfree(unp_zone, unp);
-			}
-		}
 	}
 	free(xu, M_TEMP);
 	if (!error) {
@@ -1709,7 +1879,6 @@ unp_shutdown(struct unpcb *unp)
 	struct unpcb *unp2;
 	struct socket *so;
 
-	UNP_LINK_WLOCK_ASSERT();
 	UNP_PCB_LOCK_ASSERT(unp);
 
 	unp2 = unp->unp_conn;
@@ -1726,22 +1895,28 @@ unp_drop(struct unpcb *unp)
 {
 	struct socket *so = unp->unp_socket;
 	struct unpcb *unp2;
+	int freed;
 
-	UNP_LINK_WLOCK_ASSERT();
-	UNP_PCB_LOCK_ASSERT(unp);
-
 	/*
 	 * Regardless of whether the socket's peer dropped the connection
 	 * with this socket by aborting or disconnecting, POSIX requires
 	 * that ECONNRESET is returned.
 	 */
-	so->so_error = ECONNRESET;
+	/* acquire a reference so that unp isn't freed from underneath us */
+
+	UNP_PCB_LOCK(unp);
+	if (so)
+		so->so_error = ECONNRESET;
 	unp2 = unp->unp_conn;
-	if (unp2 == NULL)
-		return;
-	UNP_PCB_LOCK(unp2);
-	unp_disconnect(unp, unp2);
-	UNP_PCB_UNLOCK(unp2);
+	if (unp2 != NULL) {
+		unp_pcb_hold(unp2);
+		unp_pcb_owned_lock2(unp, unp2, freed);
+		unp_disconnect(unp, unp2);
+		if (unp_pcb_rele(unp2) == 0)
+			UNP_PCB_UNLOCK(unp2);
+	}
+	if (unp_pcb_rele(unp) == 0)
+		UNP_PCB_UNLOCK(unp);
 }
 
 static void
@@ -1881,7 +2056,7 @@ unp_init(void)
 		return;
 #endif
 	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, UMA_ALIGN_CACHE, 0);
 	if (unp_zone == NULL)
 		panic("unp_init");
 	uma_zone_set_max(unp_zone, maxsockets);
@@ -2464,13 +2639,15 @@ vfs_unp_reclaim(struct vnode *vp)
 {
 	struct unpcb *unp;
 	int active;
+	struct mtx *vplock;
 
 	ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
 	KASSERT(vp->v_type == VSOCK,
 	    ("vfs_unp_reclaim: vp->v_type != VSOCK"));
 
 	active = 0;
-	UNP_LINK_WLOCK();
+	vplock = mtx_pool_find(mtxpool_sleep, vp);
+	mtx_lock(vplock);
 	VOP_UNP_CONNECT(vp, &unp);
 	if (unp == NULL)
 		goto done;
@@ -2481,8 +2658,8 @@ vfs_unp_reclaim(struct vnode *vp)
 		active = 1;
 	}
 	UNP_PCB_UNLOCK(unp);
-done:
-	UNP_LINK_WUNLOCK();
+ done:
+	mtx_unlock(vplock);
 	if (active)
 		vunref(vp);
 }

Modified: head/sys/sys/unpcb.h
==============================================================================
--- head/sys/sys/unpcb.h	Thu May 17 17:57:41 2018	(r333743)
+++ head/sys/sys/unpcb.h	Thu May 17 17:59:35 2018	(r333744)
@@ -69,23 +69,25 @@ typedef uint64_t unp_gen_t;
 LIST_HEAD(unp_head, unpcb);
 
 struct unpcb {
-	LIST_ENTRY(unpcb) unp_link; 	/* glue on list of all PCBs */
-	struct	socket *unp_socket;	/* pointer back to socket */
-	struct	file *unp_file;		/* back-pointer to file for gc. */
-	struct	vnode *unp_vnode;	/* if associated with file */
-	ino_t	unp_ino;		/* fake inode number */
+	/* Cache line 1 */
+	struct	mtx unp_mtx;		/* mutex */
 	struct	unpcb *unp_conn;	/* control block of connected socket */
-	struct	unp_head unp_refs;	/* referencing socket linked list */
-	LIST_ENTRY(unpcb) unp_reflink;	/* link in unp_refs list */
-	struct	sockaddr_un *unp_addr;	/* bound address of socket */
-	unp_gen_t unp_gencnt;		/* generation count of this instance */
+	volatile u_int	unp_refcount;
 	short	unp_flags;		/* flags */
 	short	unp_gcflag;		/* Garbage collector flags. */
+	struct	sockaddr_un *unp_addr;	/* bound address of socket */
+	struct	socket *unp_socket;	/* pointer back to socket */
+	/* Cache line 2 */
+	struct	vnode *unp_vnode;	/* if associated with file */
 	struct	xucred unp_peercred;	/* peer credentials, if applicable */
-	u_int	unp_refcount;
+	LIST_ENTRY(unpcb) unp_reflink;	/* link in unp_refs list */
+	LIST_ENTRY(unpcb) unp_link; 	/* glue on list of all PCBs */
+	struct	unp_head unp_refs;	/* referencing socket linked list */
+	unp_gen_t unp_gencnt;		/* generation count of this instance */
+	struct	file *unp_file;		/* back-pointer to file for gc. */
 	u_int	unp_msgcount;		/* references from message queue */
-	struct	mtx unp_mtx;		/* mutex */
-};
+	ino_t	unp_ino;		/* fake inode number */
+} __aligned(CACHE_LINE_SIZE);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list