git: f7496dcab036 - main - jail: Change the locking around pr_ref and pr_uref

Jamie Gritton jamie at FreeBSD.org
Sun Feb 21 18:56:58 UTC 2021


The branch main has been updated by jamie:

URL: https://cgit.FreeBSD.org/src/commit/?id=f7496dcab0360a74bfb00cd6118f66323fffda61

commit f7496dcab0360a74bfb00cd6118f66323fffda61
Author:     Jamie Gritton <jamie at FreeBSD.org>
AuthorDate: 2021-02-21 18:55:44 +0000
Commit:     Jamie Gritton <jamie at FreeBSD.org>
CommitDate: 2021-02-21 18:55:44 +0000

    jail: Change the locking around pr_ref and pr_uref
    
    Require both the prison mutex and allprison_lock when pr_ref or
    pr_uref go to/from zero.  Adding a non-first or removing a non-last
    reference remain lock-free.  This means that a shared hold on
    allprison_lock is sufficient for prison_isalive() to be useful, which
    removes a number of cases of lock/check/unlock on the prison mutex.
    
    Expand the locking in kern_jail_set() to keep allprison_lock held
    exclusive until the new prison is valid, thus making invalid prisons
    invisible to any thread holding allprison_lock (except of course the
    one creating or destroying the prison).  This renders prison_isvalid()
    nearly redundant, now used only in asserts.
    
    Differential Revision:  https://reviews.freebsd.org/D28419
    Differential Revision:  https://reviews.freebsd.org/D28458
---
 sys/kern/kern_jail.c   | 423 ++++++++++++++++++++++++-------------------------
 sys/kern/sysv_msg.c    |   2 +-
 sys/kern/sysv_sem.c    |   2 +-
 sys/kern/sysv_shm.c    |   2 +-
 sys/kern/uipc_mqueue.c |  35 ++--
 sys/sys/jail.h         |   3 +-
 6 files changed, 232 insertions(+), 235 deletions(-)

diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 65201eb12951..48c91a95bf1a 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -137,9 +137,11 @@ LIST_HEAD(, prison_racct) allprison_racct;
 int	lastprid = 0;
 
 static int get_next_prid(struct prison **insprp);
-static int do_jail_attach(struct thread *td, struct prison *pr);
+static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
 static void prison_complete(void *context, int pending);
 static void prison_deref(struct prison *pr, int flags);
+static int prison_lock_xlock(struct prison *pr, int flags);
+static void prison_free_not_last(struct prison *pr);
 static void prison_set_allow_locked(struct prison *pr, unsigned flag,
     int enable);
 static char *prison_path(struct prison *pr1, struct prison *pr2);
@@ -1006,18 +1008,15 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 		 * where it can be inserted later.
 		 */
 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
-			if (inspr->pr_id == jid) {
-				mtx_lock(&inspr->pr_mtx);
-				if (prison_isvalid(inspr)) {
-					pr = inspr;
-					drflags |= PD_LOCKED;
-					inspr = NULL;
-				} else
-					mtx_unlock(&inspr->pr_mtx);
-				break;
-			}
+			if (inspr->pr_id < jid)
+				continue;
 			if (inspr->pr_id > jid)
 				break;
+			pr = inspr;
+			mtx_lock(&pr->pr_mtx);
+			drflags |= PD_LOCKED;
+			inspr = NULL;
+			break;
 		}
 		if (pr != NULL) {
 			ppr = pr->pr_parent;
@@ -1041,13 +1040,15 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 				error = ENOENT;
 				vfs_opterror(opts, "jail %d not found", jid);
 				goto done_deref;
-			} else if (!prison_isalive(pr)) {
+			}
+			if (!prison_isalive(pr)) {
 				if (!(flags & JAIL_DYING)) {
 					error = ENOENT;
 					vfs_opterror(opts, "jail %d is dying",
 					    jid);
 					goto done_deref;
-				} else if ((flags & JAIL_ATTACH) ||
+				}
+				if ((flags & JAIL_ATTACH) ||
 				    (pr_flags & PR_PERSIST)) {
 					/*
 					 * A dying jail might be resurrected
@@ -1121,12 +1122,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 		if (namelc[0] != '\0') {
 			pnamelen =
 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
- name_again:
 			deadpr = NULL;
 			FOREACH_PRISON_CHILD(ppr, tpr) {
 				if (tpr != pr &&
 				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
-					mtx_lock(&tpr->pr_mtx);
 					if (prison_isalive(tpr)) {
 						if (pr == NULL &&
 						    cuflags != JAIL_CREATE) {
@@ -1135,6 +1134,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 							 * for updates.
 							 */
 							pr = tpr;
+							mtx_lock(&pr->pr_mtx);
 							drflags |= PD_LOCKED;
 							break;
 						}
@@ -1144,28 +1144,22 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 						 * active sibling jail.
 						 */
 						error = EEXIST;
-						mtx_unlock(&tpr->pr_mtx);
 						vfs_opterror(opts,
 						   "jail \"%s\" already exists",
 						   name);
 						goto done_deref;
 					}
 					if (pr == NULL &&
-					    cuflags != JAIL_CREATE &&
-					    prison_isvalid(tpr))
+					    cuflags != JAIL_CREATE) {
 						deadpr = tpr;
-					mtx_unlock(&tpr->pr_mtx);
+					}
 				}
 			}
 			/* If no active jail is found, use a dying one. */
 			if (deadpr != NULL && pr == NULL) {
 				if (flags & JAIL_DYING) {
-					mtx_lock(&deadpr->pr_mtx);
-					if (!prison_isvalid(deadpr)) {
-						mtx_unlock(&deadpr->pr_mtx);
-						goto name_again;
-					}
 					pr = deadpr;
+					mtx_lock(&pr->pr_mtx);
 					drflags |= PD_LOCKED;
 				} else if (cuflags == JAIL_UPDATE) {
 					error = ENOENT;
@@ -1199,19 +1193,11 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 				vfs_opterror(opts, "prison limit exceeded");
 				goto done_deref;
 			}
-		mtx_lock(&ppr->pr_mtx);
-		if (!prison_isvalid(ppr)) {
-			mtx_unlock(&ppr->pr_mtx);
-			error = ENOENT;
-			vfs_opterror(opts, "jail \"%s\" not found",
-			    prison_name(mypr, ppr));
-			goto done_deref;
-		}
 		prison_hold(ppr);
-		if (refcount_acquire(&ppr->pr_uref))
-			mtx_unlock(&ppr->pr_mtx);
-		else {
+		if (!refcount_acquire_if_not_zero(&ppr->pr_uref)) {
 			/* This brings the parent back to life. */
+			mtx_lock(&ppr->pr_mtx);
+			refcount_acquire(&ppr->pr_uref);
 			mtx_unlock(&ppr->pr_mtx);
 			error = osd_jail_call(ppr, PR_METHOD_CREATE, opts);
 			if (error) {
@@ -1219,7 +1205,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 				drflags |= PD_DEREF | PD_DEUREF;
 				goto done_deref;
 			}
-                }
+		}
 
 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
 			error = EAGAIN;
@@ -1230,6 +1216,8 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 		}
 
 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
+		refcount_init(&pr->pr_ref, 0);
+		refcount_init(&pr->pr_uref, 0);
 		LIST_INIT(&pr->pr_children);
 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
@@ -1452,7 +1440,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 #ifdef VIMAGE
 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
-			    refcount_load(&tpr->pr_uref) == 0) {
+			    !prison_isalive(tpr)) {
 				descend = 0;
 				continue;
 			}
@@ -1520,7 +1508,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 #ifdef VIMAGE
 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
-			    refcount_load(&tpr->pr_uref) == 0) {
+			    !prison_isalive(tpr)) {
 				descend = 0;
 				continue;
 			}
@@ -1759,8 +1747,8 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 			prison_hold(pr);
 			refcount_acquire(&pr->pr_uref);
 		} else {
-			refcount_release(&pr->pr_ref);
 			drflags |= PD_DEUREF;
+			prison_free_not_last(pr);
 		}
 	}
 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
@@ -1824,8 +1812,6 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 #endif
 
 	/* Let the modules do their work. */
-	sx_downgrade(&allprison_lock);
-	drflags = (drflags & ~PD_LIST_XLOCKED) | PD_LIST_SLOCKED;
 	if (born) {
 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
 		if (error) {
@@ -1842,9 +1828,8 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 
 	/* Attach this process to the prison if requested. */
 	if (flags & JAIL_ATTACH) {
-		mtx_lock(&pr->pr_mtx);
-		error = do_jail_attach(td, pr);
-		drflags &= ~PD_LIST_SLOCKED;
+		error = do_jail_attach(td, pr, prison_lock_xlock(pr, drflags));
+		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
 		if (error) {
 			if (created) {
 				/* do_jail_attach has removed the prison. */
@@ -1857,9 +1842,9 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 
 #ifdef RACCT
 	if (racct_enable && !created) {
-		if (drflags & PD_LIST_SLOCKED) {
-			sx_sunlock(&allprison_lock);
-			drflags &= ~PD_LIST_SLOCKED;
+		if (drflags & PD_LIST_XLOCKED) {
+			sx_xunlock(&allprison_lock);
+			drflags &= ~PD_LIST_XLOCKED;
 		}
 		prison_racct_modify(pr);
 	}
@@ -1874,8 +1859,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 		 * not be publicly visible).
 		 */
 		if (pr_flags & PR_PERSIST) {
-			mtx_lock(&pr->pr_mtx);
-			drflags |= PD_LOCKED;
+			drflags = prison_lock_xlock(pr, drflags);
 			refcount_acquire(&pr->pr_ref);
 			refcount_acquire(&pr->pr_uref);
 		} else {
@@ -1952,13 +1936,8 @@ get_next_prid(struct prison **insprp)
 			TAILQ_FOREACH(inspr, &allprison, pr_list) {
 				if (inspr->pr_id < jid)
 					continue;
-				if (inspr->pr_id > jid ||
-				    refcount_load(&inspr->pr_ref) == 0) {
-					/*
-					 * Found an opening.  This may be a gap
-					 * in the list, or a dead jail with the
-					 * same ID.
-					 */
+				if (inspr->pr_id > jid) {
+					/* Found an opening. */
 					maxid = 0;
 					break;
 				}
@@ -2047,18 +2026,14 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
 	if (error == 0) {
 		TAILQ_FOREACH(pr, &allprison, pr_list) {
-			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
+			if (pr->pr_id > jid &&
+			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
+			    prison_ischild(mypr, pr)) {
 				mtx_lock(&pr->pr_mtx);
-				if ((flags & JAIL_DYING)
-				    ? prison_isvalid(pr) : prison_isalive(pr))
-					break;
-				mtx_unlock(&pr->pr_mtx);
+				drflags |= PD_LOCKED;
+				goto found_prison;
 			}
 		}
-		if (pr != NULL) {
-			drflags |= PD_LOCKED;
-			goto found_prison;
-		}
 		error = ENOENT;
 		vfs_opterror(opts, "no jail after %d", jid);
 		goto done;
@@ -2314,7 +2289,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
 int
 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
 {
-	struct prison *pr, *cpr, *lpr, *tpr;
+	struct prison *pr, *cpr, *lpr;
 	int descend, error;
 
 	error = priv_check(td, PRIV_JAIL_REMOVE);
@@ -2334,21 +2309,13 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
 		mtx_unlock(&pr->pr_mtx);
 		lpr = NULL;
 		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
-			mtx_lock(&cpr->pr_mtx);
-			if (prison_isvalid(cpr)) {
-				tpr = cpr;
-				prison_hold(cpr);
-			} else {
-				/* Already removed - do not do it again. */
-				tpr = NULL;
-			}
-			mtx_unlock(&cpr->pr_mtx);
+			prison_hold(cpr);
 			if (lpr != NULL) {
 				mtx_lock(&lpr->pr_mtx);
 				prison_remove_one(lpr);
 				sx_xlock(&allprison_lock);
 			}
-			lpr = tpr;
+			lpr = cpr;
 		}
 		if (lpr != NULL) {
 			mtx_lock(&lpr->pr_mtx);
@@ -2377,8 +2344,8 @@ prison_remove_one(struct prison *pr)
 
 	/* If the prison was persistent, it is not anymore. */
 	if (pr->pr_flags & PR_PERSIST) {
-		refcount_release(&pr->pr_ref);
 		drflags |= PD_DEUREF;
+		prison_free_not_last(pr);
 		pr->pr_flags &= ~PR_PERSIST;
 	}
 
@@ -2428,14 +2395,7 @@ sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
 	if (error)
 		return (error);
 
-	/*
-	 * Start with exclusive hold on allprison_lock to ensure that a possible
-	 * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
-	 * But then immediately downgrade it since we don't need to stop
-	 * readers.
-	 */
-	sx_xlock(&allprison_lock);
-	sx_downgrade(&allprison_lock);
+	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (pr == NULL) {
 		sx_sunlock(&allprison_lock);
@@ -2449,16 +2409,18 @@ sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
 		return (EINVAL);
 	}
 
-	return (do_jail_attach(td, pr));
+	return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
 }
 
 static int
-do_jail_attach(struct thread *td, struct prison *pr)
+do_jail_attach(struct thread *td, struct prison *pr, int drflags)
 {
 	struct proc *p;
 	struct ucred *newcred, *oldcred;
 	int error;
 
+	mtx_assert(&pr->pr_mtx, MA_OWNED);
+	sx_assert(&allprison_lock, SX_LOCKED);
 	/*
 	 * XXX: Note that there is a slight race here if two threads
 	 * in the same privileged process attempt to attach to two
@@ -2469,15 +2431,18 @@ do_jail_attach(struct thread *td, struct prison *pr)
 	 */
 	refcount_acquire(&pr->pr_ref);
 	refcount_acquire(&pr->pr_uref);
+	drflags |= PD_DEREF | PD_DEUREF;
 	mtx_unlock(&pr->pr_mtx);
+	drflags &= ~PD_LOCKED;
 
 	/* Let modules do whatever they need to prepare for attaching. */
 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
 	if (error) {
-		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
+		prison_deref(pr, drflags);
 		return (error);
 	}
-	sx_sunlock(&allprison_lock);
+	sx_unlock(&allprison_lock);
+	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
 
 	/*
 	 * Reparent the newly attached process to this jail.
@@ -2513,7 +2478,7 @@ do_jail_attach(struct thread *td, struct prison *pr)
 	rctl_proc_ucred_changed(p, newcred);
 	crfree(newcred);
 #endif
-	prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
+	prison_deref(oldcred->cr_prison, drflags);
 	crfree(oldcred);
 
 	/*
@@ -2533,8 +2498,9 @@ do_jail_attach(struct thread *td, struct prison *pr)
  e_revert_osd:
 	/* Tell modules this thread is still in its old jail after all. */
 	sx_slock(&allprison_lock);
+	drflags |= PD_LIST_SLOCKED;
 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
-	prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
+	prison_deref(pr, drflags);
 	return (error);
 }
 
@@ -2548,19 +2514,13 @@ prison_find(int prid)
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
-		if (pr->pr_id == prid) {
-			mtx_lock(&pr->pr_mtx);
-			if (prison_isvalid(pr))
-				return (pr);
-			/*
-			 * Any active prison with the same ID would have
-			 * been inserted before a dead one.
-			 */
-			mtx_unlock(&pr->pr_mtx);
-			break;
-		}
+		if (pr->pr_id < prid)
+			continue;
 		if (pr->pr_id > prid)
 			break;
+		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
+		mtx_lock(&pr->pr_mtx);
+		return (pr);
 	}
 	return (NULL);
 }
@@ -2577,10 +2537,10 @@ prison_find_child(struct prison *mypr, int prid)
 	sx_assert(&allprison_lock, SX_LOCKED);
 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
 		if (pr->pr_id == prid) {
+			KASSERT(prison_isvalid(pr),
+			    ("Found invalid prison %p", pr));
 			mtx_lock(&pr->pr_mtx);
-			if (prison_isvalid(pr))
-				return (pr);
-			mtx_unlock(&pr->pr_mtx);
+			return (pr);
 		}
 	}
 	return (NULL);
@@ -2598,26 +2558,21 @@ prison_find_name(struct prison *mypr, const char *name)
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
- again:
 	deadpr = NULL;
 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
 		if (!strcmp(pr->pr_name + mylen, name)) {
-			mtx_lock(&pr->pr_mtx);
-			if (prison_isalive(pr))
+			KASSERT(prison_isvalid(pr),
+			    ("Found invalid prison %p", pr));
+			if (prison_isalive(pr)) {
+				mtx_lock(&pr->pr_mtx);
 				return (pr);
-			if (prison_isvalid(pr))
-				deadpr = pr;
-			mtx_unlock(&pr->pr_mtx);
+			}
+			deadpr = pr;
 		}
 	}
 	/* There was no valid prison - perhaps there was a dying one. */
-	if (deadpr != NULL) {
+	if (deadpr != NULL)
 		mtx_lock(&deadpr->pr_mtx);
-		if (!prison_isvalid(deadpr)) {
-			mtx_unlock(&deadpr->pr_mtx);
-			goto again;
-		}
-	}
 	return (deadpr);
 }
 
@@ -2671,45 +2626,53 @@ prison_hold(struct prison *pr)
 
 /*
  * Remove a prison reference.  If that was the last reference, the
- * prison will be removed (at a later time).  Return with the prison
- * unlocked.
+ * prison will be removed (at a later time).
  */
 void
 prison_free_locked(struct prison *pr)
 {
-	int lastref;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
+	/*
+	 * Locking is no longer required, but unlock because the caller
+	 * expects it.
+	 */
+	mtx_unlock(&pr->pr_mtx);
+	prison_free(pr);
+}
+
+void
+prison_free(struct prison *pr)
+{
+
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
-	lastref = refcount_release(&pr->pr_ref);
-	mtx_unlock(&pr->pr_mtx);
-	if (lastref) {
+	if (!refcount_release_if_not_last(&pr->pr_ref)) {
 		/*
-		 * Don't remove the prison itself in this context,
+		 * Don't remove the last reference in this context,
 		 * in case there are locks held.
 		 */
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 	}
 }
 
-void
-prison_free(struct prison *pr)
+static void
+prison_free_not_last(struct prison *pr)
 {
+#ifdef INVARIANTS
+	int lastref;
 
-	/*
-	 * Locking is only required when releasing the last reference.
-	 * This allows assurance that a locked prison will remain valid
-	 * until it is unlocked.
-	 */
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
-	if (refcount_release_if_not_last(&pr->pr_ref))
-		return;
-	mtx_lock(&pr->pr_mtx);
-	prison_free_locked(pr);
+	lastref = refcount_release(&pr->pr_ref);
+	KASSERT(!lastref,
+	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
+	     pr, pr->pr_id));
+#else
+	refcount_release(&pr->pr_ref);
+#endif
 }
 
 /*
@@ -2718,7 +2681,8 @@ prison_free(struct prison *pr)
  * user-visible, except through the the jail system calls.  It is also
  * an error to hold an invalid prison.  A prison record will remain
  * alive as long as it has at least one user reference, and will not
- * be set to the dying state was long as the prison mutex is held.
+ * be set to the dying state until the prison mutex and allprison_lock
+ * are both freed.
  */
 void
 prison_proc_hold(struct prison *pr)
@@ -2756,7 +2720,7 @@ prison_proc_free(struct prison *pr)
 		 * but also half dead.  Add a reference so any calls to
 		 * prison_free() won't re-submit the task.
 		 */
-		refcount_acquire(&pr->pr_ref);
+		prison_hold(pr);
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 	}
 }
@@ -2768,18 +2732,18 @@ static void
 prison_complete(void *context, int pending)
 {
 	struct prison *pr = context;
+	int drflags;
 
-	sx_xlock(&allprison_lock);
-	mtx_lock(&pr->pr_mtx);
 	/*
-	 * If this is completing a call to prison_proc_free, there will still
-	 * be a user reference held; clear that as well as the reference that
-	 * was added.  No references are expected if this is completing a call
-	 * to prison_free, but prison_deref is still called for the cleanup.
+	 * This could be called to release the last reference, or the
+	 * last user reference; the existence of a user reference implies
+	 * the latter.  There will always be a reference to remove, as
+	 * prison_proc_free adds one.
 	 */
-	prison_deref(pr, refcount_load(&pr->pr_uref) > 0
-	    ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
-	    : PD_LOCKED | PD_LIST_XLOCKED);
+	drflags = prison_lock_xlock(pr, PD_DEREF);
+	if (refcount_load(&pr->pr_uref) > 0)
+		drflags |= PD_DEUREF;
+	prison_deref(pr, drflags);
 }
 
 /*
@@ -2794,84 +2758,86 @@ static void
 prison_deref(struct prison *pr, int flags)
 {
 	struct prisonlist freeprison;
-	struct prison *rpr, *tpr;
-	int lastref, lasturef;
+	struct prison *rpr, *ppr, *tpr;
 
 	TAILQ_INIT(&freeprison);
-	if (!(flags & PD_LOCKED))
-		mtx_lock(&pr->pr_mtx);
 	/*
 	 * Release this prison as requested, which may cause its parent
 	 * to be released, and then maybe its grandparent, etc.
 	 */
 	for (;;) {
 		if (flags & PD_DEUREF) {
+			/* Drop a user reference. */
 			KASSERT(refcount_load(&pr->pr_uref) > 0,
 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
 			     pr->pr_id));
-			lasturef = refcount_release(&pr->pr_uref);
-			if (lasturef)
-				refcount_acquire(&pr->pr_ref);
-			KASSERT(refcount_load(&prison0.pr_uref) > 0,
-			    ("prison0 pr_uref=0"));
-		} else
-			lasturef = 0;
+			if (!refcount_release_if_not_last(&pr->pr_uref)) {
+				if (!(flags & PD_DEREF)) {
+					prison_hold(pr);
+					flags |= PD_DEREF;
+				}
+				flags = prison_lock_xlock(pr, flags);
+				if (refcount_release(&pr->pr_uref)) {
+					/*
+					 * When the last user references goes,
+					 * this becomes a dying prison.
+					 */
+					KASSERT(
+					    refcount_load(&prison0.pr_uref) > 0,
+					    ("prison0 pr_uref=0"));
+					mtx_unlock(&pr->pr_mtx);
+					flags &= ~PD_LOCKED;
+					(void)osd_jail_call(pr,
+					    PR_METHOD_REMOVE, NULL);
+				}
+			}
+		}
 		if (flags & PD_DEREF) {
+			/* Drop a reference. */
 			KASSERT(refcount_load(&pr->pr_ref) > 0,
 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
 			     pr->pr_id));
-			lastref = refcount_release(&pr->pr_ref);
-		}
-		else
-			lastref = refcount_load(&pr->pr_ref) == 0;
-		mtx_unlock(&pr->pr_mtx);
-
-		/*
-		 * Tell the modules if the last user reference was removed
-		 * (even it sticks around in dying state).
-		 */
-		if (lasturef) {
-			if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
-				if (atomic_load_acq_int(&pr->pr_ref) > 1) {
-					sx_slock(&allprison_lock);
-					flags |= PD_LIST_SLOCKED;
-				} else {
-					sx_xlock(&allprison_lock);
-					flags |= PD_LIST_XLOCKED;
+			if (!refcount_release_if_not_last(&pr->pr_ref)) {
+				flags = prison_lock_xlock(pr, flags);
+				if (refcount_release(&pr->pr_ref)) {
+					/*
+					 * When the last reference goes,
+					 * unlink the prison and set it aside.
+					 */
+					KASSERT(
+					    refcount_load(&pr->pr_uref) == 0,
+					    ("prison_deref: last ref, "
+					     "but still has %d urefs (jid=%d)",
+					     pr->pr_uref, pr->pr_id));
+					KASSERT(
+					    refcount_load(&prison0.pr_ref) != 0,
+					    ("prison0 pr_ref=0"));
+					TAILQ_REMOVE(&allprison, pr, pr_list);
+					LIST_REMOVE(pr, pr_sibling);
+					TAILQ_INSERT_TAIL(&freeprison, pr,
+					    pr_list);
+					for (ppr = pr->pr_parent;
+					     ppr != NULL;
+					     ppr = ppr->pr_parent)
+						ppr->pr_childcount--;
+					/*
+					 * Removing a prison frees references
+					 * from its parent.
+					 */
+					mtx_unlock(&pr->pr_mtx);
+					flags &= ~PD_LOCKED;
+					pr = pr->pr_parent;
+					flags |= PD_DEREF | PD_DEUREF;
+					continue;
 				}
 			}
-			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
-			mtx_lock(&pr->pr_mtx);
-			lastref = refcount_release(&pr->pr_ref);
-			mtx_unlock(&pr->pr_mtx);
 		}
-
-		if (!lastref)
-			break;
-
-		if (flags & PD_LIST_SLOCKED) {
-			if (!sx_try_upgrade(&allprison_lock)) {
-				sx_sunlock(&allprison_lock);
-				sx_xlock(&allprison_lock);
-			}
-			flags &= ~PD_LIST_SLOCKED;
-		} else if (!(flags & PD_LIST_XLOCKED))
-			sx_xlock(&allprison_lock);
-		flags |= PD_LIST_XLOCKED;
-
-		TAILQ_REMOVE(&allprison, pr, pr_list);
-		LIST_REMOVE(pr, pr_sibling);
-		TAILQ_INSERT_TAIL(&freeprison, pr, pr_list);
-		for (tpr = pr->pr_parent; tpr != NULL; tpr = tpr->pr_parent)
-			tpr->pr_childcount--;
-
-		/* Removing a prison frees a reference on its parent. */
-		pr = pr->pr_parent;
-		mtx_lock(&pr->pr_mtx);
-		flags |= PD_DEREF | PD_DEUREF;
+		break;
 	}
 
 	/* Release all the prison locks. */
+	if (flags & PD_LOCKED)
+		mtx_unlock(&pr->pr_mtx);
 	if (flags & PD_LIST_SLOCKED)
 		sx_sunlock(&allprison_lock);
 	else if (flags & PD_LIST_XLOCKED)
@@ -2902,10 +2868,47 @@ prison_deref(struct prison *pr, int flags)
 		if (racct_enable)
 			prison_racct_detach(rpr);
 #endif
+		TAILQ_REMOVE(&freeprison, rpr, pr_list);
 		free(rpr, M_PRISON);
 	}
 }
 
+/*
+ * Given the current locking state in the flags, make sure allprison_lock
+ * is held exclusive, and the prison is locked.  Return flags indicating
+ * the new state.
+ */
+static int
+prison_lock_xlock(struct prison *pr, int flags)
+{
+
+	if (!(flags & PD_LIST_XLOCKED)) {
+		/*
+		 * Get allprison_lock, which may be an upgrade,
+		 * and may require unlocking the prison.
+		 */
+		if (flags & PD_LOCKED) {
+			mtx_unlock(&pr->pr_mtx);
+			flags &= ~PD_LOCKED;
+		}
+		if (flags & PD_LIST_SLOCKED) {
+			if (!sx_try_upgrade(&allprison_lock)) {
+				sx_sunlock(&allprison_lock);
+				sx_xlock(&allprison_lock);
+			}
+			flags &= ~PD_LIST_SLOCKED;
+		} else
+			sx_xlock(&allprison_lock);
+		flags |= PD_LIST_XLOCKED;
+	}
+	if (!(flags & PD_LOCKED)) {
+		/* Lock the prison mutex. */
+		mtx_lock(&pr->pr_mtx);
+		flags |= PD_LOCKED;
+	}
+	return (flags);
+}
+
 /*
  * Set or clear a permission bit in the pr_allow field, passing restrictions
  * (cleared permission) down to child jails.
@@ -3068,15 +3071,13 @@ prison_ischild(struct prison *pr1, struct prison *pr2)
 }
 
 /*
- * Return true if the prison is currently alive.  A prison is alive if it is
- * valid and holds user references, and it isn't being removed.
+ * Return true if the prison is currently alive.  A prison is alive if it
+ * holds user references and it isn't being removed.
  */
 bool
 prison_isalive(struct prison *pr)
 {
 
-	if (__predict_false(refcount_load(&pr->pr_ref) == 0))
-		return (false);
 	if (__predict_false(refcount_load(&pr->pr_uref) == 0))
 		return (false);
 	if (__predict_false(pr->pr_flags & PR_REMOVE))
@@ -3087,7 +3088,9 @@ prison_isalive(struct prison *pr)
 /*
  * Return true if the prison is currently valid.  A prison is valid if it has
  * been fully created, and is not being destroyed.  Note that dying prisons
- * are still considered valid.
+ * are still considered valid.  Invalid prisons won't be found under normal
+ * circumstances, as they're only put in that state by functions that have
+ * an exclusive hold on allprison_lock.
  */
 bool
 prison_isvalid(struct prison *pr)
@@ -3754,10 +3757,6 @@ sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 			    cpr->pr_ip6s * sizeof(struct in6_addr));
 		}
 #endif
-		if (!prison_isvalid(cpr)) {
-			mtx_unlock(&cpr->pr_mtx);
-			continue;
-		}
 		bzero(xp, sizeof(*xp));
 		xp->pr_version = XPRISON_VERSION;
 		xp->pr_id = cpr->pr_id;
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
index f048234f3a33..435235f0384d 100644
--- a/sys/kern/sysv_msg.c
+++ b/sys/kern/sysv_msg.c
@@ -290,7 +290,7 @@ msginit()
 		if (rsv == NULL)
 			rsv = osd_reserve(msg_prison_slot);
 		prison_lock(pr);
-		if (prison_isvalid(pr) && (pr->pr_allow & PR_ALLOW_SYSVIPC)) {
+		if (pr->pr_allow & PR_ALLOW_SYSVIPC) {
 			(void)osd_jail_set_reserved(pr, msg_prison_slot, rsv,
 			    &prison0);
 			rsv = NULL;
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
index deee60d87a5a..dd8925246d1e 100644
--- a/sys/kern/sysv_sem.c
+++ b/sys/kern/sysv_sem.c
@@ -321,7 +321,7 @@ seminit(void)
 		if (rsv == NULL)
 			rsv = osd_reserve(sem_prison_slot);
 		prison_lock(pr);
-		if (prison_isvalid(pr) && (pr->pr_allow & PR_ALLOW_SYSVIPC)) {
+		if (pr->pr_allow & PR_ALLOW_SYSVIPC) {
 			(void)osd_jail_set_reserved(pr, sem_prison_slot, rsv,
 			    &prison0);
 			rsv = NULL;
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index ad5f0030b965..2e7ae927dcc3 100644
--- a/sys/kern/sysv_shm.c
+++ b/sys/kern/sysv_shm.c
@@ -979,7 +979,7 @@ shminit(void)
 		if (rsv == NULL)
 			rsv = osd_reserve(shm_prison_slot);
 		prison_lock(pr);
-		if (prison_isvalid(pr) && (pr->pr_allow & PR_ALLOW_SYSVIPC)) {
+		if (pr->pr_allow & PR_ALLOW_SYSVIPC) {
 			(void)osd_jail_set_reserved(pr, shm_prison_slot, rsv,
 			    &prison0);
 			rsv = NULL;
diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c
index dc94ce213d08..5c1775a261fc 100644
--- a/sys/kern/uipc_mqueue.c
+++ b/sys/kern/uipc_mqueue.c
@@ -1564,29 +1564,26 @@ mqfs_prison_remove(void *obj, void *data __unused)
 	const struct prison *pr = obj;
 	struct prison *tpr;
 	struct mqfs_node *pn, *tpn;
-	int found;
+	struct vnode *pr_root;
 
-	found = 0;
+	pr_root = pr->pr_root;
+	if (pr->pr_parent->pr_root == pr_root)
+		return (0);
 	TAILQ_FOREACH(tpr, &allprison, pr_list) {
-		prison_lock(tpr);
-		if (tpr != pr && prison_isvalid(tpr) &&
-		    tpr->pr_root == pr->pr_root)
-			found = 1;
-		prison_unlock(tpr);
+		if (tpr != pr && tpr->pr_root == pr_root)
+			return (0);
 	}
-	if (!found) {
-		/*
-		 * No jails are rooted in this directory anymore,
-		 * so no queues should be either.
-		 */
-		sx_xlock(&mqfs_data.mi_lock);
-		LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children,
-		    mn_sibling, tpn) {
-			if (pn->mn_pr_root == pr->pr_root)
-				(void)do_unlink(pn, curthread->td_ucred);
-		}
-		sx_xunlock(&mqfs_data.mi_lock);
+	/*
+	 * No jails are rooted in this directory anymore,
+	 * so no queues should be either.
+	 */
+	sx_xlock(&mqfs_data.mi_lock);
+	LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children,
+	    mn_sibling, tpn) {
+		if (pn->mn_pr_root == pr_root)
+			(void)do_unlink(pn, curthread->td_ucred);
 	}
+	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index 2ac6aabdbd43..a48e189729dc 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -155,7 +155,8 @@ struct prison_racct;
  *   (m) locked by pr_mtx
  *   (p) locked by pr_mtx, and also at least shared allprison_lock required
  *       to update
- *   (r) atomic via refcount(9), pr_mtx required to decrement to zero
+ *   (r) atomic via refcount(9), pr_mtx and allprison_lock required to
+ *       decrement to zero
  */
 struct prison {
 	TAILQ_ENTRY(prison) pr_list;			/* (a) all prisons */


More information about the dev-commits-src-all mailing list