git: 7bb960ce6447 - stable/12 - MFC kern: cpuset: properly rebase when attaching to a jail

Kyle Evans kevans at FreeBSD.org
Sun Dec 27 22:45:37 UTC 2020


The branch stable/12 has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=7bb960ce6447bd535e0fbb648e4d9edbb1dc067f

commit 7bb960ce6447bd535e0fbb648e4d9edbb1dc067f
Author:     Kyle Evans <kevans at FreeBSD.org>
AuthorDate: 2020-11-25 03:14:25 +0000
Commit:     Kyle Evans <kevans at FreeBSD.org>
CommitDate: 2020-12-27 21:44:23 +0000

    MFC kern: cpuset: properly rebase when attaching to a jail
    
    The current logic is a fine choice for a system administrator modifying
    process cpusets or a process creating a new cpuset(2), but not ideal for
    processes attaching to a jail.
    
    Currently, when a process attaches to a jail, it does exactly what any other
    process does and loses any mask it might have applied in the process of
    doing so because cpuset_setproc() is entirely based around the assumption
    that non-anonymous cpusets in the process can be replaced with the new
    parent set.
    
    This approach slightly improves the jail attach integration by modifying
    cpuset_setproc() callers to indicate if they should rebase their cpuset to
    the indicated set or not (i.e. cpuset_setproc_update_set).
    
    If we're rebasing and the process currently has a cpuset assigned that is
    not the containing jail's root set, then we will now create a new base set
    for it hanging off the jail's root with the existing mask applied instead of
    using the jail's root set as the new base set.
    
    Note that the common case will be that the process doesn't have a cpuset
    within the jail root, but the system root can freely assign a cpuset from
    a jail to a process outside of the jail with no restriction. We assume that
    this may have happened, or that it could happen due to a race when we drop
    the proc lock, so we must recheck both within the loop that gathers up
    sufficient free cpusets and after the loop.
    
    To recap, here's how it worked before in all cases:
    
    0     4 <-- jail              0      4 <-- jail / process
    |                             |
    1                 ->          1
    |
    3 <-- process
    
    Here's how it works now:
    
    0     4 <-- jail             0       4 <-- jail
    |                            |       |
    1                 ->         1       5 <-- process
    |
    3 <-- process
    
    or
    
    0     4 <-- jail             0       4 <-- jail / process
    |                            |
    1 <-- process     ->         1
    
    More importantly, in both cases, the attaching process still retains the
    mask it had prior to attaching or the attach fails with EDEADLK if it's
    left with no CPUs to run on or the domain policy is incompatible. The
    author of this patch considers this almost a security feature, because a MAC
    policy could grant PRIV_JAIL_ATTACH -- the ability to attach to a jail -- to
    an unprivileged user that's restricted to some subset of available CPUs;
    without this change, attaching to a jail with a wider mask might lift the
    user's restrictions.
    
    In most cases, it's anticipated that admins will use this to be able to,
    for example, `cpuset -c -l 1 jail -c path=/ command=/long/running/cmd`,
    and avoid the need for contortions to spawn a command inside a jail with a
    more limited cpuset than the jail.
    
    (cherry picked from commit d431dea5ac2aaab012f90182cf7ca13cc6dde5ea)
---
 sys/kern/kern_cpuset.c | 121 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index 7695f07983ac..288349633614 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -1138,14 +1138,63 @@ cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
 	    domainlist);
 }
 
+static int
+cpuset_setproc_newbase(struct thread *td, struct cpuset *set,
+    struct cpuset *nroot, struct cpuset **nsetp,
+    struct setlist *cpusets, struct domainlist *domainlist)
+{
+	struct domainset ndomain;
+	cpuset_t nmask;
+	struct cpuset *pbase;
+	int error;
+
+	pbase = cpuset_getbase(td->td_cpuset);
+
+	/* Copy process mask, then further apply the new root mask. */
+	CPU_COPY(&pbase->cs_mask, &nmask);
+	CPU_AND(&nmask, &nroot->cs_mask);
+
+	domainset_copy(pbase->cs_domain, &ndomain);
+	DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask);
+
+	/* Policy is too restrictive, will not work. */
+	if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask))
+		return (EDEADLK);
+
+	/*
+	 * Remove pbase from the freelist in advance, it'll be pushed to
+	 * cpuset_ids on success.  We assume here that cpuset_create() will not
+	 * touch pbase on failure, and we just enqueue it back to the freelist
+	 * to remain in a consistent state.
+	 */
+	pbase = LIST_FIRST(cpusets);
+	LIST_REMOVE(pbase, cs_link);
+	error = cpuset_create(&pbase, set, &nmask);
+	if (error != 0) {
+		LIST_INSERT_HEAD(cpusets, pbase, cs_link);
+		return (error);
+	}
+
+	/* Duplicates some work from above... oh well. */
+	pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain,
+	    domainlist);
+	*nsetp = pbase;
+	return (0);
+}
+
 /*
- * Handle three cases for updating an entire process.
+ * Handle four cases for updating an entire process.
  *
- * 1) Set is non-null.  This reparents all anonymous sets to the provided
- *    set and replaces all non-anonymous td_cpusets with the provided set.
- * 2) Mask is non-null.  This replaces or creates anonymous sets for every
+ * 1) Set is non-null and the process is not rebasing onto a new root.  This
+ *    reparents all anonymous sets to the provided set and replaces all
+ *    non-anonymous td_cpusets with the provided set.
+ * 2) Set is non-null and the process is rebasing onto a new root.  This
+ *    creates a new base set if the process previously had its own base set,
+ *    then reparents all anonymous sets either to that set or the provided set
+ *    if one was not created.  Non-anonymous sets are similarly replaced.
+ * 3) Mask is non-null.  This replaces or creates anonymous sets for every
  *    thread with the existing base as a parent.
- * 3) domain is non-null.  This creates anonymous sets for every thread
+ * 4) domain is non-null.  This creates anonymous sets for every thread
  *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a 
@@ -1154,15 +1203,15 @@ cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
  */
 static int
 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
-    struct domainset *domain)
+    struct domainset *domain, bool rebase)
 {
 	struct setlist freelist;
 	struct setlist droplist;
 	struct domainlist domainlist;
-	struct cpuset *nset;
+	struct cpuset *base, *nset, *nroot, *tdroot;
 	struct thread *td;
 	struct proc *p;
-	int threads;
+	int needed;
 	int nfree;
 	int error;
 
@@ -1178,21 +1227,49 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 	nfree = 1;
 	LIST_INIT(&droplist);
 	nfree = 0;
+	base = set;
+	nroot = NULL;
+	if (set != NULL)
+		nroot = cpuset_getroot(set);
 	for (;;) {
 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 		if (error)
 			goto out;
-		if (nfree >= p->p_numthreads)
+		tdroot = cpuset_getroot(td->td_cpuset);
+		needed = p->p_numthreads;
+		if (set != NULL && rebase && tdroot != nroot)
+			needed++;
+		if (nfree >= needed)
 			break;
-		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
-		if (nfree < threads) {
-			cpuset_freelist_add(&freelist, threads - nfree);
-			domainset_freelist_add(&domainlist, threads - nfree);
-			nfree = threads;
+		if (nfree < needed) {
+			cpuset_freelist_add(&freelist, needed - nfree);
+			domainset_freelist_add(&domainlist, needed - nfree);
+			nfree = needed;
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * If we're changing roots and the root set is what has been specified
+	 * as the parent, then we'll check if the process was previously using
+	 * the root set and, if it wasn't, create a new base with the process's
+	 * mask applied to it.
+	 */
+	if (set != NULL && rebase && nroot != tdroot) {
+		cpusetid_t base_id, root_id;
+
+		root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id;
+		base_id = cpuset_getbase(td->td_cpuset)->cs_id;
+
+		if (base_id != root_id) {
+			error = cpuset_setproc_newbase(td, set, nroot, &base,
+			    &freelist, &domainlist);
+			if (error != 0)
+				goto unlock_out;
+		}
+	}
+
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
 	 * make sure the operation will succeed before applying changes. The
@@ -1203,7 +1280,7 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 		thread_lock(td);
 		if (set != NULL)
 			error = cpuset_setproc_test_setthread(td->td_cpuset,
-			    set);
+			    base);
 		else
 			error = cpuset_setproc_test_maskthread(td->td_cpuset,
 			    mask, domain);
@@ -1219,7 +1296,7 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (set != NULL)
-			error = cpuset_setproc_setthread(td->td_cpuset, set,
+			error = cpuset_setproc_setthread(td->td_cpuset, base,
 			    &nset, &freelist, &domainlist);
 		else
 			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
@@ -1234,6 +1311,8 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 unlock_out:
 	PROC_UNLOCK(p);
 out:
+	if (base != NULL && base != set)
+		cpuset_rel(base);
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
 	cpuset_freelist_free(&freelist);
@@ -1618,7 +1697,7 @@ cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 
 	cpuset_ref(set);
-	error = cpuset_setproc(p->p_pid, set, NULL, NULL);
+	error = cpuset_setproc(p->p_pid, set, NULL, NULL, true);
 	if (error)
 		return (error);
 	cpuset_rel(set);
@@ -1668,7 +1747,7 @@ sys_cpuset(struct thread *td, struct cpuset_args *uap)
 		return (error);
 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 	if (error == 0)
-		error = cpuset_setproc(-1, set, NULL, NULL);
+		error = cpuset_setproc(-1, set, NULL, NULL, false);
 	cpuset_rel(set);
 	return (error);
 }
@@ -1702,7 +1781,7 @@ kern_cpuset_setid(struct thread *td, cpuwhich_t which,
 	set = cpuset_lookup(setid, td);
 	if (set == NULL)
 		return (ESRCH);
-	error = cpuset_setproc(id, set, NULL, NULL);
+	error = cpuset_setproc(id, set, NULL, NULL, false);
 	cpuset_rel(set);
 	return (error);
 }
@@ -1981,7 +2060,7 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
 			error = cpuset_setthread(id, mask);
 			break;
 		case CPU_WHICH_PID:
-			error = cpuset_setproc(id, NULL, mask, NULL);
+			error = cpuset_setproc(id, NULL, mask, NULL, false);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
@@ -2270,7 +2349,7 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
 			error = _cpuset_setthread(id, NULL, &domain);
 			break;
 		case CPU_WHICH_PID:
-			error = cpuset_setproc(id, NULL, NULL, &domain);
+			error = cpuset_setproc(id, NULL, NULL, &domain, false);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:


More information about the dev-commits-src-all mailing list