svn commit: r368011 - head/sys/kern

Kyle Evans kevans at FreeBSD.org
Wed Nov 25 03:14:26 UTC 2020


Author: kevans
Date: Wed Nov 25 03:14:25 2020
New Revision: 368011
URL: https://svnweb.freebsd.org/changeset/base/368011

Log:
  kern: cpuset: properly rebase when attaching to a jail
  
  The current logic is a fine choice for a system administrator modifying
  process cpusets or a process creating a new cpuset(2), but not ideal for
  processes attaching to a jail.
  
  Currently, when a process attaches to a jail, it is handled exactly like any
  other process and loses whatever mask it may have had applied, because
  cpuset_setproc() is built entirely around the assumption that the process's
  non-anonymous cpusets can simply be replaced with the new parent set.
  
  This change improves the jail attach integration slightly by having
  cpuset_setproc() callers indicate whether the process's cpuset should be
  rebased onto the indicated set or not (only the jail attach path,
  cpuset_setproc_update_set(), requests a rebase).
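
  Condensed from the diff below, the caller-visible change is just one new
  boolean parameter:
  
        static int
        cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
            struct domainset *domain, bool rebase);
  
        /* Jail attach (via cpuset_setproc_update_set()) rebases: */
        error = cpuset_setproc(p->p_pid, set, NULL, NULL, true);
  
        /* Every other caller, e.g. kern_cpuset_setid(), keeps the old
         * behavior: */
        error = cpuset_setproc(id, set, NULL, NULL, false);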
  
  If we're rebasing and the process currently has a cpuset assigned that is
  not the containing jail's root set, then we will now create a new base set
  for it hanging off the jail's root with the existing mask applied instead of
  using the jail's root set as the new base set.
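
  The essence of the new cpuset_setproc_newbase() helper in the diff below
  (domain handling and freelist bookkeeping elided; nbase stands in for the
  set pulled off the freelist) is:
  
        pbase = cpuset_getbase(td->td_cpuset);
        CPU_COPY(&pbase->cs_mask, &nmask);   /* existing process mask */
        CPU_AND(&nmask, &nroot->cs_mask);    /* restrict to the jail root */
        if (CPU_EMPTY(&nmask))
                return (EDEADLK);            /* nothing left to run on */
        error = cpuset_create(&nbase, set, &nmask); /* new base under jail */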
  
  Note that in the common case the process will not already have a cpuset
  within the jail root, but the system root can freely assign a cpuset from a
  jail to a process outside of the jail with no restriction.  We assume that
  this may have happened, or could happen due to a race once we drop the proc
  lock, so we must recheck both inside the loop that gathers up enough free
  cpusets and again after the loop.
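
  In the preallocation loop this shows up as one extra set being required
  whenever the rebase will create a new base (taken from the diff below):
  
        needed = p->p_numthreads;
        if (set != NULL && rebase && tdroot != nroot)
                needed++;      /* one extra set for the rebased base */
        if (nfree >= needed)
                break;         /* enough sets; otherwise unlock, allocate
                                  more, and retry */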
  
  To recap, here's how it worked before in all cases:
  
  0     4 <-- jail              0      4 <-- jail / process
  |                             |
  1                 ->          1
  |
  3 <-- process
  
  Here's how it works now:
  
  0     4 <-- jail             0       4 <-- jail
  |                            |       |
  1                 ->         1       5 <-- process
  |
  3 <-- process
  
  or
  
  0     4 <-- jail             0       4 <-- jail / process
  |                            |
  1 <-- process     ->         1
  
  More importantly, in both cases the attaching process still retains the
  mask it had prior to attaching, or the attach fails with EDEADLK if it
  would be left with no CPUs to run on or with an incompatible domain policy.
  The author of this patch considers this almost a security feature: a MAC
  policy could grant PRIV_JAIL_ATTACH, and with it the ability to attach to a
  jail, to an unprivileged user that is restricted to some subset of the
  available CPUs; without this change, attaching to a jail with a wider mask
  could lift that restriction.
  
  In most cases, it's anticipated that admins will use this to run, for
  example, `cpuset -c -l 1 jail -c path=/ command=/long/running/cmd` and
  avoid the contortions otherwise needed to spawn a command inside a jail
  with a more limited cpuset than the jail's.
  
  Reviewed by:	jamie
  MFC after:	1 month (maybe)
  Differential Revision:	https://reviews.freebsd.org/D27298

Modified:
  head/sys/kern/kern_cpuset.c

Modified: head/sys/kern/kern_cpuset.c
==============================================================================
--- head/sys/kern/kern_cpuset.c	Wed Nov 25 02:12:24 2020	(r368010)
+++ head/sys/kern/kern_cpuset.c	Wed Nov 25 03:14:25 2020	(r368011)
@@ -1104,14 +1104,63 @@ cpuset_setproc_setthread(struct cpuset *tdset, struct 
 	    domainlist);
 }
 
+static int
+cpuset_setproc_newbase(struct thread *td, struct cpuset *set,
+    struct cpuset *nroot, struct cpuset **nsetp,
+    struct setlist *cpusets, struct domainlist *domainlist)
+{
+	struct domainset ndomain;
+	cpuset_t nmask;
+	struct cpuset *pbase;
+	int error;
+
+	pbase = cpuset_getbase(td->td_cpuset);
+
+	/* Copy process mask, then further apply the new root mask. */
+	CPU_COPY(&pbase->cs_mask, &nmask);
+	CPU_AND(&nmask, &nroot->cs_mask);
+
+	domainset_copy(pbase->cs_domain, &ndomain);
+	DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask);
+
+	/* Policy is too restrictive, will not work. */
+	if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask))
+		return (EDEADLK);
+
+	/*
+	 * Remove pbase from the freelist in advance, it'll be pushed to
+	 * cpuset_ids on success.  We assume here that cpuset_create() will not
+	 * touch pbase on failure, and we just enqueue it back to the freelist
+	 * to remain in a consistent state.
+	 */
+	pbase = LIST_FIRST(cpusets);
+	LIST_REMOVE(pbase, cs_link);
+	error = cpuset_create(&pbase, set, &nmask);
+	if (error != 0) {
+		LIST_INSERT_HEAD(cpusets, pbase, cs_link);
+		return (error);
+	}
+
+	/* Duplicates some work from above... oh well. */
+	pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain,
+	    domainlist);
+	*nsetp = pbase;
+	return (0);
+}
+
 /*
- * Handle three cases for updating an entire process.
+ * Handle four cases for updating an entire process.
  *
- * 1) Set is non-null.  This reparents all anonymous sets to the provided
- *    set and replaces all non-anonymous td_cpusets with the provided set.
- * 2) Mask is non-null.  This replaces or creates anonymous sets for every
+ * 1) Set is non-null and the process is not rebasing onto a new root.  This
+ *    reparents all anonymous sets to the provided set and replaces all
+ *    non-anonymous td_cpusets with the provided set.
+ * 2) Set is non-null and the process is rebasing onto a new root.  This
+ *    creates a new base set if the process previously had its own base set,
+ *    then reparents all anonymous sets either to that set or the provided set
+ *    if one was not created.  Non-anonymous sets are similarly replaced.
+ * 3) Mask is non-null.  This replaces or creates anonymous sets for every
  *    thread with the existing base as a parent.
- * 3) domain is non-null.  This creates anonymous sets for every thread
+ * 4) domain is non-null.  This creates anonymous sets for every thread
  *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a 
@@ -1120,15 +1169,15 @@ cpuset_setproc_setthread(struct cpuset *tdset, struct 
  */
 static int
 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
-    struct domainset *domain)
+    struct domainset *domain, bool rebase)
 {
 	struct setlist freelist;
 	struct setlist droplist;
 	struct domainlist domainlist;
-	struct cpuset *nset;
+	struct cpuset *base, *nset, *nroot, *tdroot;
 	struct thread *td;
 	struct proc *p;
-	int threads;
+	int needed;
 	int nfree;
 	int error;
 
@@ -1144,22 +1193,50 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
 	nfree = 1;
 	LIST_INIT(&droplist);
 	nfree = 0;
+	base = set;
+	nroot = NULL;
+	if (set != NULL)
+		nroot = cpuset_getroot(set);
 	for (;;) {
 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 		if (error)
 			goto out;
-		if (nfree >= p->p_numthreads)
+		tdroot = cpuset_getroot(td->td_cpuset);
+		needed = p->p_numthreads;
+		if (set != NULL && rebase && tdroot != nroot)
+			needed++;
+		if (nfree >= needed)
 			break;
-		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
-		if (nfree < threads) {
-			cpuset_freelist_add(&freelist, threads - nfree);
-			domainset_freelist_add(&domainlist, threads - nfree);
-			nfree = threads;
+		if (nfree < needed) {
+			cpuset_freelist_add(&freelist, needed - nfree);
+			domainset_freelist_add(&domainlist, needed - nfree);
+			nfree = needed;
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
+
 	/*
+	 * If we're changing roots and the root set is what has been specified
+	 * as the parent, then we'll check if the process was previously using
+	 * the root set and, if it wasn't, create a new base with the process's
+	 * mask applied to it.
+	 */
+	if (set != NULL && rebase && nroot != tdroot) {
+		cpusetid_t base_id, root_id;
+
+		root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id;
+		base_id = cpuset_getbase(td->td_cpuset)->cs_id;
+
+		if (base_id != root_id) {
+			error = cpuset_setproc_newbase(td, set, nroot, &base,
+			    &freelist, &domainlist);
+			if (error != 0)
+				goto unlock_out;
+		}
+	}
+
+	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
 	 * make sure the operation will succeed before applying changes. The
 	 * proc lock prevents td_cpuset from changing between calls.
@@ -1169,7 +1246,7 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
 		thread_lock(td);
 		if (set != NULL)
 			error = cpuset_setproc_test_setthread(td->td_cpuset,
-			    set);
+			    base);
 		else
 			error = cpuset_setproc_test_maskthread(td->td_cpuset,
 			    mask, domain);
@@ -1185,7 +1262,7 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (set != NULL)
-			error = cpuset_setproc_setthread(td->td_cpuset, set,
+			error = cpuset_setproc_setthread(td->td_cpuset, base,
 			    &nset, &freelist, &domainlist);
 		else
 			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
@@ -1200,6 +1277,8 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t
 unlock_out:
 	PROC_UNLOCK(p);
 out:
+	if (base != NULL && base != set)
+		cpuset_rel(base);
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
 	cpuset_freelist_free(&freelist);
@@ -1584,7 +1663,7 @@ cpuset_setproc_update_set(struct proc *p, struct cpuse
 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 
 	cpuset_ref(set);
-	error = cpuset_setproc(p->p_pid, set, NULL, NULL);
+	error = cpuset_setproc(p->p_pid, set, NULL, NULL, true);
 	if (error)
 		return (error);
 	cpuset_rel(set);
@@ -1634,7 +1713,7 @@ sys_cpuset(struct thread *td, struct cpuset_args *uap)
 		return (error);
 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 	if (error == 0)
-		error = cpuset_setproc(-1, set, NULL, NULL);
+		error = cpuset_setproc(-1, set, NULL, NULL, false);
 	cpuset_rel(set);
 	return (error);
 }
@@ -1668,7 +1747,7 @@ kern_cpuset_setid(struct thread *td, cpuwhich_t which,
 	set = cpuset_lookup(setid, td);
 	if (set == NULL)
 		return (ESRCH);
-	error = cpuset_setproc(id, set, NULL, NULL);
+	error = cpuset_setproc(id, set, NULL, NULL, false);
 	cpuset_rel(set);
 	return (error);
 }
@@ -1942,7 +2021,7 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t 
 			error = cpuset_setthread(id, mask);
 			break;
 		case CPU_WHICH_PID:
-			error = cpuset_setproc(id, NULL, mask, NULL);
+			error = cpuset_setproc(id, NULL, mask, NULL, false);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
@@ -2226,7 +2305,7 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t le
 			error = _cpuset_setthread(id, NULL, &domain);
 			break;
 		case CPU_WHICH_PID:
-			error = cpuset_setproc(id, NULL, NULL, &domain);
+			error = cpuset_setproc(id, NULL, NULL, &domain, false);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:

