PERFORCE change 104658 for review

Chris Jones cdjones at FreeBSD.org
Mon Aug 21 07:18:36 UTC 2006


http://perforce.freebsd.org/chv.cgi?CH=104658

Change 104658 by cdjones at cdjones-impulse on 2006/08/21 07:18:05

	Introduce security.jail.limit_jail_memory and security.jail.jail_pager_interval sysctls.  Bring jpager_td back into the build, running iff limit_jail_memory sysctl set.  Get rid of old scheduler td cruft.  Add jail_set_resource_limits syscall.

Affected files ...

.. //depot/projects/soc2006/cdjones_jail/src/sys/kern/kern_jail.c#23 edit

Differences ...

==== //depot/projects/soc2006/cdjones_jail/src/sys/kern/kern_jail.c#23 (text+ko) ====

@@ -5,6 +5,35 @@
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
+ *
+ *  Portions copyright (c) 2006 Chris Jones
+ *  All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Chris Jones
+ * thanks to the support of Google's Summer of Code program and
+ * mentoring by Kip Macy.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE. 
+ *
  */
 
 #include <sys/cdefs.h>
@@ -78,6 +107,17 @@
     &jail_chflags_allowed, 0,
     "Processes in jail can alter system file flags");
 
+int	jail_limit_memory = 0;	/* non-zero: jpager threads enforce jail memory limits */
+SYSCTL_INT(_security_jail, OID_AUTO, limit_jail_memory, CTLFLAG_RW,
+    &jail_limit_memory, 0,
+    "Limit jails' memory usage");
+
+/* Multiplied by hz to form the jpager tsleep() timeout, i.e. expressed in seconds. */
+int	jail_memory_pager_interval = 5;
+SYSCTL_INT(_security_jail, OID_AUTO, jail_pager_interval, CTLFLAG_RW,
+    &jail_memory_pager_interval, 0,
+    "Interval between jail memory limit checks");
+
 /* allprison, lastprid, and prisoncount are protected by allprison_mtx. */
 struct	prisonlist allprison;
 struct	mtx allprison_mtx;
@@ -99,111 +139,104 @@
 
 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
 
-#if 0
 static void
 jpager_td(void *arg)
 {
-  struct proc *p;
-  struct prison *pr = arg;
-  struct thread *td;
-  long limit, cursize, newsize, usage;
-  int breakout;
-  int pr_id = pr->pr_id;
-  int flags = J_SCHED_TD_ACTIVE;
-  pr->pr_scheduler_flags_ptr = &flags;
-    
-  printf("Starting jpager/%d with memory limit %ld bytes\n", 
-         pr_id, (long) prison_memory_limit(pr));
-  
-  for (;;) {
-    if (flags & J_PAGER_TD_DIE)
-      break;
+	struct proc *p;
+	struct prison *pr = arg;
+	struct thread *td;
+	long limit, cursize, newsize, usage;
+	int breakout;
+	int pr_id = pr->pr_id;
+	int flags = J_PAGER_TD_ACTIVE;	/* lives on this thread's stack */
+
+	pr->pr_pager_flags_ptr = &flags;	/* lets prison teardown set J_PAGER_TD_DIE */
+	for (;;) {
+		if (flags & J_PAGER_TD_DIE)
+			break;
+		/* Enforce only when the global sysctl and this jail's limit are both set. */
+		if (jail_limit_memory && pr->pr_mem_limit) {
+			/* TODO: consider whether it might be better to start
+			 * pushing back when we approach the limit, rather than
+			 * when we hit it.
+			 */
+			limit = (long) prison_memory_limit(pr);
+			usage = (long) prison_memory(pr);
+
+			/* The logic from vm_daemon() really needs to go here.
+			 * Problem: we want to push things below their rlimits.
+			 *
+			 * TODO: refactor vm_daemon to optionally act on specific jails?
+			 */
+
+			printf("jpager/%d: memory %ld / %ld bytes\n",
+			       pr_id, usage, limit);
+			/* limit was re-read under pr_mtx and may have dropped to 0 since the test above. */
+			if (limit > 0 && usage > limit) {
+				/* Integer percentage: kernel printf has no %f and the kernel avoids FP. */
+				printf("jpager/%d: overcommitted by %ld bytes (%ld percent)\n",
+				       pr_id, usage - limit, (long)(((int64_t)(usage - limit) * 100) / limit));
+				sx_slock(&allproc_lock);
+				LIST_FOREACH(p, &allproc, p_list) {
+					/* Only processes whose credential points at this prison are squeezed. */
+					if (pr != p->p_ucred->cr_prison)
+						continue;
+
+					PROC_LOCK(p);
+					if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+					/* Skip the process if any thread is neither queued, running nor sleeping. */
+					mtx_lock_spin(&sched_lock);
+					breakout = 0;
+					FOREACH_THREAD_IN_PROC(p, td) {
+						if (!TD_ON_RUNQ(td) &&
+						    !TD_IS_RUNNING(td) &&
+						    !TD_IS_SLEEPING(td)) {
+							breakout = 1;
+							break;
+						}
+					}
+					mtx_unlock_spin(&sched_lock);
+					if (breakout) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+
+					/* NOTE: we differ here from vm_daemon b/c we don't
+					 * care about the rlimit; things that are exceeding that will
+					 * get caught in due course.  We need, however, to decrease
+					 * the pressure on our permitted memory allocation.  Fortunately,
+					 * we only care about eventually hitting the limit, so if we
+					 * don't get there right away, it's okay.
+					 */
+
+					/* TODO: this arbitrarily reduces each process's space by
+					 * 5% (until it's completely swapped out) while
+					 * we're under memory pressure.  A better way would be
+					 * to either hit large processes first, or to hit the
+					 * least-active processes first, or go proportionally,
+					 * or ....
+					 */
+					newsize = cursize = (long) vmspace_resident_count(p->p_vmspace);
+					newsize -= newsize / 20;
+					if (cursize < 0)
+						newsize = 0;
+					PROC_UNLOCK(p);
+					printf("jpager/%d: squeezing process %d from %ld to %ld\n",
+					       pr_id, p->p_pid, cursize, newsize);
+					vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize);
+				} /* end LIST_FOREACH procs */
+				sx_sunlock(&allproc_lock);
+			}
+		}
 
-    /* TODO: consider whether it might be better to start
-     * pushing back when we approach the limit, rather than
-     * when we hit it.
-     */
-    limit = (long) prison_memory_limit(pr);
-    usage = (long) prison_memory(pr);
-
-    /* The logic from vm_daemon() really needs to go here.
-     * Problem: we want to push things below their rlimits.
-     *
-     * TODO: refactor vm_daemon to optionally act on specific jails?
-     */
-
-    printf("jpager/%d: memory %ld / %ld bytes\n", 
-           pr_id, usage, limit);
-
-    if ((usage - limit) > 0) {
-      printf("jpager/%d: overcommitted by %ld bytes (%f percent)\n",
-             pr_id, usage - limit,
-             (double) 100 * ((double) (usage - limit) / (double) limit)); 
-      sx_slock(&allproc_lock);
-      LIST_FOREACH(p, &allproc, p_list) {
-	
-	if (pr != p->p_ucred->cr_prison)
-	  continue;
-	
-	PROC_LOCK(p);
-	if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
-	  PROC_UNLOCK(p);
-	  continue;
+		tsleep(pr, 0, "-", jail_memory_pager_interval * hz); 
 	}
 	
-	mtx_lock_spin(&sched_lock);
-	breakout = 0;
-	FOREACH_THREAD_IN_PROC(p, td) {
-	  if (!TD_ON_RUNQ(td) &&
-	      !TD_IS_RUNNING(td) &&
-	      !TD_IS_SLEEPING(td)) {
-	    breakout = 1;
-	    break;
-	  }
-	}
-	mtx_unlock_spin(&sched_lock);
-	if (breakout) {
-	  PROC_UNLOCK(p);
-	  continue;
-	}
-	
-	/* NOTE: we differ here from vm_daemon b/c we don't 
-	 * care about the rlimit; things that are exceeding that will
-	 * get caught in due course.  We need, however, to decrease
-	 * the pressure on our permitted memory allocation.  Fortunately, 
-	 * we only care about eventually hitting the limit, so if we
-	 * don't get there right away, it's okay.
-	 */      
-	
-	/* TODO: this arbitrarily reduces each process's space by
-	 * 5% (until it's completely swapped out) while
-	 * we're under memory pressure.  A better way would be 
-	 * to either hit large processes first, or to hit the
-	 * least-active processes first, or go proportionally,
-         * or .... 
-	 */
-	newsize = cursize = (long) vmspace_resident_count(p->p_vmspace);
-	newsize -= newsize / 20;
-	if (cursize < 0)
-	  newsize = 0;
-	PROC_UNLOCK(p);
-	printf("jpager/%d: squeezing process %d from %ld to %ld\n", 
-               pr_id, p->p_pid, cursize, newsize);
-	vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize);
-      } /* end LIST_FOREACH procs */
-      sx_sunlock(&allproc_lock);
-    }
-    
-    /* TODO --- make interval into a sysctl. */
-    /* 6 seconds because VM recomputes totals every 5. */
-    printf("jpager_td sleeping\n");
-    tsleep(pr, 0, "-", 6 * hz); 
-  }
-
-  printf("Exiting jpager_td\n");
-  kthread_exit(0);
+	kthread_exit(0);
 }
-#endif
 
 /*
  * MPSAFE
@@ -219,7 +252,7 @@
 	struct prison *pr, *tpr;
 	struct jail j;
 	struct jail_attach_args jaa;
-	/*	struct proc *j_pager_proc = NULL; */
+	struct proc *j_pager_proc = NULL;
 	int vfslocked, error, tryprid;
 
 	error = copyin(uap->jail, &j, sizeof(j));
@@ -275,10 +308,10 @@
 	prisoncount++;
 	mtx_unlock(&allprison_mtx);
 
-	/*	if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id))
-	  goto e_dropprref;
+	if (kthread_create(jpager_td, pr, &j_pager_proc, 0, 0, "jpager %d", pr->pr_id))
+		goto e_dropprref;	/* NOTE(review): kthread_create's errno is discarded -- confirm e_dropprref reports failure */
 	KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc"));
-	pr->pr_pager = j_pager_proc; */
+	pr->pr_pager = j_pager_proc;
 
 	error = jail_attach(td, &jaa);
 	if (error)
@@ -404,8 +437,7 @@
 		mtx_unlock(&allprison_mtx);
 
                 /* Tell scheduler, pager to die.  No need to wait. */
-/*	        *pr->pr_scheduler_flags_ptr = J_SCHED_TD_DIE;
-		*pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; */
+		*pr->pr_pager_flags_ptr = J_PAGER_TD_DIE;
 		wakeup(pr);
 
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
@@ -523,40 +555,36 @@
 vm_pindex_t
 prison_memory(struct prison *pr)
 {
-  struct proc *p;
-  u_int mem_used = 0;
-
-  /* TODO: cut this to search only procs in given jail. */
-  FOREACH_PROC_IN_SYSTEM(p) {
-    if (!jailed(p->p_ucred) ||
-	(pr != p->p_ucred->cr_prison)) {
-      continue;
-    }
-
-    /* Get memory usage (see vm/vm_map.h). */
-    /* TODO maybe use vm_swrss? */
-    mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */
-    mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */
-    mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */
-  }
-
-  /* Convert to bytes, cache (maybe unncessary?). */
-  mem_used *= PAGE_SIZE;
-  /*  mtx_lock(&pr->pr_mtx);
-  pr->pr_mem_usage = mem_used;
-  mtx_unlock(&pr->pr_mtx); */
-  return mem_used;
+	struct proc *p;
+	vm_pindex_t mem_used = 0;	/* same type as the return value, so the byte total is not cut to u_int */
+
+	/* TODO: cut this to search only procs in given jail. */
+	/* NOTE(review): jpager_td calls this before taking allproc_lock -- confirm the walk is safe. */
+	FOREACH_PROC_IN_SYSTEM(p) {
+		if (!jailed(p->p_ucred) || pr != p->p_ucred->cr_prison)
+			continue;
+
+		/* Get memory usage (see vm/vm_map.h). */
+		/* TODO maybe use vm_swrss? */
+		mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */
+		mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */
+		mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */
+	}
+
+	/* The sum above is in pages; convert it to bytes. */
+	mem_used *= PAGE_SIZE;
+	return (mem_used);
 }
 
 /* Given credential, return permitted memory usage in bytes. */
 vm_pindex_t
 prison_memory_limit(struct prison *pr)
 {
-  vm_pindex_t memlimit;
-  mtx_lock(&pr->pr_mtx);
-  memlimit = (vm_pindex_t) pr->pr_mem_limit;
-  mtx_unlock(&pr->pr_mtx);
-  return memlimit;
+	vm_pindex_t bytes;
+	mtx_lock(&pr->pr_mtx);			/* hold the prison mutex for the read */
+	bytes = (vm_pindex_t)pr->pr_mem_limit;	/* copy out the configured limit */
+	mtx_unlock(&pr->pr_mtx);
+	return (bytes);
 }
 
 /*
@@ -689,6 +717,52 @@
 	}
 }
 
+/* 
+ * Change resource limit for a prison.
+ * 
+ * unsigned int jid: id of jail to mess with
+ *
+ * int cpushares:  0 -> remove prison from cpu limits
+ *                -1 -> don't change existing shares
+ *                >0 -> set cpu shares
+ *
+ * int memlimit:   0 -> remove prison from mem limits
+ *                -1 -> don't change existing limit
+ *                >0 -> set memory limit (bytes)
+ *
+ * TODO: might this be better handled via a writable 
+ * sysctl than with a new syscall?
+ */
+int
+jail_set_resource_limits(struct thread *td, struct jail_set_resource_limits_args *uap)
+{
+	struct prison *pr;
+	int error;
+
+	error = suser(td);	/* superuser only */
+	if (error == 0 && (uap->cpushares < -1 || uap->memlimit < -1))
+		error = EINVAL;	/* only -1 ("keep") and values >= 0 are defined */
+	if (error)
+		return (error);
+	mtx_lock(&allprison_mtx);	/* the allprison list is protected by allprison_mtx */
+	LIST_FOREACH(pr, &allprison, pr_list) {
+		if (pr->pr_id == uap->jid)
+			break;
+	}
+	if (pr == NULL) {
+		mtx_unlock(&allprison_mtx);
+		return (EINVAL);	/* no such jail; a bare 1 would read as EPERM */
+	}
+	mtx_lock(&pr->pr_mtx);	/* allprison_mtx is still held while the prison is updated */
+	if (uap->cpushares != -1)
+		pr->pr_sched_shares = uap->cpushares;
+	if (uap->memlimit != -1)
+		pr->pr_mem_limit = uap->memlimit;
+	mtx_unlock(&pr->pr_mtx);
+	mtx_unlock(&allprison_mtx);
+	return (0);
+}
+
 static int
 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 {


More information about the p4-projects mailing list