PERFORCE change 104658 for review
Chris Jones
cdjones at FreeBSD.org
Mon Aug 21 07:18:36 UTC 2006
http://perforce.freebsd.org/chv.cgi?CH=104658
Change 104658 by cdjones at cdjones-impulse on 2006/08/21 07:18:05
Introduce security.jail.limit_jail_memory and security.jail.jail_pager_interval sysctls. Bring jpager_td back into the build, running iff limit_jail_memory sysctl set. Get rid of old scheduler td cruft. Add jail_set_resource_limits syscall.
Affected files ...
.. //depot/projects/soc2006/cdjones_jail/src/sys/kern/kern_jail.c#23 edit
Differences ...
==== //depot/projects/soc2006/cdjones_jail/src/sys/kern/kern_jail.c#23 (text+ko) ====
@@ -5,6 +5,35 @@
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ----------------------------------------------------------------------------
+ *
+ * Portions copyright (c) 2006 Chris Jones
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Chris Jones
+ * thanks to the support of Google's Summer of Code program and
+ * mentoring by Kip Macy.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
*/
#include <sys/cdefs.h>
@@ -78,6 +107,17 @@
&jail_chflags_allowed, 0,
"Processes in jail can alter system file flags");
+/*
+ * security.jail.limit_jail_memory: master switch for per-jail memory
+ * limits.  jpager_td only compares usage against a prison's limit while
+ * this is non-zero.
+ */
+int	jail_limit_memory = 0;
+SYSCTL_INT(_security_jail, OID_AUTO, limit_jail_memory, CTLFLAG_RW,
+    &jail_limit_memory, 0,
+    "Limit jails' memory usage");
+
+/*
+ * security.jail.jail_pager_interval: delay between jpager_td passes.
+ * jpager_td multiplies the value by hz before sleeping, so the unit is
+ * seconds.  SYSCTL_INT() supplies the integer type itself, so only the
+ * access flags are passed (same form as limit_jail_memory above).
+ */
+int	jail_memory_pager_interval = 5;
+SYSCTL_INT(_security_jail, OID_AUTO, jail_pager_interval, CTLFLAG_RW,
+    &jail_memory_pager_interval, 0,
+    "Interval between jail memory limit checks");
+
/* allprison, lastprid, and prisoncount are protected by allprison_mtx. */
struct prisonlist allprison;
struct mtx allprison_mtx;
@@ -99,111 +139,104 @@
SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
-#if 0
+/*
+ * Per-jail pager kernel thread.
+ *
+ * arg is the struct prison this thread serves.  The thread loops until
+ * J_PAGER_TD_DIE is set in its flags word (reached by other code through
+ * pr->pr_pager_flags_ptr).  On every pass, if the limit_jail_memory sysctl
+ * is set and the prison has a non-zero memory limit, it compares
+ * prison_memory() against prison_memory_limit(); when the jail is over its
+ * limit it walks allproc and asks vm_pageout_map_deactivate_pages() to
+ * shrink each eligible process of the jail to 95% of its current resident
+ * count.  Between passes it sleeps on pr for jail_memory_pager_interval
+ * seconds.
+ */
 static void
 jpager_td(void *arg)
 {
- struct proc *p;
- struct prison *pr = arg;
- struct thread *td;
- long limit, cursize, newsize, usage;
- int breakout;
- int pr_id = pr->pr_id;
- int flags = J_SCHED_TD_ACTIVE;
- pr->pr_scheduler_flags_ptr = &flags;
-
- printf("Starting jpager/%d with memory limit %ld bytes\n",
- pr_id, (long) prison_memory_limit(pr));
-
- for (;;) {
- if (flags & J_PAGER_TD_DIE)
- break;
+	struct proc *p;
+	struct prison *pr = arg;
+	struct thread *td;
+	long limit, cursize, newsize, usage, excess;
+	int breakout, interval;
+	int pr_id = pr->pr_id;
+	int flags = J_PAGER_TD_ACTIVE;
+
+	/*
+	 * flags lives on this thread's stack; the prison keeps a pointer to
+	 * it so that other code can set J_PAGER_TD_DIE.
+	 * NOTE(review): the pointer dangles once this thread exits, and is
+	 * not yet set if teardown runs before this thread is first
+	 * scheduled -- confirm both cases are covered by the callers.
+	 */
+	pr->pr_pager_flags_ptr = &flags;
+
+	for (;;) {
+		if (flags & J_PAGER_TD_DIE)
+			break;
+
+		if (jail_limit_memory && pr->pr_mem_limit) {
+			/* TODO: consider whether it might be better to start
+			 * pushing back when we approach the limit, rather than
+			 * when we hit it.
+			 */
+			limit = (long) prison_memory_limit(pr);
+			usage = (long) prison_memory(pr);
+
+			/* The logic from vm_daemon() really needs to go here.
+			 * Problem: we want to push things below their rlimits.
+			 *
+			 * TODO: refactor vm_daemon to optionally act on specific jails?
+			 */
+
+			printf("jpager/%d: memory %ld / %ld bytes\n",
+			    pr_id, usage, limit);
+
+			/*
+			 * The limit was checked above without the prison
+			 * mutex, so it may have been cleared by the time
+			 * prison_memory_limit() read it; limit > 0 keeps the
+			 * percentage division below well-defined.
+			 */
+			if (limit > 0 && usage > limit) {
+				excess = usage - limit;
+				/*
+				 * Integer percentage: no floating point in
+				 * the kernel, and kernel printf has no %f.
+				 * Widen before multiplying so that
+				 * excess * 100 cannot overflow a 32-bit long.
+				 */
+				printf("jpager/%d: overcommitted by %ld bytes (%ld percent)\n",
+				    pr_id, excess,
+				    (long)(((int64_t)excess * 100) / limit));
+				sx_slock(&allproc_lock);
+				LIST_FOREACH(p, &allproc, p_list) {
+
+					if (pr != p->p_ucred->cr_prison)
+						continue;
+
+					PROC_LOCK(p);
+					if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+
+					/*
+					 * Skip the process if any of its
+					 * threads is neither on a run queue,
+					 * running, nor sleeping.
+					 */
+					mtx_lock_spin(&sched_lock);
+					breakout = 0;
+					FOREACH_THREAD_IN_PROC(p, td) {
+						if (!TD_ON_RUNQ(td) &&
+						    !TD_IS_RUNNING(td) &&
+						    !TD_IS_SLEEPING(td)) {
+							breakout = 1;
+							break;
+						}
+					}
+					mtx_unlock_spin(&sched_lock);
+					if (breakout) {
+						PROC_UNLOCK(p);
+						continue;
+					}
+
+					/* NOTE: we differ here from vm_daemon b/c we don't
+					 * care about the rlimit; things that are exceeding that will
+					 * get caught in due course. We need, however, to decrease
+					 * the pressure on our permitted memory allocation. Fortunately,
+					 * we only care about eventually hitting the limit, so if we
+					 * don't get there right away, it's okay.
+					 */
+
+					/* TODO: this arbitrarily reduces each process's space by
+					 * 5% (until it's completely swapped out) while
+					 * we're under memory pressure. A better way would be
+					 * to either hit large processes first, or to hit the
+					 * least-active processes first, or go proportionally,
+					 * or ....
+					 */
+					newsize = cursize = (long) vmspace_resident_count(p->p_vmspace);
+					newsize -= newsize / 20;
+					/* Clamp the target itself, never ask for a negative size. */
+					if (newsize < 0)
+						newsize = 0;
+					PROC_UNLOCK(p);
+					printf("jpager/%d: squeezing process %d from %ld to %ld\n",
+					    pr_id, p->p_pid, cursize, newsize);
+					/*
+					 * NOTE(review): p->p_vmspace is used here
+					 * after PROC_UNLOCK() without taking a
+					 * reference on the vmspace -- confirm it
+					 * cannot be released underneath us.
+					 */
+					vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize);
+				} /* end LIST_FOREACH procs */
+				sx_sunlock(&allproc_lock);
+			}
+		}
- /* TODO: consider whether it might be better to start
- * pushing back when we approach the limit, rather than
- * when we hit it.
- */
- limit = (long) prison_memory_limit(pr);
- usage = (long) prison_memory(pr);
-
- /* The logic from vm_daemon() really needs to go here.
- * Problem: we want to push things below their rlimits.
- *
- * TODO: refactor vm_daemon to optionally act on specific jails?
- */
-
- printf("jpager/%d: memory %ld / %ld bytes\n",
- pr_id, usage, limit);
-
- if ((usage - limit) > 0) {
- printf("jpager/%d: overcommitted by %ld bytes (%f percent)\n",
- pr_id, usage - limit,
- (double) 100 * ((double) (usage - limit) / (double) limit));
- sx_slock(&allproc_lock);
- LIST_FOREACH(p, &allproc, p_list) {
-
- if (pr != p->p_ucred->cr_prison)
- continue;
-
- PROC_LOCK(p);
- if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
- PROC_UNLOCK(p);
- continue;
+		/*
+		 * A zero timeout makes tsleep() sleep with no timeout at
+		 * all, so a zero or negative sysctl value would park this
+		 * thread until the next wakeup(pr).  Clamp to one second.
+		 */
+		interval = jail_memory_pager_interval;
+		if (interval < 1)
+			interval = 1;
+		tsleep(pr, 0, "-", interval * hz);
+	}
- mtx_lock_spin(&sched_lock);
- breakout = 0;
- FOREACH_THREAD_IN_PROC(p, td) {
- if (!TD_ON_RUNQ(td) &&
- !TD_IS_RUNNING(td) &&
- !TD_IS_SLEEPING(td)) {
- breakout = 1;
- break;
- }
- }
- mtx_unlock_spin(&sched_lock);
- if (breakout) {
- PROC_UNLOCK(p);
- continue;
- }
-
- /* NOTE: we differ here from vm_daemon b/c we don't
- * care about the rlimit; things that are exceeding that will
- * get caught in due course. We need, however, to decrease
- * the pressure on our permitted memory allocation. Fortunately,
- * we only care about eventually hitting the limit, so if we
- * don't get there right away, it's okay.
- */
-
- /* TODO: this arbitrarily reduces each process's space by
- * 5% (until it's completely swapped out) while
- * we're under memory pressure. A better way would be
- * to either hit large processes first, or to hit the
- * least-active processes first, or go proportionally,
- * or ....
- */
- newsize = cursize = (long) vmspace_resident_count(p->p_vmspace);
- newsize -= newsize / 20;
- if (cursize < 0)
- newsize = 0;
- PROC_UNLOCK(p);
- printf("jpager/%d: squeezing process %d from %ld to %ld\n",
- pr_id, p->p_pid, cursize, newsize);
- vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize);
- } /* end LIST_FOREACH procs */
- sx_sunlock(&allproc_lock);
- }
-
- /* TODO --- make interval into a sysctl. */
- /* 6 seconds because VM recomputes totals every 5. */
- printf("jpager_td sleeping\n");
- tsleep(pr, 0, "-", 6 * hz);
- }
-
- printf("Exiting jpager_td\n");
- kthread_exit(0);
+	kthread_exit(0);
 }
-#endif
/*
* MPSAFE
@@ -219,7 +252,7 @@
struct prison *pr, *tpr;
struct jail j;
struct jail_attach_args jaa;
- /* struct proc *j_pager_proc = NULL; */
+ struct proc *j_pager_proc = NULL;
int vfslocked, error, tryprid;
error = copyin(uap->jail, &j, sizeof(j));
@@ -275,10 +308,10 @@
prisoncount++;
mtx_unlock(&allprison_mtx);
- /* if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id))
- goto e_dropprref;
+ if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id))
+ goto e_dropprref;
KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc"));
- pr->pr_pager = j_pager_proc; */
+ pr->pr_pager = j_pager_proc;
error = jail_attach(td, &jaa);
if (error)
@@ -404,8 +437,7 @@
mtx_unlock(&allprison_mtx);
/* Tell scheduler, pager to die. No need to wait. */
-/* *pr->pr_scheduler_flags_ptr = J_SCHED_TD_DIE;
- *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; */
+ *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE;
wakeup(pr);
TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
@@ -523,40 +555,36 @@
vm_pindex_t
prison_memory(struct prison *pr)
{
- struct proc *p;
- u_int mem_used = 0;
-
- /* TODO: cut this to search only procs in given jail. */
- FOREACH_PROC_IN_SYSTEM(p) {
- if (!jailed(p->p_ucred) ||
- (pr != p->p_ucred->cr_prison)) {
- continue;
- }
-
- /* Get memory usage (see vm/vm_map.h). */
- /* TODO maybe use vm_swrss? */
- mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */
- mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */
- mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */
- }
-
- /* Convert to bytes, cache (maybe unncessary?). */
- mem_used *= PAGE_SIZE;
- /* mtx_lock(&pr->pr_mtx);
- pr->pr_mem_usage = mem_used;
- mtx_unlock(&pr->pr_mtx); */
- return mem_used;
+	/*
+	 * Sum the text, data and stack sizes of every process whose
+	 * credential belongs to pr and return the total in bytes.
+	 *
+	 * The total is accumulated in the return type rather than in a
+	 * u_int: the page count is multiplied by PAGE_SIZE below, and doing
+	 * that in a u_int would wrap once the byte total exceeds UINT_MAX.
+	 *
+	 * NOTE(review): the process list is walked here without taking
+	 * allproc_lock or the per-process lock -- confirm that every caller
+	 * provides the needed protection.
+	 */
+	struct proc *p;
+	vm_pindex_t mem_used = 0;	/* pages while summing, bytes on return */
+
+	/* TODO: cut this to search only procs in given jail. */
+	FOREACH_PROC_IN_SYSTEM(p) {
+		if (!jailed(p->p_ucred) ||
+		    (pr != p->p_ucred->cr_prison)) {
+			continue;
+		}
+
+		/* Get memory usage (see vm/vm_map.h). */
+		/* TODO maybe use vm_swrss? */
+		mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */
+		mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */
+		mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */
+	}
+
+	/* Convert the page count to bytes. */
+	mem_used *= PAGE_SIZE;
+	return (mem_used);
}
/* Given credential, return permitted memory usage in bytes. */
vm_pindex_t
prison_memory_limit(struct prison *pr)
{
- vm_pindex_t memlimit;
- mtx_lock(&pr->pr_mtx);
- memlimit = (vm_pindex_t) pr->pr_mem_limit;
- mtx_unlock(&pr->pr_mtx);
- return memlimit;
+	vm_pindex_t limit;
+
+	/* Take a snapshot of pr_mem_limit while holding the prison's mutex. */
+	mtx_lock(&pr->pr_mtx);
+	limit = (vm_pindex_t)pr->pr_mem_limit;
+	mtx_unlock(&pr->pr_mtx);
+
+	return (limit);
}
/*
@@ -689,6 +717,52 @@
}
}
+/*
+ * Change resource limits for a prison.
+ *
+ * unsigned int jid: id of jail to mess with
+ *
+ * int cpushares:  0 -> remove prison from cpu limits
+ *                -1 -> don't change existing shares
+ *                >0 -> set cpu shares
+ *
+ * int memlimit:   0 -> remove prison from mem limits
+ *                -1 -> don't change existing limit
+ *                >0 -> set memory limit (bytes)
+ *
+ * Returns 0 on success, the error from suser() if the caller is not
+ * privileged, EINVAL if either value is below -1, and ESRCH if no prison
+ * has the given jid.
+ *
+ * TODO: might this be better handled via a writable
+ * sysctl than with a new syscall?
+ */
+int
+jail_set_resource_limits(struct thread *td, struct jail_set_resource_limits_args *uap)
+{
+	struct prison *pr;
+	int error;
+
+	error = suser(td);
+	if (error)
+		return (error);
+
+	/* -1 is the only negative value with a defined meaning. */
+	if (uap->cpushares < -1 || uap->memlimit < -1)
+		return (EINVAL);
+
+	mtx_lock(&allprison_mtx);
+	LIST_FOREACH(pr, &allprison, pr_list) {
+		if (pr->pr_id == uap->jid)
+			break;
+	}
+	if (pr == NULL) {
+		mtx_unlock(&allprison_mtx);
+		/* Report a real errno; a bare 1 would read as EPERM. */
+		return (ESRCH);
+	}
+
+	/*
+	 * allprison_mtx stays held across the update so the prison cannot
+	 * leave the list while its fields are being written.
+	 */
+	mtx_lock(&pr->pr_mtx);
+	if (uap->cpushares != -1)
+		pr->pr_sched_shares = uap->cpushares;
+	if (uap->memlimit != -1)
+		pr->pr_mem_limit = uap->memlimit;
+	mtx_unlock(&pr->pr_mtx);
+	mtx_unlock(&allprison_mtx);
+	return (0);
+}
+
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
More information about the p4-projects
mailing list