CPU limit for jails (patch for ULE scheduler)

Меньшиков Константин k.menshikov at peterhost.ru
Mon Apr 20 15:18:15 UTC 2009


Hello all!
Many users want to have resource limits for jails, for example CPU and
memory limits.
I have rewritten the original cdjones patch for jail CPU limits so that it
works under the ULE scheduler.
It works in a simple way.
We count CPU usage for all jails, and if a jail uses more CPU than its
share, we move its threads to the IDLE queue; otherwise we return them to
the TIMESHARE queue.
A jailed thread can use all available CPU time if the system has idle CPU.
If the system is under heavy load, a jailed thread cannot use the CPU as
long as the ratio (CPU shares for the jail / total CPU shares) <
(estimated CPU usage of the jail / total CPU usage).
Unjailed threads and interactive threads are not subject to this regime.
The patch adds 2 sysctls:
kern.sched.total_sched_shares - total number of CPU shares in the system
(1000 per core, default 2000); increase it if you have more CPUs
kern.sched.flush_estcpu_interval - interval, in ticks, after which estcpu
is flushed; the default is 2560 = 2 * 128 * 10 (NCPU*stathz*sec); increase
it if you have more CPUs
To use the CPU limit, pass the -S NSharedCPU flag to the /usr/sbin/jail
program.
My example: jail -S100 /usr/jails/root/ root.kostjn.pht  192.168.0.245
/bin/csh

I tested this with 10 simultaneous processes in the jail and in the main
system. The test program is an infinite loop, run on an 8-core Xeon with
RELENG_7. The processes were started first in the jail, and afterwards in
the main system.
Below is the CPU usage of one tracked process from each.
Jail
root    1052  0.0  0.0  3692   784  p1  RJ    7:38PM   0:00.39 /test.o
root    1052 21.2  0.0  3692   784  p1  RJ    7:38PM   0:02.40 /test.o
root    1052 35.6  0.0  3692   784  p1  RJ    7:38PM   0:04.40 /test.o
root    1052 47.5  0.0  3692   784  p1  RJ    7:38PM   0:06.41 /test.o
root    1052 39.9  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 33.2  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 27.6  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 22.9  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 19.0  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 15.8  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 13.0  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 10.8  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052  8.9  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /tes

Main system
root    1088 14.9  0.0  3692   780  p0  R     7:38PM   0:01.57 /root/test.o
root    1088 30.8  0.0  3692   780  p0  R     7:38PM   0:03.60 /root/test.o
root    1088 43.8  0.0  3692   780  p0  R     7:38PM   0:05.60 /root/test.o
root    1088 51.0  0.0  3692   780  p0  R     7:38PM   0:07.25 /root/test.o
root    1088 50.8  0.0  3692   780  p0  R     7:38PM   0:08.28 /root/test.o
root    1088 49.1  0.0  3692   780  p0  R     7:38PM   0:09.21 /root/test.o
root    1088 48.1  0.0  3692   780  p0  R     7:38PM   0:10.24 /root/test.o
root    1088 46.2  0.0  3692   780  p0  R     7:38PM   0:11.17 /root/test.o
root    1088 42.9  0.0  3692   780  p0  R     7:38PM   0:11.95 /root/test.o

So we see that once the processes in the main system are running, the jailed process can no longer use the CPU.

Please let me know about any problems in this patch.
This is an initial version, without the ability to tune jail parameters at runtime.

So, this works, but I'm not sure that it is the best way.

Attempting to increase the (numerical) priority of jailed threads does not
work, because non-interactive threads (those that use a lot of CPU) already
have a low priority (numerically high).
Attempting to decrease the number of ticks in the CPU time slice is also
not a good idea, because it increases the number of context switches under
high load.
Maybe you see another way to do this?
Please share your ideas.

Thanks.
Original cdjones CPU and memory limit patch:
http://wiki.freebsd.org/JailResourceLimits


-------------- next part --------------
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/kern_jail.c sys.new/kern/kern_jail.c
--- sys/kern/kern_jail.c	2009-03-10 22:33:50.000000000 +0300
+++ sys.new/kern/kern_jail.c	2009-04-17 18:51:34.000000000 +0400
@@ -531,6 +532,7 @@ kern_jail(struct thread *td, struct jail
 	}
 #endif
 	pr->pr_linux = NULL;
+    pr->pr_sched_shares = j->sched_shares;
 	pr->pr_securelevel = securelevel;
 	if (prison_service_slots == 0)
 		pr->pr_slots = NULL;
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/sched_ule.c sys.new/kern/sched_ule.c
--- sys/kern/sched_ule.c	2009-03-30 23:20:56.000000000 +0400
+++ sys.new/kern/sched_ule.c	2009-04-17 19:10:07.000000000 +0400
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u
 #include <sys/umtx.h>
 #include <sys/vmmeter.h>
 #include <sys/cpuset.h>
+#include <sys/jail.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
@@ -186,6 +187,22 @@ static int sched_interact = SCHED_INTERA
 static int realstathz;
 static int tickincr;
 static int sched_slice;
+
+#define ESTCPU_SHIFT	10
+/*
+ * estcpu:					Global counter ticks from stat timer 
+ * flush_estcpu_interval:   Number ticks, after that we to zero estcpu,
+ *                          flush_estcpu_interval = mp_ncpus*stathz*10, 
+ *							default 2*128*10 = 2560
+ * total_sched_shares:      Total count shares cpu, 1000 per core, 
+ *							default 2*1000 = 2000 
+*/
+
+
+static int estcpu;
+static int flush_estcpu_interval = 2560;
+static int total_sched_shares = 2000;
+
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
@@ -2200,6 +2219,7 @@ sched_clock(struct thread *td)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
+	struct prison *pr = td->td_proc->p_ucred->cr_prison;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_SELF();
@@ -2234,6 +2254,20 @@ sched_clock(struct thread *td)
 		td->td_sched->ts_runtime += tickincr;
 		sched_interact_update(td);
 	}
+
+	/* Increase counter and flush if need */
+	estcpu++;
+	if (pr != NULL)
+		pr->pr_estcpu++;
+
+	if (estcpu > flush_estcpu_interval){
+		estcpu = 0;
+		LIST_FOREACH(pr, &allprison, pr_list) {
+			pr->pr_estcpu = 0;
+		}
+		CTR0(KTR_SCHED,"Flush estcpu and pr_estcpu for all jails");
+	}
+
 	/*
 	 * We used up one time slice.
 	 */
@@ -2375,6 +2409,8 @@ tdq_add(struct tdq *tdq, struct thread *
 	int cpumask;
 #endif
 
+    struct prison *pr = td->td_proc->p_ucred->cr_prison;
+
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
@@ -2383,6 +2419,32 @@ tdq_add(struct tdq *tdq, struct thread *
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
+        /* We move thread in IDLE queue if prison estimate cpu more than shares
+		 * cpu and thread is not interactive. Use ESTCPU_SHIFT to avoid
+		 * rounding away results */
+    if(pr != NULL)
+    	CTR6(KTR_SCHED,"pid %i, prison %i, pr_estcpu %i,\
+						estcpu %i shares %i interact %i",
+                   		td->td_proc->p_pid,pr->pr_id,pr->pr_estcpu,
+						estcpu, pr->pr_sched_shares, sched_interact_score(td));
+    if (pr != NULL && pr->pr_sched_shares != 0 &&
+        sched_interact_score(td) > sched_interact &&
+		estcpu != 0 && total_sched_shares != 0){
+
+    	if ((pr->pr_estcpu          << ESTCPU_SHIFT)  / (estcpu) >
+          	(pr->pr_sched_shares    << ESTCPU_SHIFT)  / (total_sched_shares))
+        {
+          	td->td_priority  = PRI_MIN_IDLE; 
+			td->td_pri_class = PRI_IDLE;
+            CTR2(KTR_SCHED,"prison %i excess cpu limit!!! new pri = %i ",pr->pr_id,td->td_priority);
+
+        } else {
+            CTR1(KTR_SCHED,"prison %i use cpu less limit",pr->pr_id);     
+			sched_priority(td);
+			td->td_pri_class = PRI_TIMESHARE;
+        }
+    }
+
 	ts = td->td_sched;
 	class = PRI_BASE(td->td_pri_class);
         TD_SET_RUNQ(td);
@@ -2746,6 +2808,10 @@ SYSCTL_INT(_kern_sched, OID_AUTO, intera
      "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
      0,"Min priority for preemption, lower priorities have greater precedence");
+SYSCTL_INT(_kern_sched, OID_AUTO, flush_estcpu_interval, CTLFLAG_RW, &flush_estcpu_interval,
+     0,"Number ticks stat timer after thar we zero estcpu counter");
+SYSCTL_INT(_kern_sched, OID_AUTO, total_sched_shares, CTLFLAG_RW, &total_sched_shares,
+     0,"Total number shared cpu for system");
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
     "Pick the target cpu based on priority rather than load.");
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/sys/jail.h sys.new/sys/jail.h
--- sys/sys/jail.h	2009-02-18 23:12:08.000000000 +0300
+++ sys.new/sys/jail.h	2009-04-17 18:53:43.000000000 +0400
@@ -31,6 +31,7 @@ struct jail {
 	uint32_t	ip6s;
 	struct in_addr	*ip4;
 	struct in6_addr	*ip6;
+	uint32_t	sched_shares;
 };
 #define	JAIL_API_VERSION 2
 
@@ -132,6 +133,9 @@ struct prison {
 	struct task	 pr_task;			/* (d) destroy task */
 	struct mtx	 pr_mtx;
 	void		**pr_slots;			/* (p) additional data */
+	uint32_t	pr_estcpu;			/* (p) cpu usage */
+	uint32_t	pr_sched_shares;	/* (c) number virtual cpu */
+
 	int		 pr_ip4s;			/* (c) number of v4 IPs */
 	struct in_addr	*pr_ip4;			/* (c) v4 IPs of jail */
 	int		 pr_ip6s;			/* (c) number of v6 IPs */
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines usr.sbin/jail/jail.c usr.sbin.new/jail/jail.c
--- usr.sbin/jail/jail.c	2009-02-07 16:19:08.000000000 +0300
+++ usr.sbin.new/jail/jail.c	2009-04-17 18:57:15.000000000 +0400
@@ -83,6 +83,7 @@ main(int argc, char **argv)
 	int ch, error, i, ngroups, securelevel;
 	int hflag, iflag, Jflag, lflag, uflag, Uflag;
 	char path[PATH_MAX], *jailname, *ep, *username, *JidFile, *ip;
+	uint32_t sched_shares = 0;
 	static char *cleanenv;
 	const char *shell, *p = NULL;
 	long ltmp;
@@ -94,7 +95,7 @@ main(int argc, char **argv)
 	jailname = username = JidFile = cleanenv = NULL;
 	fp = NULL;
 
-	while ((ch = getopt(argc, argv, "hiln:s:u:U:J:")) != -1) {
+	while ((ch = getopt(argc, argv, "hilS:n:s:u:U:J:")) != -1) {
 		switch (ch) {
 		case 'h':
 			hflag = 1;
@@ -115,6 +116,9 @@ main(int argc, char **argv)
 				errx(1, "invalid securelevel: `%s'", optarg);
 			securelevel = ltmp;
 			break;
+		case 'S':
+			sched_shares = (uint32_t)strtol(optarg,NULL,10);
+			break;
 		case 'u':
 			username = optarg;
 			uflag = 1;
@@ -152,6 +156,8 @@ main(int argc, char **argv)
 	if (jailname != NULL)
 		j.jailname = jailname;
 
+	j.sched_shares = sched_shares;
+
 	/* Handle IP addresses. If requested resolve hostname too. */
 	bzero(&hints, sizeof(struct addrinfo));
 	hints.ai_protocol = IPPROTO_TCP;
@@ -264,9 +270,10 @@ static void
 usage(void)
 {
 
-	(void)fprintf(stderr, "%s%s%s\n",
+	(void)fprintf(stderr, "%s%s%s%s\n",
 	     "usage: jail [-hi] [-n jailname] [-J jid_file] ",
 	     "[-s securelevel] [-l -u username | -U username] ",
+		 "[-S number shared cpu] ",
 	     "path hostname [ip[,..]] command ...");
 	exit(1);
 }
-------------- next part --------------
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/kern_jail.c sys.new/kern/kern_jail.c
--- sys/kern/kern_jail.c	2008-11-25 05:59:29.000000000 +0300
+++ sys.new/kern/kern_jail.c	2009-04-17 20:23:40.000000000 +0400
@@ -156,6 +156,7 @@ jail(struct thread *td, struct jail_args
 		goto e_dropvnref;
 	pr->pr_ip = j.ip_number;
 	pr->pr_linux = NULL;
+    pr->pr_sched_shares = j->sched_shares;
 	pr->pr_securelevel = securelevel;
 	if (prison_service_slots == 0)
 		pr->pr_slots = NULL;
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/sched_ule.c sys.new/kern/sched_ule.c
--- sys/kern/sched_ule.c	2008-11-25 05:59:29.000000000 +0300
+++ sys.new/kern/sched_ule.c	2009-04-17 20:23:40.000000000 +0400
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u
 #include <sys/umtx.h>
 #include <sys/vmmeter.h>
 #include <sys/cpuset.h>
+#include <sys/jail.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
@@ -186,6 +187,22 @@ static int sched_interact = SCHED_INTERA
 static int realstathz;
 static int tickincr;
 static int sched_slice;
+
+#define ESTCPU_SHIFT	10
+/*
+ * estcpu:					Global counter ticks from stat timer 
+ * flush_estcpu_interval:   Number ticks, after that we to zero estcpu,
+ *                          flush_estcpu_interval = mp_ncpus*stathz*10, 
+ *							default 2*128*10 = 2560
+ * total_sched_shares:      Total count shares cpu, 1000 per core, 
+ *							default 2*1000 = 2000 
+*/
+
+
+static int estcpu;
+static int flush_estcpu_interval = 2560;
+static int total_sched_shares = 2000;
+
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
@@ -2200,6 +2217,7 @@ sched_clock(struct thread *td)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
+	struct prison *pr = td->td_proc->p_ucred->cr_prison;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_SELF();
@@ -2234,6 +2252,20 @@ sched_clock(struct thread *td)
 		td->td_sched->ts_runtime += tickincr;
 		sched_interact_update(td);
 	}
+
+	/* Increase counter and flush if need */
+	estcpu++;
+	if (pr != NULL)
+		pr->pr_estcpu++;
+
+	if (estcpu > flush_estcpu_interval){
+		estcpu = 0;
+		LIST_FOREACH(pr, &allprison, pr_list) {
+			pr->pr_estcpu = 0;
+		}
+		CTR0(KTR_SCHED,"Flush estcpu and pr_estcpu for all jails");
+	}
+
 	/*
 	 * We used up one time slice.
 	 */
@@ -2375,6 +2407,8 @@ tdq_add(struct tdq *tdq, struct thread *
 	int cpumask;
 #endif
 
+    struct prison *pr = td->td_proc->p_ucred->cr_prison;
+
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
@@ -2383,6 +2417,32 @@ tdq_add(struct tdq *tdq, struct thread *
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
+        /* We move thread in IDLE queue if prison estimate cpu more than shares
+		 * cpu and thread is not interactive. Use ESTCPU_SHIFT to avoid
+		 * rounding away results */
+    if(pr != NULL)
+    	CTR6(KTR_SCHED,"pid %i, prison %i, pr_estcpu %i,\
+						estcpu %i shares %i interact %i",
+                   		td->td_proc->p_pid,pr->pr_id,pr->pr_estcpu,
+						estcpu, pr->pr_sched_shares, sched_interact_score(td));
+    if (pr != NULL && pr->pr_sched_shares != 0 &&
+        sched_interact_score(td) > sched_interact &&
+		estcpu != 0 && total_sched_shares != 0){
+
+    	if ((pr->pr_estcpu          << ESTCPU_SHIFT)  / (estcpu) >
+          	(pr->pr_sched_shares    << ESTCPU_SHIFT)  / (total_sched_shares))
+        {
+          	td->td_priority  = PRI_MIN_IDLE; 
+			td->td_pri_class = PRI_IDLE;
+            CTR2(KTR_SCHED,"prison %i excess cpu limit!!! new pri = %i ",pr->pr_id,td->td_priority);
+
+        } else {
+            CTR1(KTR_SCHED,"prison %i use cpu less limit",pr->pr_id);     
+			sched_priority(td);
+			td->td_pri_class = PRI_TIMESHARE;
+        }
+    }
+
 	ts = td->td_sched;
 	class = PRI_BASE(td->td_pri_class);
         TD_SET_RUNQ(td);
@@ -2741,6 +2801,10 @@ SYSCTL_INT(_kern_sched, OID_AUTO, intera
      "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
      0,"Min priority for preemption, lower priorities have greater precedence");
+SYSCTL_INT(_kern_sched, OID_AUTO, flush_estcpu_interval, CTLFLAG_RW, &flush_estcpu_interval,
+     0,"Number ticks stat timer after thar we zero estcpu counter");
+SYSCTL_INT(_kern_sched, OID_AUTO, total_sched_shares, CTLFLAG_RW, &total_sched_shares,
+     0,"Total number shared cpu for system");
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
     "Pick the target cpu based on priority rather than load.");
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/sys/jail.h sys.new/sys/jail.h
--- sys/sys/jail.h	2008-11-25 05:59:29.000000000 +0300
+++ sys.new/sys/jail.h	2009-04-17 20:26:54.000000000 +0400
@@ -18,6 +18,7 @@ struct jail {
 	char		*path;
 	char		*hostname;
 	u_int32_t	ip_number;
+	uint32_t	sched_shares;
 };
 
 struct xprison {
@@ -74,6 +75,8 @@ struct prison {
 	struct task	 pr_task;			/* (d) destroy task */
 	struct mtx	 pr_mtx;
 	void		**pr_slots;			/* (p) additional data */
+	uint32_t	pr_estcpu;			/* (p) cpu usage */
+	uint32_t	pr_sched_shares;	/* (c) number virtual cpu */
 };
 #endif /* _KERNEL || _WANT_PRISON */
 
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines usr.sbin/jail/jail.c usr.sbin.new/jail/jail.c
--- usr.sbin/jail/jail.c	2008-11-25 05:59:29.000000000 +0300
+++ usr.sbin.new/jail/jail.c	2009-04-17 20:31:17.000000000 +0400
@@ -57,6 +57,7 @@ main(int argc, char **argv)
 	gid_t groups[NGROUPS];
 	int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag;
 	char path[PATH_MAX], *ep, *username, *JidFile;
+	uint32_t sched_shares = 0;
 	static char *cleanenv;
 	const char *shell, *p = NULL;
 	long ltmp;
@@ -67,7 +68,7 @@ main(int argc, char **argv)
 	username = JidFile = cleanenv = NULL;
 	fp = NULL;
 
-	while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) {
+	while ((ch = getopt(argc, argv, "ilS:s:u:U:J:")) != -1) {
 		switch (ch) {
 		case 'i':
 			iflag = 1;
@@ -82,6 +83,9 @@ main(int argc, char **argv)
 				errx(1, "invalid securelevel: `%s'", optarg);
 			securelevel = ltmp;
 			break;
+		case 'S':
+			sched_shares = (uint32_t)strtol(optarg,NULL,10);
+			break;
 		case 'u':
 			username = optarg;
 			uflag = 1;
@@ -115,6 +119,7 @@ main(int argc, char **argv)
 	j.version = 0;
 	j.path = path;
 	j.hostname = argv[1];
+	j.sched_shares = sched_shares;
 	if (inet_aton(argv[2], &in) == 0)
 		errx(1, "Could not make sense of ip-number: %s", argv[2]);
 	j.ip_number = ntohl(in.s_addr);
@@ -182,9 +187,10 @@ static void
 usage(void)
 {
 
-	(void)fprintf(stderr, "%s%s%s\n",
+	(void)fprintf(stderr, "%s%s%s%s\n",
 	     "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ",
 	     "username | -U username]",
+		 "[-S number shared cpu] ",
 	     " path hostname ip-number command ...");
 	exit(1);
 }


More information about the freebsd-jail mailing list