CPU limit for jails under ULE scheduler

Меньшиков Константин k.menshikov at peterhost.ru
Fri Apr 17 14:06:53 UTC 2009


Hello all!
I have rewritten the original cdjones patch for CPU limits on jails under the
ULE scheduler.
It works in a simple way.
We count CPU usage for all jails, and if a jail uses more CPU than its
share, we move its threads to the IDLE queue.
A jailed thread can use all available CPU time if the system has CPU available.
If the system is under heavy load, a jailed thread cannot use the CPU as long
as the ratio (CPU shares for the jail / all CPU shares) < (estimated CPU usage
for the jail / all CPU usage).
Unjailed threads are not subject to this regime.
Interactive threads are not subject to this regime either.
The patch adds 2 sysctls:
kern.sched.total_sched_shares - total number of CPU shares in the system;
increase it if you have more CPUs
kern.sched.flush_estcpu_interval - estcpu flush interval in ticks;
the default is 2560 = 2 * 128 * 10 (NCPU*stathz*sec); increase it if you have
more CPUs
To use the CPU limit, pass the flag -S NSharedCPU to the /usr/sbin/jail
program.
My example: jail -S100 /usr/jails/root/ root.kostjn.pht  192.168.0.245
/bin/csh

I tested this with 10 simultaneous processes in the jail and in the main
system. The test program is an infinite loop, run on an 8-core Xeon using RELENG_7.
I first ran the processes in the jail, and afterwards in the main system.
The output below tracks the CPU usage of one process in each.
Jail
root    1052  0.0  0.0  3692   784  p1  RJ    7:38PM   0:00.39 /test.o
root    1052 21.2  0.0  3692   784  p1  RJ    7:38PM   0:02.40 /test.o
root    1052 35.6  0.0  3692   784  p1  RJ    7:38PM   0:04.40 /test.o
root    1052 47.5  0.0  3692   784  p1  RJ    7:38PM   0:06.41 /test.o
root    1052 39.9  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 33.2  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 27.6  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 22.9  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 19.0  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 15.8  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 13.0  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052 10.8  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /test.o
root    1052  8.9  0.0  3692   784  p1  RJ    7:38PM   0:06.62 /tes

Main system
root    1088 14.9  0.0  3692   780  p0  R     7:38PM   0:01.57 /root/test.o
root    1088 30.8  0.0  3692   780  p0  R     7:38PM   0:03.60 /root/test.o
root    1088 43.8  0.0  3692   780  p0  R     7:38PM   0:05.60 /root/test.o
root    1088 51.0  0.0  3692   780  p0  R     7:38PM   0:07.25 /root/test.o
root    1088 50.8  0.0  3692   780  p0  R     7:38PM   0:08.28 /root/test.o
root    1088 49.1  0.0  3692   780  p0  R     7:38PM   0:09.21 /root/test.o
root    1088 48.1  0.0  3692   780  p0  R     7:38PM   0:10.24 /root/test.o
root    1088 46.2  0.0  3692   780  p0  R     7:38PM   0:11.17 /root/test.o
root    1088 42.9  0.0  3692   780  p0  R     7:38PM   0:11.95 /root/test.o

So we see that once the processes are running in the main system, the jailed process cannot use the CPU.

I don't have much experience in kernel programming, so it is best
if you look at the source code.
Please let me know about any problems in this patch.
This is an initial version, without tuning of jail parameters at runtime.
Thanks.
Sorry for my bad English :)
The original cdjones CPU and memory limit patch:
http://wiki.freebsd.org/JailResourceLimits

-------------- next part --------------
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/kern_jail.c sys.new/kern/kern_jail.c
--- sys/kern/kern_jail.c	2009-03-10 22:33:50.000000000 +0300
+++ sys.new/kern/kern_jail.c	2009-04-17 18:51:34.000000000 +0400
@@ -531,6 +532,7 @@ kern_jail(struct thread *td, struct jail
 	}
 #endif
 	pr->pr_linux = NULL;
+    pr->pr_sched_shares = j->sched_shares;
 	pr->pr_securelevel = securelevel;
 	if (prison_service_slots == 0)
 		pr->pr_slots = NULL;
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/sched_ule.c sys.new/kern/sched_ule.c
--- sys/kern/sched_ule.c	2009-03-30 23:20:56.000000000 +0400
+++ sys.new/kern/sched_ule.c	2009-04-17 19:10:07.000000000 +0400
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u
 #include <sys/umtx.h>
 #include <sys/vmmeter.h>
 #include <sys/cpuset.h>
+#include <sys/jail.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
@@ -186,6 +187,22 @@ static int sched_interact = SCHED_INTERA
 static int realstathz;
 static int tickincr;
 static int sched_slice;
+
+#define ESTCPU_SHIFT	10
+/*
+ * estcpu:					Global counter ticks from stat timer 
+ * flush_estcpu_interval:   Number ticks, after that we to zero estcpu,
+ *                          flush_estcpu_interval = mp_ncpus*stathz*10, 
+ *							default 2*128*10 = 2560
+ * total_sched_shares:      Total count shares cpu, 1000 per core, 
+ *							default 2*1000 = 2000 
+*/
+
+
+static int estcpu;
+static int flush_estcpu_interval = 2560;
+static int total_sched_shares = 2000;
+
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
@@ -2200,6 +2219,7 @@ sched_clock(struct thread *td)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
+	struct prison *pr = td->td_proc->p_ucred->cr_prison;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_SELF();
@@ -2234,6 +2254,20 @@ sched_clock(struct thread *td)
 		td->td_sched->ts_runtime += tickincr;
 		sched_interact_update(td);
 	}
+
+	/* Increase counter and flush if need */
+	estcpu++;
+	if (pr != NULL)
+		pr->pr_estcpu++;
+
+	if (estcpu > flush_estcpu_interval){
+		estcpu = 0;
+		LIST_FOREACH(pr, &allprison, pr_list) {
+			pr->pr_estcpu = 0;
+		}
+		CTR0(KTR_SCHED,"Flush estcpu and pr_estcpu for all jails");
+	}
+
 	/*
 	 * We used up one time slice.
 	 */
@@ -2375,6 +2409,8 @@ tdq_add(struct tdq *tdq, struct thread *
 	int cpumask;
 #endif
 
+    struct prison *pr = td->td_proc->p_ucred->cr_prison;
+
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
@@ -2383,6 +2419,32 @@ tdq_add(struct tdq *tdq, struct thread *
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
+        /* We move thread in IDLE queue if prison estimate cpu more than shares
+		 * cpu and thread is not interactive. Use ESTCPU_SHIFT to avoid
+		 * rounding away results */
+    if(pr != NULL)
+    	CTR6(KTR_SCHED,"pid %i, prison %i, pr_estcpu %i,\
+						estcpu %i shares %i interact %i",
+                   		td->td_proc->p_pid,pr->pr_id,pr->pr_estcpu,
+						estcpu, pr->pr_sched_shares, sched_interact_score(td));
+    if (pr != NULL && pr->pr_sched_shares != 0 &&
+        sched_interact_score(td) > sched_interact &&
+		estcpu != 0 && total_sched_shares != 0){
+
+    	if ((pr->pr_estcpu          << ESTCPU_SHIFT)  / (estcpu) >
+          	(pr->pr_sched_shares    << ESTCPU_SHIFT)  / (total_sched_shares))
+        {
+          	td->td_priority  = PRI_MIN_IDLE; 
+			td->td_pri_class = PRI_IDLE;
+            CTR2(KTR_SCHED,"prison %i excess cpu limit!!! new pri = %i ",pr->pr_id,td->td_priority);
+
+        } else {
+            CTR1(KTR_SCHED,"prison %i use cpu less limit",pr->pr_id);     
+			sched_priority(td);
+			td->td_pri_class = PRI_TIMESHARE;
+        }
+    }
+
 	ts = td->td_sched;
 	class = PRI_BASE(td->td_pri_class);
         TD_SET_RUNQ(td);
@@ -2746,6 +2808,10 @@ SYSCTL_INT(_kern_sched, OID_AUTO, intera
      "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
      0,"Min priority for preemption, lower priorities have greater precedence");
+SYSCTL_INT(_kern_sched, OID_AUTO, flush_estcpu_interval, CTLFLAG_RW, &flush_estcpu_interval,
+     0,"Number ticks stat timer after thar we zero estcpu counter");
+SYSCTL_INT(_kern_sched, OID_AUTO, total_sched_shares, CTLFLAG_RW, &total_sched_shares,
+     0,"Total number shared cpu for system");
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
     "Pick the target cpu based on priority rather than load.");
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/sys/jail.h sys.new/sys/jail.h
--- sys/sys/jail.h	2009-02-18 23:12:08.000000000 +0300
+++ sys.new/sys/jail.h	2009-04-17 18:53:43.000000000 +0400
@@ -31,6 +31,7 @@ struct jail {
 	uint32_t	ip6s;
 	struct in_addr	*ip4;
 	struct in6_addr	*ip6;
+	uint32_t	sched_shares;
 };
 #define	JAIL_API_VERSION 2
 
@@ -132,6 +133,9 @@ struct prison {
 	struct task	 pr_task;			/* (d) destroy task */
 	struct mtx	 pr_mtx;
 	void		**pr_slots;			/* (p) additional data */
+	uint32_t	pr_estcpu;			/* (p) cpu usage */
+	uint32_t	pr_sched_shares;	/* (c) number virtual cpu */
+
 	int		 pr_ip4s;			/* (c) number of v4 IPs */
 	struct in_addr	*pr_ip4;			/* (c) v4 IPs of jail */
 	int		 pr_ip6s;			/* (c) number of v6 IPs */
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines usr.sbin/jail/jail.c usr.sbin.new/jail/jail.c
--- usr.sbin/jail/jail.c	2009-02-07 16:19:08.000000000 +0300
+++ usr.sbin.new/jail/jail.c	2009-04-17 18:57:15.000000000 +0400
@@ -83,6 +83,7 @@ main(int argc, char **argv)
 	int ch, error, i, ngroups, securelevel;
 	int hflag, iflag, Jflag, lflag, uflag, Uflag;
 	char path[PATH_MAX], *jailname, *ep, *username, *JidFile, *ip;
+	uint32_t sched_shares = 0;
 	static char *cleanenv;
 	const char *shell, *p = NULL;
 	long ltmp;
@@ -94,7 +95,7 @@ main(int argc, char **argv)
 	jailname = username = JidFile = cleanenv = NULL;
 	fp = NULL;
 
-	while ((ch = getopt(argc, argv, "hiln:s:u:U:J:")) != -1) {
+	while ((ch = getopt(argc, argv, "hilS:n:s:u:U:J:")) != -1) {
 		switch (ch) {
 		case 'h':
 			hflag = 1;
@@ -115,6 +116,9 @@ main(int argc, char **argv)
 				errx(1, "invalid securelevel: `%s'", optarg);
 			securelevel = ltmp;
 			break;
+		case 'S':
+			sched_shares = (uint32_t)strtol(optarg,NULL,10);
+			break;
 		case 'u':
 			username = optarg;
 			uflag = 1;
@@ -152,6 +156,8 @@ main(int argc, char **argv)
 	if (jailname != NULL)
 		j.jailname = jailname;
 
+	j.sched_shares = sched_shares;
+
 	/* Handle IP addresses. If requested resolve hostname too. */
 	bzero(&hints, sizeof(struct addrinfo));
 	hints.ai_protocol = IPPROTO_TCP;
@@ -264,9 +270,10 @@ static void
 usage(void)
 {
 
-	(void)fprintf(stderr, "%s%s%s\n",
+	(void)fprintf(stderr, "%s%s%s%s\n",
 	     "usage: jail [-hi] [-n jailname] [-J jid_file] ",
 	     "[-s securelevel] [-l -u username | -U username] ",
+		 "[-S number shared cpu] ",
 	     "path hostname [ip[,..]] command ...");
 	exit(1);
 }
-------------- next part --------------
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/kern_jail.c sys.new/kern/kern_jail.c
--- sys/kern/kern_jail.c	2008-11-25 05:59:29.000000000 +0300
+++ sys.new/kern/kern_jail.c	2009-04-17 20:23:40.000000000 +0400
@@ -156,6 +156,7 @@ jail(struct thread *td, struct jail_args
 		goto e_dropvnref;
 	pr->pr_ip = j.ip_number;
 	pr->pr_linux = NULL;
+    pr->pr_sched_shares = j->sched_shares;
 	pr->pr_securelevel = securelevel;
 	if (prison_service_slots == 0)
 		pr->pr_slots = NULL;
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/sched_ule.c sys.new/kern/sched_ule.c
--- sys/kern/sched_ule.c	2008-11-25 05:59:29.000000000 +0300
+++ sys.new/kern/sched_ule.c	2009-04-17 20:23:40.000000000 +0400
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u
 #include <sys/umtx.h>
 #include <sys/vmmeter.h>
 #include <sys/cpuset.h>
+#include <sys/jail.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
@@ -186,6 +187,22 @@ static int sched_interact = SCHED_INTERA
 static int realstathz;
 static int tickincr;
 static int sched_slice;
+
+#define ESTCPU_SHIFT	10
+/*
+ * estcpu:					Global counter ticks from stat timer 
+ * flush_estcpu_interval:   Number ticks, after that we to zero estcpu,
+ *                          flush_estcpu_interval = mp_ncpus*stathz*10, 
+ *							default 2*128*10 = 2560
+ * total_sched_shares:      Total count shares cpu, 1000 per core, 
+ *							default 2*1000 = 2000 
+*/
+
+
+static int estcpu;
+static int flush_estcpu_interval = 2560;
+static int total_sched_shares = 2000;
+
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
@@ -2200,6 +2217,7 @@ sched_clock(struct thread *td)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
+	struct prison *pr = td->td_proc->p_ucred->cr_prison;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	tdq = TDQ_SELF();
@@ -2234,6 +2252,20 @@ sched_clock(struct thread *td)
 		td->td_sched->ts_runtime += tickincr;
 		sched_interact_update(td);
 	}
+
+	/* Increase counter and flush if need */
+	estcpu++;
+	if (pr != NULL)
+		pr->pr_estcpu++;
+
+	if (estcpu > flush_estcpu_interval){
+		estcpu = 0;
+		LIST_FOREACH(pr, &allprison, pr_list) {
+			pr->pr_estcpu = 0;
+		}
+		CTR0(KTR_SCHED,"Flush estcpu and pr_estcpu for all jails");
+	}
+
 	/*
 	 * We used up one time slice.
 	 */
@@ -2375,6 +2407,8 @@ tdq_add(struct tdq *tdq, struct thread *
 	int cpumask;
 #endif
 
+    struct prison *pr = td->td_proc->p_ucred->cr_prison;
+
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	KASSERT((td->td_inhibitors == 0),
 	    ("sched_add: trying to run inhibited thread"));
@@ -2383,6 +2417,32 @@ tdq_add(struct tdq *tdq, struct thread *
 	KASSERT(td->td_flags & TDF_INMEM,
 	    ("sched_add: thread swapped out"));
 
+        /* We move thread in IDLE queue if prison estimate cpu more than shares
+		 * cpu and thread is not interactive. Use ESTCPU_SHIFT to avoid
+		 * rounding away results */
+    if(pr != NULL)
+    	CTR6(KTR_SCHED,"pid %i, prison %i, pr_estcpu %i,\
+						estcpu %i shares %i interact %i",
+                   		td->td_proc->p_pid,pr->pr_id,pr->pr_estcpu,
+						estcpu, pr->pr_sched_shares, sched_interact_score(td));
+    if (pr != NULL && pr->pr_sched_shares != 0 &&
+        sched_interact_score(td) > sched_interact &&
+		estcpu != 0 && total_sched_shares != 0){
+
+    	if ((pr->pr_estcpu          << ESTCPU_SHIFT)  / (estcpu) >
+          	(pr->pr_sched_shares    << ESTCPU_SHIFT)  / (total_sched_shares))
+        {
+          	td->td_priority  = PRI_MIN_IDLE; 
+			td->td_pri_class = PRI_IDLE;
+            CTR2(KTR_SCHED,"prison %i excess cpu limit!!! new pri = %i ",pr->pr_id,td->td_priority);
+
+        } else {
+            CTR1(KTR_SCHED,"prison %i use cpu less limit",pr->pr_id);     
+			sched_priority(td);
+			td->td_pri_class = PRI_TIMESHARE;
+        }
+    }
+
 	ts = td->td_sched;
 	class = PRI_BASE(td->td_pri_class);
         TD_SET_RUNQ(td);
@@ -2741,6 +2801,10 @@ SYSCTL_INT(_kern_sched, OID_AUTO, intera
      "Interactivity score threshold");
 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
      0,"Min priority for preemption, lower priorities have greater precedence");
+SYSCTL_INT(_kern_sched, OID_AUTO, flush_estcpu_interval, CTLFLAG_RW, &flush_estcpu_interval,
+     0,"Number ticks stat timer after thar we zero estcpu counter");
+SYSCTL_INT(_kern_sched, OID_AUTO, total_sched_shares, CTLFLAG_RW, &total_sched_shares,
+     0,"Total number shared cpu for system");
 #ifdef SMP
 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
     "Pick the target cpu based on priority rather than load.");
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/sys/jail.h sys.new/sys/jail.h
--- sys/sys/jail.h	2008-11-25 05:59:29.000000000 +0300
+++ sys.new/sys/jail.h	2009-04-17 20:26:54.000000000 +0400
@@ -18,6 +18,7 @@ struct jail {
 	char		*path;
 	char		*hostname;
 	u_int32_t	ip_number;
+	uint32_t	sched_shares;
 };
 
 struct xprison {
@@ -74,6 +75,8 @@ struct prison {
 	struct task	 pr_task;			/* (d) destroy task */
 	struct mtx	 pr_mtx;
 	void		**pr_slots;			/* (p) additional data */
+	uint32_t	pr_estcpu;			/* (p) cpu usage */
+	uint32_t	pr_sched_shares;	/* (c) number virtual cpu */
 };
 #endif /* _KERNEL || _WANT_PRISON */
 
diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines usr.sbin/jail/jail.c usr.sbin.new/jail/jail.c
--- usr.sbin/jail/jail.c	2008-11-25 05:59:29.000000000 +0300
+++ usr.sbin.new/jail/jail.c	2009-04-17 20:31:17.000000000 +0400
@@ -57,6 +57,7 @@ main(int argc, char **argv)
 	gid_t groups[NGROUPS];
 	int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag;
 	char path[PATH_MAX], *ep, *username, *JidFile;
+	uint32_t sched_shares = 0;
 	static char *cleanenv;
 	const char *shell, *p = NULL;
 	long ltmp;
@@ -67,7 +68,7 @@ main(int argc, char **argv)
 	username = JidFile = cleanenv = NULL;
 	fp = NULL;
 
-	while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) {
+	while ((ch = getopt(argc, argv, "ilS:s:u:U:J:")) != -1) {
 		switch (ch) {
 		case 'i':
 			iflag = 1;
@@ -82,6 +83,9 @@ main(int argc, char **argv)
 				errx(1, "invalid securelevel: `%s'", optarg);
 			securelevel = ltmp;
 			break;
+		case 'S':
+			sched_shares = (uint32_t)strtol(optarg,NULL,10);
+			break;
 		case 'u':
 			username = optarg;
 			uflag = 1;
@@ -115,6 +119,7 @@ main(int argc, char **argv)
 	j.version = 0;
 	j.path = path;
 	j.hostname = argv[1];
+	j.sched_shares = sched_shares;
 	if (inet_aton(argv[2], &in) == 0)
 		errx(1, "Could not make sense of ip-number: %s", argv[2]);
 	j.ip_number = ntohl(in.s_addr);
@@ -182,9 +187,10 @@ static void
 usage(void)
 {
 
-	(void)fprintf(stderr, "%s%s%s\n",
+	(void)fprintf(stderr, "%s%s%s%s\n",
 	     "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ",
 	     "username | -U username]",
+		 "[-S number shared cpu] ",
 	     " path hostname ip-number command ...");
 	exit(1);
 }


More information about the freebsd-jail mailing list