svn commit: r329616 - in user/jeff/numa/sys: kern sys vm

Jeff Roberson jeff at FreeBSD.org
Tue Feb 20 02:30:52 UTC 2018


Author: jeff
Date: Tue Feb 20 02:30:51 2018
New Revision: 329616
URL: https://svnweb.freebsd.org/changeset/base/329616

Log:
  PID Controlled page daemon
  
  Differential Revision:	https://reviews.freebsd.org/D14402

Added:
  user/jeff/numa/sys/kern/subr_pidctrl.c   (contents, props changed)
  user/jeff/numa/sys/sys/pidctrl.h   (contents, props changed)
Modified:
  user/jeff/numa/sys/vm/vm_meter.c
  user/jeff/numa/sys/vm/vm_page.c
  user/jeff/numa/sys/vm/vm_pageout.c
  user/jeff/numa/sys/vm/vm_pagequeue.h

Added: user/jeff/numa/sys/kern/subr_pidctrl.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/jeff/numa/sys/kern/subr_pidctrl.c	Tue Feb 20 02:30:51 2018	(r329616)
@@ -0,0 +1,157 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017,  Jeffrey Roberson <jeff at freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/pidctrl.h>
+
+void
+pidctrl_init(struct pidctrl *pc, int interval, int setpoint, int bound,
+    int Kpd, int Kid, int Kdd)
+{
+	/* Zero all saved state, then store the caller's configuration. */
+	bzero(pc, sizeof(*pc));
+	pc->pc_setpoint = setpoint;
+	pc->pc_interval = interval;
+	pc->pc_bound = bound * setpoint * Kid;	/* Scaled wind-up limit. */
+	pc->pc_Kpd = Kpd;
+	pc->pc_Kid = Kid;
+	pc->pc_Kdd = Kdd;
+}
+
+/* Export controller state and configuration as sysctl leaves under parent. */
+void
+pidctrl_init_sysctl(struct pidctrl *pc, struct sysctl_oid_list *parent)
+{
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "error", CTLFLAG_RD,
+	    &pc->pc_error, 0, "Current difference from setpoint value (P)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "olderror", CTLFLAG_RD,
+	    &pc->pc_olderror, 0, "Error value from last interval");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "integral", CTLFLAG_RD,
+	    &pc->pc_integral, 0, "Accumulated error integral (I)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "derivative",
+	    CTLFLAG_RD, &pc->pc_derivative, 0, "Error derivative (D)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "input", CTLFLAG_RD,
+	    &pc->pc_input, 0, "Last controller process variable input");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "output", CTLFLAG_RD,
+	    &pc->pc_output, 0, "Last controller output");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "ticks", CTLFLAG_RD,
+	    &pc->pc_ticks, 0, "Last controller runtime");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "setpoint", CTLFLAG_RW,
+	    &pc->pc_setpoint, 0, "Desired level for process variable");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "interval", CTLFLAG_RD,
+	    &pc->pc_interval, 0, "Interval between calculations (ticks)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "bound", CTLFLAG_RW,
+	    &pc->pc_bound, 0, "Integral wind-up limit");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "kpd", CTLFLAG_RW,
+	    &pc->pc_Kpd, 0, "Inverse of proportional gain");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "kid", CTLFLAG_RW,
+	    &pc->pc_Kid, 0, "Inverse of integral gain");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "kdd", CTLFLAG_RW,
+	    &pc->pc_Kdd, 0, "Inverse of derivative gain");
+}
+
+/* One classic PID step: signed output, integral held in [-bound, bound]. */
+int
+pidctrl_classic(struct pidctrl *pc, int input)
+{
+	int output, error;
+	int Kpd, Kid, Kdd;
+
+	error = pc->pc_setpoint - input;
+	pc->pc_ticks = ticks;
+	pc->pc_olderror = pc->pc_error;
+
+	/* Clamp each sysctl-writable divisor to >= 1; never divide by zero. */
+	Kpd = MAX(pc->pc_Kpd, 1);
+	Kid = MAX(pc->pc_Kid, 1);
+	Kdd = MAX(pc->pc_Kdd, 1);
+
+	/* Compute P (proportional error), I (integral), D (derivative) */
+	pc->pc_error = error;
+	pc->pc_integral =
+	    MAX(MIN(pc->pc_integral + error, pc->pc_bound), -pc->pc_bound);
+	pc->pc_derivative = error - pc->pc_olderror;
+
+	/* Divide by the clamped divisors, never the raw tunables. */
+	output = (pc->pc_error / Kpd) + (pc->pc_integral / Kid) +
+	    (pc->pc_derivative / Kdd);
+	/* Save for sysctl. */
+	pc->pc_output = output;
+	pc->pc_input = input;
+
+	return output;
+}
+
+/* Daemon PID step: returns the non-negative rise in output since last call. */
+int
+pidctrl_daemon(struct pidctrl *pc, int input)
+{
+	int output, error;
+	int Kpd, Kid, Kdd;
+
+	error = pc->pc_setpoint - input;
+	/*
+	 * Once the interval has elapsed we reset our variables and start a
+	 * new interval.  If we're called multiple times during one interval
+	 * we attempt to report a target as if the entire error came at
+	 * the interval boundary, so the error is accumulated and kept
+	 * non-negative.
+	 */
+	if ((u_int)(ticks - pc->pc_ticks) >= pc->pc_interval) {
+		pc->pc_ticks = ticks;
+		pc->pc_olderror = pc->pc_error;
+		pc->pc_output = pc->pc_error = 0;
+	} else {
+		error = MAX(error + pc->pc_error, 0);
+	}
+
+	/* Clamp each sysctl-writable divisor to >= 1; never divide by zero. */
+	Kpd = MAX(pc->pc_Kpd, 1);
+	Kid = MAX(pc->pc_Kid, 1);
+	Kdd = MAX(pc->pc_Kdd, 1);
+
+	/* Compute P, I and D; the integral is held in [0, bound]. */
+	pc->pc_error = error;
+	pc->pc_integral =
+	    MAX(MIN(pc->pc_integral + error, pc->pc_bound), 0);
+	pc->pc_derivative = error - pc->pc_olderror;
+
+	/* Divide by the clamped divisors, never the raw tunables. */
+	output = (error / Kpd) + (pc->pc_integral / Kid) +
+	    (pc->pc_derivative / Kdd);
+	output = MAX(output - pc->pc_output, 0);	/* New work only. */
+	pc->pc_output += output;
+	pc->pc_input = input;
+
+	return output;
+}

Added: user/jeff/numa/sys/sys/pidctrl.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/jeff/numa/sys/sys/pidctrl.h	Tue Feb 20 02:30:51 2018	(r329616)
@@ -0,0 +1,123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017,  Jeffrey Roberson <jeff at freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PIDCTRL_H_
+#define _SYS_PIDCTRL_H_
+
+/*
+ * Proportional Integral Derivative controller.
+ *
+ * This controller is intended to replace a multitude of threshold based
+ * daemon regulation systems.  These systems produce sharp sawtooths of
+ * activity which can cause latency spikes and other undesirable bursty
+ * behavior.  The PID controller adapts to changing load conditions and
+ * adjusts the work done by the daemon to keep a smoother output.
+ *
+ * The setpoint can be thought of as a single watermark that the controller
+ * is always trying to reach.  Compared to a high water/low water type
+ * algorithm the pid controller is dynamically deciding the low water and
+ * regulating to the high water.  The setpoint should be high enough that
+ * the controller and daemon have time to observe the rise in value and
+ * respond to it, else the resource may be exhausted.  More frequent wakeups
+ * permit higher setpoints and less underutilized resources.
+ *
+ * The controller has been optimised for simplicity of math making it quite
+ * inexpensive to execute.  There is no floating point and so the gains must
+ * be the inverse of whole integers.
+ *
+ * Failing to measure and tune the gain parameters can result in wild
+ * oscillations in output.  It is strongly encouraged that controllers are
+ * tested and tuned under a wide variety of workloads before gain values are
+ * picked.  Some reasonable defaults are provided below.
+ */
+
+struct pidctrl {
+	/* Saved control variables, exported read-only via sysctl. */
+	int	pc_error;		/* Current error. */
+	int	pc_olderror;		/* Saved error for derivative. */
+	int	pc_integral;		/* Integral accumulator. */
+	int	pc_derivative;		/* Change from last error. */
+	int	pc_input;		/* Last input. */
+	int	pc_output;		/* Last output. */
+	int	pc_ticks;		/* Value of ticks at last sample. */
+	/* Configuration; all but pc_interval are writable via sysctl. */
+	int	pc_setpoint;		/* Desired level. */
+	int	pc_interval;		/* Update interval in ticks. */
+	int	pc_bound;		/* Integral wind-up limit. */
+	int	pc_Kpd;			/* Proportional gain divisor. */
+	int	pc_Kid;			/* Integral gain divisor. */
+	int	pc_Kdd;			/* Derivative gain divisor. */
+};
+
+/*
+ * Reasonable default divisors.
+ *
+ * Actual gains are 1/divisor.  Gains interact in complex ways with the
+ * setpoint and interval.  Measurement under multiple loads should be
+ * taken to ensure adequate stability and rise time.
+ */
+#define	PIDCTRL_KPD	3		/* Default proportional divisor. */
+#define	PIDCTRL_KID	4		/* Default integral divisor. */
+#define	PIDCTRL_KDD	8		/* Default derivative divisor. */
+#define	PIDCTRL_BOUND	4		/* Bound factor, setpoint multiple. */
+
+struct sysctl_oid_list;
+
+void	pidctrl_init(struct pidctrl *pc, int interval, int setpoint,
+	    int bound, int Kpd, int Kid, int Kdd);
+void	pidctrl_init_sysctl(struct pidctrl *pc, struct sysctl_oid_list *parent);
+
+/*
+ * This is the classic PID controller where the integral is clamped to
+ * [-bound, bound] and the output may be negative.  This should be used
+ * in continuous control loops that can adjust a process variable in
+ * either direction.  This is a discrete time controller and should
+ * only be called once per interval or the derivative term will be
+ * inaccurate.
+ */
+int	pidctrl_classic(struct pidctrl *pc, int input);
+
+/*
+ * This controller is intended for consumer type daemons that can only
+ * regulate in a positive direction, that is to say, they cannot exert
+ * positive pressure on the process variable or input.  They can only
+ * reduce it by doing work.  As such the integral is bounded to [0, bound]
+ * and the output is similarly a non-negative value reflecting the units of
+ * work necessary to be completed in the current interval to eliminate error.
+ *
+ * It is a discrete time controller but can be invoked more than once in a
+ * given time interval for ease of client implementation.  This should only
+ * be done in overload situations or the controller may not produce a stable
+ * output.  Calling it less frequently when there is no work to be done will
+ * increase the rise time but should otherwise be harmless.
+ */
+int	pidctrl_daemon(struct pidctrl *pc, int input);
+
+#endif	/* !_SYS_PIDCTRL_H_ */

Modified: user/jeff/numa/sys/vm/vm_meter.c
==============================================================================
--- user/jeff/numa/sys/vm/vm_meter.c	Tue Feb 20 02:18:30 2018	(r329615)
+++ user/jeff/numa/sys/vm/vm_meter.c	Tue Feb 20 02:30:51 2018	(r329616)
@@ -473,3 +473,58 @@ vm_laundry_count(void)
 	return vm_pagequeue_count(PQ_LAUNDRY);
 }
 
+static void
+vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent)
+{
+	struct sysctl_oid *oid;
+	/* Per-domain node under parent, plus a "stats" child for counters. */
+	vmd->vmd_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
+	    vmd->vmd_name, CTLFLAG_RD, NULL, "");
+	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
+	    "stats", CTLFLAG_RD, NULL, "");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "free_count", CTLFLAG_RD, &vmd->vmd_free_count, 0,
+	    "Free pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "active", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt, 0,
+	    "Active pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "inactive", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt, 0,
+	    "Inactive pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "laundry", CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 0,
+	    "laundry pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "unswappable",
+	    CTLFLAG_RD, &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt, 0,
+	    "Unswappable pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "inactive_target", CTLFLAG_RD, &vmd->vmd_inactive_target, 0,
+	    "Target inactive pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "free_target", CTLFLAG_RD, &vmd->vmd_free_target, 0,
+	    "Target free pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "free_reserved", CTLFLAG_RD, &vmd->vmd_free_reserved, 0,
+	    "Reserved free pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "free_min", CTLFLAG_RD, &vmd->vmd_free_min, 0,
+	    "Minimum free pages");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+	    "free_severe", CTLFLAG_RD, &vmd->vmd_free_severe, 0,
+	    "Severe free pages");
+	/* vmd_oid stays set so later code can add nodes under this domain. */
+}
+
+static void
+vm_stats_init(void *arg __unused)
+{
+	struct sysctl_oid *oid;
+	int i;
+	/* Create "domain" under _vm, then one stats subtree per VM domain. */
+	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm), OID_AUTO,
+	    "domain", CTLFLAG_RD, NULL, "");
+	for (i = 0; i < vm_ndomains; i++)
+		vm_domain_stats_init(VM_DOMAIN(i), oid);
+}
+
+SYSINIT(vmstats_init, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_stats_init, NULL);

Modified: user/jeff/numa/sys/vm/vm_page.c
==============================================================================
--- user/jeff/numa/sys/vm/vm_page.c	Tue Feb 20 02:18:30 2018	(r329615)
+++ user/jeff/numa/sys/vm/vm_page.c	Tue Feb 20 02:30:51 2018	(r329616)
@@ -476,6 +476,7 @@ vm_page_domain_init(int domain)
 		}
 	}
 	mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
+	snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
 }
 
 /*
@@ -2934,9 +2935,9 @@ vm_wait(void)
 		 * consume all freed pages while old allocators wait.
 		 */
 		mtx_lock(&vm_domainset_lock);
-		if (vm_page_count_min()) {
+		if (vm_page_count_severe()) {
 			vm_min_waiters++;
-			msleep(&vm_min_domains, &vm_domainset_lock, PVM,
+			msleep(&vm_severe_domains, &vm_domainset_lock, PVM,
 			    "vmwait", 0);
 		}
 		mtx_unlock(&vm_domainset_lock);
@@ -3204,7 +3205,7 @@ vm_domain_free_wakeup(struct vm_domain *vmd)
 	 * high water mark. And wakeup scheduler process if we have
 	 * lots of memory. this process will swapin processes.
 	 */
-	if (vmd->vmd_pages_needed && !vm_paging_min(vmd)) {
+	if (vmd->vmd_pages_needed && !vm_paging_severe(vmd)) {
 		vmd->vmd_pages_needed = false;
 		wakeup(&vmd->vmd_free_count);
 	}

Modified: user/jeff/numa/sys/vm/vm_pageout.c
==============================================================================
--- user/jeff/numa/sys/vm/vm_pageout.c	Tue Feb 20 02:18:30 2018	(r329615)
+++ user/jeff/numa/sys/vm/vm_pageout.c	Tue Feb 20 02:30:51 2018	(r329616)
@@ -124,7 +124,7 @@ static void vm_pageout(void);
 static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
 static int vm_pageout_cluster(vm_page_t m);
-static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
+static bool vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage);
 
@@ -146,7 +146,7 @@ SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
 
 /* Pagedaemon activity rates, in subdivisions of one second. */
 #define	VM_LAUNDER_RATE		10
-#define	VM_INACT_SCAN_RATE	2
+#define	VM_INACT_SCAN_RATE	10
 
 static int vm_pageout_oom_seq = 12;
 
@@ -1206,7 +1206,7 @@ out:
  * queue scan to meet the target.
  */
 static bool
-vm_pageout_scan(struct vm_domain *vmd, int pass)
+vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
 {
 	struct pgo_pglist pglist;
 	vm_page_t m, next;
@@ -1251,7 +1251,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	 */
 	if (pass > 0) {
 		deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
-		page_shortage = vm_paging_target(vmd) + deficit;
+		page_shortage = shortage + deficit;
 	} else
 		page_shortage = deficit = 0;
 	starting_page_shortage = page_shortage;
@@ -1505,7 +1505,7 @@ lock_queue:
 	 */
 	inactq_shortage = vmd->vmd_inactive_target - (pq->pq_cnt +
 	    vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight) +
-	    vm_paging_target(vmd) + deficit + addl_page_shortage;
+	    shortage + deficit + addl_page_shortage;
 	inactq_shortage *= act_scan_laundry_weight;
 
 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
@@ -1875,12 +1875,13 @@ static void
 vm_pageout_worker(void *arg)
 {
 	struct vm_domain *vmd;
-	int domain, pass;
+	int domain, pass, shortage;
 	bool target_met;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	pass = 0;
+	shortage = 0;
 	target_met = true;
 
 	/*
@@ -1918,54 +1919,40 @@ vm_pageout_worker(void *arg)
 			vmd->vmd_pages_needed = false;
 			wakeup(&vmd->vmd_free_count);
 		}
-
 		/*
-		 * Do not clear vmd_pageout_wanted until we reach our free page
-		 * target.  Otherwise, we may be awakened over and over again,
-		 * wasting CPU time.
+		 * Might the page daemon need to run again?
 		 */
-		if (vmd->vmd_pageout_wanted && target_met)
-			vmd->vmd_pageout_wanted = false;
-
-		/*
-		 * Might the page daemon receive a wakeup call?
-		 */
-		if (vmd->vmd_pageout_wanted) {
+		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 			/*
-			 * No.  Either vmd_pageout_wanted was set by another
-			 * thread during the previous scan, which must have
-			 * been a level 0 scan, or vmd_pageout_wanted was
-			 * already set and the scan failed to free enough
-			 * pages.  If we haven't yet performed a level >= 1
-			 * (page reclamation) scan, then increase the level
-			 * and scan again now.  Otherwise, sleep a bit and
-			 * try again later.
+			 * Yes, the scan failed to free enough pages.  If
+			 * we have performed a level >= 1 (page reclamation)
+			 * scan, then sleep a bit and try again.
 			 */
 			vm_domain_free_unlock(vmd);
-			if (pass >= 1)
+			if (pass > 1)
 				pause("pwait", hz / VM_INACT_SCAN_RATE);
-			pass++;
 		} else {
 			/*
-			 * Yes.  If threads are still sleeping in VM_WAIT
-			 * then we immediately start a new scan.  Otherwise,
-			 * sleep until the next wakeup or until pages need to
-			 * have their reference stats updated.
+			 * No, sleep until the next wakeup or until pages
+			 * need to have their reference stats updated.
 			 */
-			if (vmd->vmd_pages_needed) {
-				vm_domain_free_unlock(vmd);
-				if (pass == 0)
-					pass++;
-			} else if (mtx_sleep(&vmd->vmd_pageout_wanted,
+			vmd->vmd_pageout_wanted = false;
+			if (mtx_sleep(&vmd->vmd_pageout_wanted,
 			    vm_domain_free_lockptr(vmd), PDROP | PVM,
-			    "psleep", hz) == 0) {
+			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 				VM_CNT_INC(v_pdwakeups);
-				pass = 1;
-			} else
-				pass = 0;
 		}
+		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
+		if (shortage && pass == 0)
+			pass = 1;
 
-		target_met = vm_pageout_scan(vmd, pass);
+		target_met = vm_pageout_scan(vmd, pass, shortage);
+		/*
+		 * If the target was not met we must increase the pass to
+		 * more aggressively reclaim.
+		 */
+		if (!target_met)
+			pass++;
 	}
 }
 
@@ -1976,6 +1963,7 @@ static void
 vm_pageout_init_domain(int domain)
 {
 	struct vm_domain *vmd;
+	struct sysctl_oid *oid;
 	int lim, i, j;
 
 	vmd = VM_DOMAIN(domain);
@@ -2003,10 +1991,10 @@ vm_pageout_init_domain(int domain)
 		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
 
 	/*
-	 * Set the default wakeup threshold to be 10% above the minimum
-	 * page limit.  This keeps the steady state out of shortfall.
+	 * Set the default wakeup threshold to be 10% below the paging
+	 * target.  This keeps the steady state out of shortfall.
 	 */
-	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_min / 10) * 11;
+	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 
 	/*
 	 * Target amount of memory to move out of the laundry queue during a
@@ -2031,6 +2019,14 @@ vm_pageout_init_domain(int domain)
 	for (i = 0; i < PQ_COUNT; i++)
 		for (j = 0; j < BPQ_COUNT; j++)
 			vmd->vmd_pagequeues[i].pq_bpqs[j].bpq_lim = lim;
+
+	/* Initialize the pageout daemon pid controller. */
+	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
+	    vmd->vmd_free_target, PIDCTRL_BOUND,
+	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
+	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
+	    "pidctrl", CTLFLAG_RD, NULL, "");
+	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 }
 
 static void

Modified: user/jeff/numa/sys/vm/vm_pagequeue.h
==============================================================================
--- user/jeff/numa/sys/vm/vm_pagequeue.h	Tue Feb 20 02:18:30 2018	(r329615)
+++ user/jeff/numa/sys/vm/vm_pagequeue.h	Tue Feb 20 02:30:51 2018	(r329616)
@@ -86,7 +86,10 @@ struct vm_pagequeue {
 } __aligned(CACHE_LINE_SIZE);
 
 #include <vm/uma.h>
+#include <sys/pidctrl.h>
 
+struct sysctl_oid;
+
 struct vm_domain {
 	struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
 	struct mtx_padalign vmd_free_mtx;
@@ -97,6 +100,7 @@ struct vm_domain {
 	long vmd_segs;			/* bitmask of the segments */
 
 	/* Paging control variables, locked by domain_free_mtx. */
+	struct pidctrl vmd_pid;		/* Pageout controller. */
 	u_int vmd_free_count;
 	boolean_t vmd_oom;
 	int vmd_oom_seq;
@@ -129,6 +133,10 @@ struct vm_domain {
 	u_int vmd_pageout_wakeup_thresh;/* (c) min pages to wake pagedaemon */
 	u_int vmd_interrupt_free_min;	/* (c) reserved pages for int code */
 	u_int vmd_free_severe;		/* (c) severe page depletion point */
+
+	/* Name for sysctl etc. */
+	struct sysctl_oid *vmd_oid;
+	char vmd_name[sizeof(__XSTRING(MAXMEMDOM))];
 } __aligned(CACHE_LINE_SIZE);
 
 extern struct vm_domain vm_dom[MAXMEMDOM];


More information about the svn-src-user mailing list