svn commit: r329882 - in head/sys: conf kern sys vm

Jeff Roberson jeff at FreeBSD.org
Fri Feb 23 22:51:53 UTC 2018


Author: jeff
Date: Fri Feb 23 22:51:51 2018
New Revision: 329882
URL: https://svnweb.freebsd.org/changeset/base/329882

Log:
  Add a generic Proportional Integral Derivative (PID) controller algorithm and
  use it to regulate page daemon output.
  
  This provides much smoother and more responsive page daemon output, anticipating
  demand and avoiding pageout stalls by increasing the number of pages to match
  the workload.  This is a reimplementation of work done by myself and mlaier at
  Isilon.
  
  Reviewed by:	bsdimp
  Tested by:	pho
  Sponsored by:	Netflix, Dell/EMC Isilon
  Differential Revision:	https://reviews.freebsd.org/D14402

Added:
  head/sys/kern/subr_pidctrl.c   (contents, props changed)
  head/sys/sys/pidctrl.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/vm/vm_meter.c
  head/sys/vm/vm_page.c
  head/sys/vm/vm_pageout.c
  head/sys/vm/vm_pagequeue.h

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Fri Feb 23 22:23:28 2018	(r329881)
+++ head/sys/conf/files	Fri Feb 23 22:51:51 2018	(r329882)
@@ -3874,6 +3874,7 @@ kern/subr_msgbuf.c		standard
 kern/subr_param.c		standard
 kern/subr_pcpu.c		standard
 kern/subr_pctrie.c		standard
+kern/subr_pidctrl.c		standard
 kern/subr_power.c		standard
 kern/subr_prf.c			standard
 kern/subr_prof.c		standard

Added: head/sys/kern/subr_pidctrl.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/kern/subr_pidctrl.c	Fri Feb 23 22:51:51 2018	(r329882)
@@ -0,0 +1,157 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017,  Jeffrey Roberson <jeff at freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/pidctrl.h>
+
+/* Reset *pc to all zeroes, then load the caller's tunables into it. */
+void
+pidctrl_init(struct pidctrl *pc, int interval, int setpoint, int bound,
+    int Kpd, int Kid, int Kdd)
+{
+	*pc = (struct pidctrl){
+		.pc_setpoint = setpoint,
+		.pc_interval = interval,
+		/* Wind-up limit scales with setpoint and Kid. */
+		.pc_bound = bound * setpoint * Kid,
+		.pc_Kpd = Kpd, .pc_Kid = Kid, .pc_Kdd = Kdd,
+	};
+}
+
+void
+pidctrl_init_sysctl(struct pidctrl *pc, struct sysctl_oid_list *parent)
+{
+	/* Read-only controller state first, then its configuration values. */
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "error", CTLFLAG_RD,
+	    &pc->pc_error, 0, "Current difference from setpoint value (P)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "olderror", CTLFLAG_RD,
+	    &pc->pc_olderror, 0, "Error value from last interval");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "integral", CTLFLAG_RD,
+	    &pc->pc_integral, 0, "Accumulated error integral (I)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "derivative",
+	    CTLFLAG_RD, &pc->pc_derivative, 0, "Error derivative (D)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "input", CTLFLAG_RD,
+	    &pc->pc_input, 0, "Last controller process variable input");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "output", CTLFLAG_RD,
+	    &pc->pc_output, 0, "Last controller output");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "ticks", CTLFLAG_RD,
+	    &pc->pc_ticks, 0, "Last controller runtime");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "setpoint", CTLFLAG_RW,
+	    &pc->pc_setpoint, 0, "Desired level for process variable");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "interval", CTLFLAG_RD,
+	    &pc->pc_interval, 0, "Interval between calculations (ticks)");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "bound", CTLFLAG_RW,
+	    &pc->pc_bound, 0, "Integral wind-up limit");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "kpd", CTLFLAG_RW,
+	    &pc->pc_Kpd, 0, "Inverse of proportional gain");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "kid", CTLFLAG_RW,
+	    &pc->pc_Kid, 0, "Inverse of integral gain");
+	SYSCTL_ADD_INT(NULL, parent, OID_AUTO, "kdd", CTLFLAG_RW,
+	    &pc->pc_Kdd, 0, "Inverse of derivative gain");
+}
+
+/* Classic PID step: signed output, integral held in [-bound, bound]. */
+int
+pidctrl_classic(struct pidctrl *pc, int input)
+{
+	int output, error;
+	int Kpd, Kid, Kdd;
+
+	error = pc->pc_setpoint - input;
+	pc->pc_ticks = ticks;
+	pc->pc_olderror = pc->pc_error;
+
+	/* Clamp the sysctl-writable divisors to >= 1: no divide by zero. */
+	Kpd = MAX(pc->pc_Kpd, 1);
+	Kid = MAX(pc->pc_Kid, 1);
+	Kdd = MAX(pc->pc_Kdd, 1);
+
+	/* Compute P (proportional error), I (integral), D (derivative) */
+	pc->pc_error = error;
+	pc->pc_integral =
+	    MAX(MIN(pc->pc_integral + error, pc->pc_bound), -pc->pc_bound);
+	pc->pc_derivative = error - pc->pc_olderror;
+
+	/* Divide by the clamped locals, never the raw pc_K* fields. */
+	output = (pc->pc_error / Kpd) + (pc->pc_integral / Kid) +
+	    (pc->pc_derivative / Kdd);
+	/* Save for sysctl. */
+	pc->pc_output = output;
+	pc->pc_input = input;
+
+	return output;
+}
+
+/* Daemon PID step: integral in [0, bound]; returns new work, never < 0. */
+int
+pidctrl_daemon(struct pidctrl *pc, int input)
+{
+	int output, error;
+	int Kpd, Kid, Kdd;
+
+	error = pc->pc_setpoint - input;
+	/*
+	 * When the interval expires we reset our variables and start a new
+	 * interval.  If we're called multiple times during one interval
+	 * we attempt to report a target as if the entire error came at
+	 * the interval boundary.
+	 */
+	if ((u_int)(ticks - pc->pc_ticks) >= pc->pc_interval) {
+		pc->pc_ticks = ticks;
+		pc->pc_olderror = pc->pc_error;
+		pc->pc_output = pc->pc_error = 0;
+	} else {
+		error = MAX(error + pc->pc_error, 0);
+	}
+
+	/* Clamp the sysctl-writable divisors to >= 1: no divide by zero. */
+	Kpd = MAX(pc->pc_Kpd, 1);
+	Kid = MAX(pc->pc_Kid, 1);
+	Kdd = MAX(pc->pc_Kdd, 1);
+
+	/* Compute P (proportional error), I (integral), D (derivative) */
+	pc->pc_error = error;
+	pc->pc_integral =
+	    MAX(MIN(pc->pc_integral + error, pc->pc_bound), 0);
+	pc->pc_derivative = error - pc->pc_olderror;
+
+	/* Divide by clamped gains, then report only work not yet returned. */
+	output = (error / Kpd) + (pc->pc_integral / Kid) +
+	    (pc->pc_derivative / Kdd);
+	output = MAX(output - pc->pc_output, 0);
+	pc->pc_output += output;
+	pc->pc_input = input;
+
+	return output;
+}

Added: head/sys/sys/pidctrl.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/sys/pidctrl.h	Fri Feb 23 22:51:51 2018	(r329882)
@@ -0,0 +1,123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2017,  Jeffrey Roberson <jeff at freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_PIDCTRL_H_
+#define _SYS_PIDCTRL_H_
+
+/*
+ * Proportional Integral Derivative controller.
+ *
+ * This controller is intended to replace a multitude of threshold based
+ * daemon regulation systems.  These systems produce sharp sawtooths of
+ * activity which can cause latency spikes and other undesirable bursty
+ * behavior.  The PID controller adapts to changing load conditions and
+ * adjusts the work done by the daemon to keep a smoother output.
+ *
+ * The setpoint can be thought of as a single watermark that the controller
+ * is always trying to reach.  Compared to a high water/low water type
+ * algorithm the pid controller is dynamically deciding the low water and
+ * regulating to the high water.  The setpoint should be high enough that
+ * the controller and daemon have time to observe the rise in value and
+ * respond to it, else the resource may be exhausted.  More frequent wakeups
+ * permit higher setpoints and less underutilized resources.
+ *
+ * The controller has been optimised for simplicity of math making it quite
+ * inexpensive to execute.  There is no floating point and so the gains must
+ * be the inverse of whole integers.
+ *
+ * Failing to measure and tune the gain parameters can result in wild
+ * oscillations in output.  It is strongly encouraged that controllers are
+ * tested and tuned under a wide variety of workloads before gain values are
+ * picked.  Some reasonable defaults are provided below.
+ */
+
+struct pidctrl {
+	/* Saved control variables, updated by the controller functions. */
+	int	pc_error;		/* Current error. */
+	int	pc_olderror;		/* Previous error, for derivative. */
+	int	pc_integral;		/* Clamped error accumulator. */
+	int	pc_derivative;		/* pc_error - pc_olderror. */
+	int	pc_input;		/* Last input. */
+	int	pc_output;		/* Output (daemon: interval sum). */
+	int	pc_ticks;		/* Last sampling time, in ticks. */
+	/* Configuration, exported via sysctl (pc_interval is read-only). */
+	int	pc_setpoint;		/* Desired level of the input. */
+	int	pc_interval;		/* Update interval in ticks. */
+	int	pc_bound;		/* Integral wind-up limit. */
+	int	pc_Kpd;			/* Proportional gain divisor. */
+	int	pc_Kid;			/* Integral gain divisor. */
+	int	pc_Kdd;			/* Derivative gain divisor. */
+};
+
+/*
+ * Reasonable default divisors.
+ *
+ * Actual gains are 1/divisor.  Gains interact in complex ways with the
+ * setpoint and interval.  Measurement under multiple loads should be
+ * taken to ensure adequate stability and rise time.
+ */
+#define	PIDCTRL_KPD	3		/* Default proportional divisor. */
+#define	PIDCTRL_KID	4		/* Default integral divisor. */
+#define	PIDCTRL_KDD	8		/* Default derivative divisor. */
+#define	PIDCTRL_BOUND	4		/* Bound factor, setpoint multiple. */
+
+struct sysctl_oid_list;
+
+void	pidctrl_init(struct pidctrl *pc, int interval, int setpoint,
+	    int bound, int Kpd, int Kid, int Kdd);
+void	pidctrl_init_sysctl(struct pidctrl *pc, struct sysctl_oid_list *parent);
+
+/*
+ * This is the classic PID controller where the integral is clamped to
+ * [-bound, bound] and the output may be negative.  This should be used
+ * in continuous control loops that can adjust a process variable in
+ * either direction.  This is a discrete time controller and should
+ * only be called once per-interval or the derivative term will be
+ * inaccurate.
+ */
+int	pidctrl_classic(struct pidctrl *pc, int input);
+
+/*
+ * This controller is intended for consumer type daemons that can only
+ * regulate in a positive direction, that is to say, they can not exert
+ * positive pressure on the process variable or input.  They can only
+ * reduce it by doing work.  As such the integral is bound between [0, bound]
+ * and the output is similarly a positive value reflecting the units of
+ * work necessary to be completed in the current interval to eliminate error.
+ *
+ * It is a discrete time controller but can be invoked more than once in a
+ * given time interval for ease of client implementation.  This should only
+ * be done in overload situations or the controller may not produce a stable
+ * output.  Calling it less frequently when there is no work to be done will
+ * increase the rise time but should otherwise be harmless.
+ */
+int	pidctrl_daemon(struct pidctrl *pc, int input);
+
+#endif	/* !_SYS_PIDCTRL_H_ */

Modified: head/sys/vm/vm_meter.c
==============================================================================
--- head/sys/vm/vm_meter.c	Fri Feb 23 22:23:28 2018	(r329881)
+++ head/sys/vm/vm_meter.c	Fri Feb 23 22:51:51 2018	(r329882)
@@ -473,3 +473,58 @@ vm_laundry_count(void)
 	return vm_pagequeue_count(PQ_LAUNDRY);
 }
 
+/*
+ * Create the sysctl nodes for one VM domain under parent: a node named
+ * by vmd_name (remembered in vmd_oid) and, beneath it, a "stats" node
+ * holding read-only page counts, targets and thresholds for the domain.
+ */
+static void
+vm_domain_stats_init(struct vm_domain *vmd, struct sysctl_oid *parent)
+{
+	struct sysctl_oid_list *stats;
+	struct sysctl_oid *oid;
+
+	vmd->vmd_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
+	    vmd->vmd_name, CTLFLAG_RD, NULL, "");
+	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
+	    "stats", CTLFLAG_RD, NULL, "");
+	stats = SYSCTL_CHILDREN(oid);
+
+	/* Current page counts. */
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "free_count", CTLFLAG_RD,
+	    &vmd->vmd_free_count, 0, "Free pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "active", CTLFLAG_RD,
+	    &vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt, 0, "Active pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "inactive", CTLFLAG_RD,
+	    &vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt, 0, "Inactive pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "laundry", CTLFLAG_RD,
+	    &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt, 0, "laundry pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "unswappable", CTLFLAG_RD,
+	    &vmd->vmd_pagequeues[PQ_UNSWAPPABLE].pq_cnt, 0,
+	    "Unswappable pages");
+	/* Paging targets and thresholds. */
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "inactive_target", CTLFLAG_RD,
+	    &vmd->vmd_inactive_target, 0, "Target inactive pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "free_target", CTLFLAG_RD,
+	    &vmd->vmd_free_target, 0, "Target free pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "free_reserved", CTLFLAG_RD,
+	    &vmd->vmd_free_reserved, 0, "Reserved free pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "free_min", CTLFLAG_RD,
+	    &vmd->vmd_free_min, 0, "Minimum free pages");
+	SYSCTL_ADD_UINT(NULL, stats, OID_AUTO, "free_severe", CTLFLAG_RD,
+	    &vmd->vmd_free_severe, 0, "Severe free pages");
+}
+
+/* SYSINIT hook: create vm.domain, then one stats subtree per domain. */
+static void
+vm_stats_init(void *arg __unused)
+{
+	struct sysctl_oid *root;
+	int dom;
+
+	root = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_vm), OID_AUTO,
+	    "domain", CTLFLAG_RD, NULL, "");
+	for (dom = 0; dom < vm_ndomains; dom++)
+		vm_domain_stats_init(VM_DOMAIN(dom), root);
+}
+SYSINIT(vmstats_init, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_stats_init, NULL);

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c	Fri Feb 23 22:23:28 2018	(r329881)
+++ head/sys/vm/vm_page.c	Fri Feb 23 22:51:51 2018	(r329882)
@@ -430,6 +430,7 @@ vm_page_domain_init(int domain)
 		    MTX_DEF | MTX_DUPOK);
 	}
 	mtx_init(&vmd->vmd_free_mtx, "vm page free queue", NULL, MTX_DEF);
+	snprintf(vmd->vmd_name, sizeof(vmd->vmd_name), "%d", domain);
 }
 
 /*

Modified: head/sys/vm/vm_pageout.c
==============================================================================
--- head/sys/vm/vm_pageout.c	Fri Feb 23 22:23:28 2018	(r329881)
+++ head/sys/vm/vm_pageout.c	Fri Feb 23 22:51:51 2018	(r329882)
@@ -124,7 +124,7 @@ static void vm_pageout(void);
 static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
 static int vm_pageout_cluster(vm_page_t m);
-static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
+static bool vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage);
 
@@ -146,7 +146,7 @@ SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
 
 /* Pagedaemon activity rates, in subdivisions of one second. */
 #define	VM_LAUNDER_RATE		10
-#define	VM_INACT_SCAN_RATE	2
+#define	VM_INACT_SCAN_RATE	10
 
 static int vm_pageout_oom_seq = 12;
 
@@ -1104,7 +1104,7 @@ dolaundry:
  * queue scan to meet the target.
  */
 static bool
-vm_pageout_scan(struct vm_domain *vmd, int pass)
+vm_pageout_scan(struct vm_domain *vmd, int pass, int shortage)
 {
 	vm_page_t m, next;
 	struct vm_pagequeue *pq;
@@ -1148,7 +1148,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	 */
 	if (pass > 0) {
 		deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
-		page_shortage = vm_paging_target(vmd) + deficit;
+		page_shortage = shortage + deficit;
 	} else
 		page_shortage = deficit = 0;
 	starting_page_shortage = page_shortage;
@@ -1398,7 +1398,7 @@ drop_page:
 	 */
 	inactq_shortage = vmd->vmd_inactive_target - (pq->pq_cnt +
 	    vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight) +
-	    vm_paging_target(vmd) + deficit + addl_page_shortage;
+	    shortage + deficit + addl_page_shortage;
 	inactq_shortage *= act_scan_laundry_weight;
 
 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
@@ -1764,12 +1764,13 @@ static void
 vm_pageout_worker(void *arg)
 {
 	struct vm_domain *vmd;
-	int domain, pass;
+	int domain, pass, shortage;
 	bool target_met;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	pass = 0;
+	shortage = 0;
 	target_met = true;
 
 	/*
@@ -1792,48 +1793,39 @@ vm_pageout_worker(void *arg)
 		vm_domain_free_lock(vmd);
 
 		/*
-		 * Do not clear vmd_pageout_wanted until we reach our free page
-		 * target.  Otherwise, we may be awakened over and over again,
-		 * wasting CPU time.
+		 * Might the page daemon need to run again?
 		 */
-		if (vmd->vmd_pageout_wanted && target_met)
-			vmd->vmd_pageout_wanted = false;
-
-		/*
-		 * Might the page daemon receive a wakeup call?
-		 */
-		if (vmd->vmd_pageout_wanted) {
+		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 			/*
-			 * No.  Either vmd_pageout_wanted was set by another
-			 * thread during the previous scan, which must have
-			 * been a level 0 scan, or vmd_pageout_wanted was
-			 * already set and the scan failed to free enough
-			 * pages.  If we haven't yet performed a level >= 1
-			 * (page reclamation) scan, then increase the level
-			 * and scan again now.  Otherwise, sleep a bit and
-			 * try again later.
+			 * Yes, the scan failed to free enough pages.  If
+			 * we have performed a level >= 1 (page reclamation)
+			 * scan, then sleep a bit and try again.
 			 */
 			vm_domain_free_unlock(vmd);
-			if (pass >= 1)
+			if (pass > 1)
 				pause("pwait", hz / VM_INACT_SCAN_RATE);
-			pass++;
 		} else {
 			/*
-			 * Yes.  If threads are still sleeping in vm_wait()
-			 * then we immediately start a new scan.  Otherwise,
-			 * sleep until the next wakeup or until pages need to
-			 * have their reference stats updated.
+			 * No, sleep until the next wakeup or until pages
+			 * need to have their reference stats updated.
 			 */
+			vmd->vmd_pageout_wanted = false;
 			if (mtx_sleep(&vmd->vmd_pageout_wanted,
 			    vm_domain_free_lockptr(vmd), PDROP | PVM,
-			    "psleep", hz) == 0) {
+			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 				VM_CNT_INC(v_pdwakeups);
-				pass = 1;
-			} else
-				pass = 0;
 		}
+		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
+		if (shortage && pass == 0)
+			pass = 1;
 
-		target_met = vm_pageout_scan(vmd, pass);
+		target_met = vm_pageout_scan(vmd, pass, shortage);
+		/*
+		 * If the target was not met we must increase the pass to
+		 * more aggressively reclaim.
+		 */
+		if (!target_met)
+			pass++;
 	}
 }
 
@@ -1844,6 +1836,7 @@ static void
 vm_pageout_init_domain(int domain)
 {
 	struct vm_domain *vmd;
+	struct sysctl_oid *oid;
 
 	vmd = VM_DOMAIN(domain);
 	vmd->vmd_interrupt_free_min = 2;
@@ -1870,10 +1863,10 @@ vm_pageout_init_domain(int domain)
 		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
 
 	/*
-	 * Set the default wakeup threshold to be 10% above the minimum
-	 * page limit.  This keeps the steady state out of shortfall.
+	 * Set the default wakeup threshold to be 10% below the paging
+	 * target.  This keeps the steady state out of shortfall.
 	 */
-	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_min / 10) * 11;
+	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 
 	/*
 	 * Target amount of memory to move out of the laundry queue during a
@@ -1882,6 +1875,14 @@ vm_pageout_init_domain(int domain)
 	 */
 	vmd->vmd_background_launder_target = (vmd->vmd_free_target -
 	    vmd->vmd_free_min) / 10;
+
+	/* Initialize the pageout daemon pid controller. */
+	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
+	    vmd->vmd_free_target, PIDCTRL_BOUND,
+	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
+	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
+	    "pidctrl", CTLFLAG_RD, NULL, "");
+	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 }
 
 static void

Modified: head/sys/vm/vm_pagequeue.h
==============================================================================
--- head/sys/vm/vm_pagequeue.h	Fri Feb 23 22:23:28 2018	(r329881)
+++ head/sys/vm/vm_pagequeue.h	Fri Feb 23 22:51:51 2018	(r329882)
@@ -73,6 +73,8 @@ struct vm_pagequeue {
 	const char	* const pq_name;
 } __aligned(CACHE_LINE_SIZE);
 
+#include <sys/pidctrl.h>
+struct sysctl_oid;
 
 struct vm_domain {
 	struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
@@ -83,6 +85,7 @@ struct vm_domain {
 	long vmd_segs;			/* bitmask of the segments */
 
 	/* Paging control variables, locked by domain_free_mtx. */
+	struct pidctrl vmd_pid;		/* Pageout controller. */
 	u_int vmd_free_count;
 	boolean_t vmd_oom;
 	int vmd_oom_seq;
@@ -113,6 +116,10 @@ struct vm_domain {
 	u_int vmd_pageout_wakeup_thresh;/* (c) min pages to wake pagedaemon */
 	u_int vmd_interrupt_free_min;	/* (c) reserved pages for int code */
 	u_int vmd_free_severe;		/* (c) severe page depletion point */
+
+	/* Name for sysctl etc. */
+	struct sysctl_oid *vmd_oid;
+	char vmd_name[sizeof(__XSTRING(MAXMEMDOM))];
 } __aligned(CACHE_LINE_SIZE);
 
 extern struct vm_domain vm_dom[MAXMEMDOM];


More information about the svn-src-all mailing list