svn commit: r270759 - in head/sys: cddl/compat/opensolaris/kern cddl/compat/opensolaris/sys cddl/contrib/opensolaris/uts/common/fs/zfs vm

Steven Hartland smh at FreeBSD.org
Thu Aug 28 19:50:10 UTC 2014


Author: smh
Date: Thu Aug 28 19:50:08 2014
New Revision: 270759
URL: http://svnweb.freebsd.org/changeset/base/270759

Log:
  Refactor ZFS ARC reclaim logic to be more VM cooperative
  
  Prior to this change we triggered ARC reclaim when kmem usage passed 3/4
  of the total available, as indicated by vmem_size(kmem_arena, VMEM_ALLOC).
  
  This could lead large amounts of unused RAM e.g. on a 192GB machine with
  ARC the only major RAM consumer, 40GB of RAM would remain unused.
  
  The old method has also been seen to result in extreme RAM usage under
  certain loads, causing poor performance and stalls.
  
  We now trigger ARC reclaim when the number of free pages drops below the
  value defined by the new sysctl vfs.zfs.arc_free_target, which defaults
  to the value of vm.v_free_target.
  
  Credit to Karl Denninger for the original patch on which this update was
  based.
  
  PR:		191510 and 187594
  Tested by:	dteske
  MFC after:	1 week
  Relnotes:	yes
  Sponsored by:	Multiplay

Modified:
  head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
  head/sys/cddl/compat/opensolaris/sys/kmem.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  head/sys/vm/vm_pageout.c

Modified: head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
==============================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	Thu Aug 28 18:59:39 2014	(r270758)
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	Thu Aug 28 19:50:08 2014	(r270759)
@@ -126,18 +126,47 @@ kmem_size_init(void *unused __unused)
 }
 SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
 
-uint64_t
-kmem_size(void)
+/*
+ * The return values from kmem_free_* are only valid once the pagedaemon
+ * has been initialised, before then they return 0.
+ * 
+ * To ensure the returns are valid the caller can use a SYSINIT with
+ * subsystem set to SI_SUB_KTHREAD_PAGE and an order of at least
+ * SI_ORDER_SECOND.
+ */
+u_int
+kmem_free_target(void)
 {
 
-	return (kmem_size_val);
+	return (vm_cnt.v_free_target);
+}
+
+u_int
+kmem_free_min(void)
+{
+
+	return (vm_cnt.v_free_min);
+}
+
+u_int
+kmem_free_count(void)
+{
+
+	return (vm_cnt.v_free_count);
+}
+
+u_int
+kmem_page_count(void)
+{
+
+	return (vm_cnt.v_page_count);
 }
 
 uint64_t
-kmem_used(void)
+kmem_size(void)
 {
 
-	return (vmem_size(kmem_arena, VMEM_ALLOC));
+	return (kmem_size_val);
 }
 
 static int

Modified: head/sys/cddl/compat/opensolaris/sys/kmem.h
==============================================================================
--- head/sys/cddl/compat/opensolaris/sys/kmem.h	Thu Aug 28 18:59:39 2014	(r270758)
+++ head/sys/cddl/compat/opensolaris/sys/kmem.h	Thu Aug 28 19:50:08 2014	(r270759)
@@ -66,7 +66,16 @@ typedef struct kmem_cache {
 void *zfs_kmem_alloc(size_t size, int kmflags);
 void zfs_kmem_free(void *buf, size_t size);
 uint64_t kmem_size(void);
-uint64_t kmem_used(void);
+u_int kmem_page_count(void);
+
+/*
+ * The return values from kmem_free_* are only valid once the pagedaemon
+ * has been initialised, before then they return 0.
+ */
+u_int kmem_free_count(void);
+u_int kmem_free_target(void);
+u_int kmem_free_min(void);
+
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
     int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Thu Aug 28 18:59:39 2014	(r270758)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Thu Aug 28 19:50:08 2014	(r270759)
@@ -193,9 +193,6 @@ extern int zfs_prefetch_disable;
  */
 static boolean_t arc_warm;
 
-/*
- * These tunables are for performance analysis.
- */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
@@ -204,6 +201,20 @@ int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = (1 << 19); /* default before pagedaemon init only */
+
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+
+#ifdef _KERNEL
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = kmem_free_target();
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+#endif
 
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 SYSCTL_DECL(_vfs_zfs);
@@ -214,6 +225,35 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
     &zfs_arc_average_blocksize, 0,
     "ARC average blocksize");
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < kmem_free_min())
+		return (EINVAL);
+	if (val > kmem_page_count())
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -2418,9 +2458,12 @@ arc_flush(spa_t *spa)
 void
 arc_shrink(void)
 {
+
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
+		DTRACE_PROBE2(arc__shrink, uint64_t, arc_c, uint64_t,
+			arc_c_min);
 #ifdef _KERNEL
 		to_free = arc_c >> arc_shrink_shift;
 #else
@@ -2440,8 +2483,11 @@ arc_shrink(void)
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+			uint64_t, arc_c);
 		arc_adjust();
+	}
 }
 
 static int needfree = 0;
@@ -2452,15 +2498,25 @@ arc_reclaim_needed(void)
 
 #ifdef _KERNEL
 
-	if (needfree)
+	if (needfree) {
+		DTRACE_PROBE(arc__reclaim_needfree);
 		return (1);
+	}
+
+	if (kmem_free_count() < zfs_arc_free_target) {
+		DTRACE_PROBE2(arc__reclaim_freetarget, uint64_t,
+		    kmem_free_count(), uint64_t, zfs_arc_free_target);
+		return (1);
+	}
 
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	if (vm_paging_needed())
+	if (vm_paging_needed()) {
+		DTRACE_PROBE(arc__reclaim_paging);
 		return (1);
+	}
 
 #ifdef sun
 	/*
@@ -2504,15 +2560,14 @@ arc_reclaim_needed(void)
 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
 		return (1);
 #endif
-#else	/* !sun */
-	if (kmem_used() > (kmem_size() * 3) / 4)
-		return (1);
 #endif	/* sun */
 
 #else
 	if (spa_get_random(100) == 0)
 		return (1);
 #endif
+	DTRACE_PROBE(arc__reclaim_no);
+
 	return (0);
 }
 

Modified: head/sys/vm/vm_pageout.c
==============================================================================
--- head/sys/vm/vm_pageout.c	Thu Aug 28 18:59:39 2014	(r270758)
+++ head/sys/vm/vm_pageout.c	Thu Aug 28 19:50:08 2014	(r270759)
@@ -115,10 +115,14 @@ __FBSDID("$FreeBSD$");
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
+static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
 
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
+    NULL);
+
 struct proc *pageproc;
 
 static struct kproc_desc page_kp = {
@@ -126,7 +130,7 @@ static struct kproc_desc page_kp = {
 	vm_pageout,
 	&pageproc
 };
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
 #if !defined(NO_SWAPPING)
@@ -1640,15 +1644,11 @@ vm_pageout_worker(void *arg)
 }
 
 /*
- *	vm_pageout is the high level pageout daemon.
+ *	vm_pageout_init initialises basic pageout daemon settings.
  */
 static void
-vm_pageout(void)
+vm_pageout_init(void)
 {
-#if MAXMEMDOM > 1
-	int error, i;
-#endif
-
 	/*
 	 * Initialize some paging parameters.
 	 */
@@ -1694,6 +1694,17 @@ vm_pageout(void)
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
 		vm_page_max_wired = vm_cnt.v_free_count / 3;
+}
+
+/*
+ *     vm_pageout is the high level pageout daemon.
+ */
+static void
+vm_pageout(void)
+{
+#if MAXMEMDOM > 1
+	int error, i;
+#endif
 
 	swap_pager_swap_init();
 #if MAXMEMDOM > 1


More information about the svn-src-all mailing list