svn commit: r337660 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs

Sat Aug 11 22:01:53 UTC 2018

Author: mmacy
Date: Sat Aug 11 22:01:52 2018
New Revision: 337660
URL: https://svnweb.freebsd.org/changeset/base/337660

Log:
  Enable balanced arc pruning
  
  Taken from:
  commit f6046738365571bd647f804958dfdff8a32fbde4
  Author: Brian Behlendorf <behlendorf1 at llnl.gov>
  Date:   Sat May 30 09:57:53 2015 -0500
  
      Make arc_prune() asynchronous
  
      As described in the comment above arc_adapt_thread() it is critical
      that the arc_adapt_thread() function never sleep while holding a hash
      lock.  This behavior was possible in the Linux implementation because
      the arc_prune() logic was implemented to be synchronous.  Under
      illumos the analogous dnlc_reduce_cache() function is asynchronous.
  
      To address this the arc_do_user_prune() function has been reworked
      into two new functions as follows:
  
      * arc_prune_async() is an asynchronous implementation which dispatches
      the prune callback to be run by the system taskq.  This makes it
      suitable to use in the context of the arc_adapt_thread().
  
      * arc_prune() is a synchronous implementation which depends on the
      arc_prune_async() implementation but blocks until the outstanding
      callbacks complete.  This is used in arc_kmem_reap_now() where it
      is safe, and expected, that memory will be freed.
  
      This patch additionally adds the zfs_arc_meta_strategy module option
      which allows the meta reclaim strategy to be configured.  It defaults
      to a balanced strategy which has been proved to work well under Linux
      but the illumos meta-only strategy can be enabled.
  
      Signed-off-by: Tim Chase <tim at chase2k.com>
      Signed-off-by: Brian Behlendorf <behlendorf1 at llnl.gov>

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================

--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sat Aug 11 21:10:08 2018	(r337659)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sat Aug 11 22:01:52 2018	(r337660)
@@ -525,6 +525,14 @@ typedef struct arc_state {
 	refcount_t arcs_size;
 } arc_state_t;
 
+/*
+ * ARC metadata tunables; the dnode limit is a percentage of ARC meta buffers.
+ */
+int zfs_arc_meta_prune = 10000;
+unsigned long zfs_arc_dnode_limit_percent = 10;
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+int zfs_arc_meta_adjust_restarts = 4096;
+
 /* The 6 states: */
 static arc_state_t ARC_anon;
 static arc_state_t ARC_mru;
@@ -4076,11 +4084,114 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int6
 }
 
 /*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit.  Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers.  In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffers to release.
+ *
+ * Therefore, this function makes alternating passes over the ARC, evicting
+ * data buffers and then newly unheld meta data buffers, restarting at most
+ * zfs_arc_meta_adjust_restarts times.  Normally this is sufficient, but if
+ * required the registered prune callbacks are requested through
+ * arc_prune_async(), causing dentries and inodes to be dropped from the VFS
+ * cache and making dnode meta data buffers available for reclaim.  Returns
+ * the sum of the values returned by the arc_adjust_impl() calls.
+ */
+static uint64_t
+arc_adjust_meta_balanced(uint64_t meta_used)
+{
+	int64_t delta, prune = 0, adjustmnt;
+	uint64_t total_evicted = 0;
+	arc_buf_contents_t type = ARC_BUFC_DATA;
+	int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+
+restart:
+	/*
+	 * This differs slightly from the way we evict from the mru in
+	 * arc_adjust because we don't have a "target" value (i.e. no
+	 * "meta" arc_p). As a result, I think we can completely
+	 * cannibalize the metadata in the MRU before we evict the
+	 * metadata from the MFU. I think we probably need to implement a
+	 * "metadata arc_p" value to do this properly.
+	 */
+	adjustmnt = meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
+		    adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mru, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	/*
+	 * We can't afford to recalculate adjustmnt here. If we do,
+	 * new metadata buffers can sneak into the MRU or ANON lists,
+	 * thus penalizing the MFU metadata. Although the fudge factor is
+	 * small, it has been empirically shown to be significant for
+	 * certain workloads (e.g. creating many empty directories). As
+	 * such, we use the original calculation for adjustmnt, and
+	 * simply decrement the amount of data evicted from the MRU.
+	 */
+
+	if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]),
+		    adjustmnt);
+		total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
+	}
+
+	adjustmnt = meta_used - arc_meta_limit;
+
+	if (adjustmnt > 0 &&
+	    refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
+		delta = MIN(adjustmnt,
+		    refcount_count(&arc_mru_ghost->arcs_esize[type]));
+		total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type);
+		adjustmnt -= delta;
+	}
+
+	if (adjustmnt > 0 &&
+	    refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
+		delta = MIN(adjustmnt,
+		    refcount_count(&arc_mfu_ghost->arcs_esize[type]));
+		total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type);
+	}
+
+	/*
+	 * If the meta limit is still exceeded, switch buffer type and, after
+	 * each metadata pass, ask the higher layers to drop cached objects
+	 * holding ARC meta buffers, using increasingly large scan sizes.
+	 * NOTE(review): meta_used is a by-value parameter that is never
+	 * updated here, so evictions cannot make this test false -- confirm.
+	 */
+	if (meta_used > arc_meta_limit) {
+		if (type == ARC_BUFC_DATA) {
+			type = ARC_BUFC_METADATA;
+		} else {
+			type = ARC_BUFC_DATA;
+
+			if (zfs_arc_meta_prune) {
+				prune += zfs_arc_meta_prune;
+				arc_prune_async(prune);
+			}
+		}
+
+		if (restarts > 0) {
+			restarts--;
+			goto restart;
+		}
+	}
+	return (total_evicted);
+}
+
+/*
  * Evict metadata buffers from the cache, such that arc_meta_used is
  * capped by the arc_meta_limit tunable.
  */
 static uint64_t
-arc_adjust_meta(uint64_t meta_used)
+arc_adjust_meta_only(uint64_t meta_used)
 {
 	uint64_t total_evicted = 0;
 	int64_t target;
@@ -4110,6 +4221,15 @@ arc_adjust_meta(uint64_t meta_used)
 	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
 
 	return (total_evicted);
+}
+
+static uint64_t
+arc_adjust_meta(uint64_t meta_used)
+{
+	/* Choose the eviction strategy; anything but META_ONLY is balanced. */
+	if (zfs_arc_meta_strategy != ARC_STRATEGY_META_ONLY)
+		return (arc_adjust_meta_balanced(meta_used));
+	return (arc_adjust_meta_only(meta_used));
+}
 
 /*