svn commit: r323433 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys

Andriy Gapon avg at FreeBSD.org
Mon Sep 11 11:31:45 UTC 2017


Author: avg
Date: Mon Sep 11 11:31:43 2017
New Revision: 323433
URL: https://svnweb.freebsd.org/changeset/base/323433

Log:
  MFV r323110: 8558 lwp_create() returns EAGAIN on system with more than 80K ZFS filesystems
  
  illumos/illumos-gate at 216d7723a1a58124cf95c4950d51d5f99d3f4128
  https://github.com/illumos/illumos-gate/commit/216d7723a1a58124cf95c4950d51d5f99d3f4128
  
  https://www.illumos.org/issues/8558
    On a system with more than 80K ZFS filesystems, we've seen cases where
    lwp_create() will start to fail by returning EAGAIN. The problem being,
    for each of those 80K ZFS filesystems, a taskq will be created for each
    dataset as part of the ZIL for each dataset.
    For each of these taskq's, a kernel thread will be created which results
    in 24KB being allocated for each thread. With enough of these 24KB
    allocations, we eventually exhaust the memory region set aside for these
    allocations. Currently, segkpsize is set to a value of 2GB, which means
    we can only support about 80K filesystems; 2GB / 24KB = ~80K.
    The lwp_create() failure comes into play due to the fact that LWP
    creation also allocates 24KB from this same region of memory. Thus, if
    we've exhausted this region of memory due to the number of ZIL taskq's,
    there won't be any memory avaible to allow the call to lwp_create() to
    succeed.
  
  FreeBSD note: I haven't created sysctl-s for the new ZIL clean
  parameters.  Let's add them if anyone requires to tune them.
  
  Reviewed by: George Wilson <george.wilson at delphix.com>
  Reviewed by: Sebastien Roy <sebastien.roy at delphix.com>
  Approved by: Robert Mustacchi <rm at joyent.com>
  Author: Prakash Surya <prakash.surya at delphix.com>
  MFC after:	3 weeks

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	Mon Sep 11 10:41:42 2017	(r323432)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	Mon Sep 11 11:31:43 2017	(r323433)
@@ -137,6 +137,36 @@ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
  */
 int zfs_sync_taskq_batch_pct = 75;
 
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that needs to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc").
+ *
+ * The idea being, we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry from failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
+
 #if defined(__FreeBSD__) && defined(_KERNEL)
 
 extern int zfs_vdev_async_write_active_max_dirty_percent;
@@ -272,6 +302,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
 	    TASKQ_THREADS_CPU_PCT);
 
+	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+	    zfs_zil_clean_taskq_minalloc,
+	    zfs_zil_clean_taskq_maxalloc,
+	    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
@@ -422,6 +458,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	txg_list_destroy(&dp->dp_sync_tasks);
 	txg_list_destroy(&dp->dp_dirty_dirs);
 
+	taskq_destroy(dp->dp_zil_clean_taskq);
 	taskq_destroy(dp->dp_sync_taskq);
 
 	/*

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	Mon Sep 11 10:41:42 2017	(r323432)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	Mon Sep 11 11:31:43 2017	(r323433)
@@ -122,6 +122,8 @@ typedef struct dsl_pool {
 	txg_list_t dp_dirty_dirs;
 	txg_list_t dp_sync_tasks;
 	taskq_t *dp_sync_taskq;
+	taskq_t *dp_zil_clean_taskq;
+	txg_list_t dp_early_sync_tasks;
 
 	/*
 	 * Protects administrative changes (properties, namespace)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Mon Sep 11 10:41:42 2017	(r323432)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Mon Sep 11 11:31:43 2017	(r323433)
@@ -124,7 +124,6 @@ struct zilog {
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
 	avl_tree_t	zl_vdev_tree;	/* vdevs to flush in zil_commit() */
-	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
 	avl_tree_t	zl_bp_tree;	/* track bps during log parse */
 	clock_t		zl_replay_time;	/* lbolt of when replay started */
 	uint64_t	zl_replay_blks;	/* number of log blocks replayed */

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Mon Sep 11 10:41:42 2017	(r323432)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Mon Sep 11 11:31:43 2017	(r323433)
@@ -1407,8 +1407,7 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 		return;
 	}
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
-	ASSERT(itxg->itxg_txg != 0);
-	ASSERT(zilog->zl_clean_taskq != NULL);
+	ASSERT3U(itxg->itxg_txg, !=, 0);
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
@@ -1419,7 +1418,9 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 	 * free it in-line. This should be rare. Note, using TQ_SLEEP
 	 * created a bad performance problem.
 	 */
-	if (taskq_dispatch(zilog->zl_clean_taskq,
+	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+	if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
 	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
 		zil_itxg_clean(clean_me);
 }
@@ -1848,13 +1849,10 @@ zil_open(objset_t *os, zil_get_data_t *get_data)
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 
-	ASSERT(zilog->zl_clean_taskq == NULL);
 	ASSERT(zilog->zl_get_data == NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_get_data = get_data;
-	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
-	    2, 2, TASKQ_PREPOPULATE);
 
 	return (zilog);
 }
@@ -1888,8 +1886,6 @@ zil_close(zilog_t *zilog)
 		zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
 	VERIFY(!zilog_is_dirty(zilog));
 
-	taskq_destroy(zilog->zl_clean_taskq);
-	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
 
 	/*


More information about the svn-src-head mailing list