svn commit: r344569 - in head/cddl/contrib/opensolaris: cmd/zfs lib/libzfs/common

Cy Schubert <Cy.Schubert at cschubert.com>
Tue Feb 26 16:13:37 UTC 2019


On February 26, 2019 7:48:27 AM PST, Cy Schubert <Cy.Schubert at cschubert.com> wrote:
>On February 26, 2019 12:18:35 AM PST, Baptiste Daroussin <bapt at FreeBSD.org> wrote:
>>Author: bapt
>>Date: Tue Feb 26 08:18:34 2019
>>New Revision: 344569
>>URL: https://svnweb.freebsd.org/changeset/base/344569
>>
>>Log:
>>  Implement parallel mounting for ZFS filesystems
>>  
>>  It was first implemented on Illumos and then ported to ZoL.
>>  This patch is a port to FreeBSD of the ZoL version.
>>  This patch also includes a fix for a race condition that was amended.
>>  
>>  With this patch Delphix has seen a huge decrease in the latency of the
>>  mount phase (https://github.com/openzfs/openzfs/commit/a3f0e2b569 for
>>  details). With the current change Gandi has measured improvements on
>>  par with those reported by Delphix.
>>  
>>  ZoL commits incorporated:
>>  https://github.com/zfsonlinux/zfs/commit/a10d50f999511d304f910852c7825c70c9c9e303
>>  https://github.com/zfsonlinux/zfs/commit/e63ac16d25fbe991a356489c86d4077567dfea21
>>  
>>  Reviewed by:	avg, sef
>>  Approved by:	avg, sef
>>  Obtained from:	ZoL
>>  MFC after:	1 month
>>  Relnotes:	yes
>>  Sponsored by:	Gandi.net
>>  Differential Revision:	https://reviews.freebsd.org/D19098
>>
>>Modified:
>>  head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
>>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
>>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
>>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
>>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
>>
>>Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
>>==============================================================================
>>--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	Tue Feb 26 06:22:10 2019	(r344568)
>>+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	Tue Feb 26 08:18:34 2019	(r344569)
>>@@ -5812,8 +5812,13 @@ zfs_do_holds(int argc, char **argv)
>> 
>> #define	CHECK_SPINNER 30
>> #define	SPINNER_TIME 3		/* seconds */
>>-#define	MOUNT_TIME 5		/* seconds */
>>+#define	MOUNT_TIME 1		/* seconds */
>> 
>>+typedef struct get_all_state {
>>+	boolean_t	ga_verbose;
>>+	get_all_cb_t	*ga_cbp;
>>+} get_all_state_t;
>>+
>> static int
>> get_one_dataset(zfs_handle_t *zhp, void *data)
>> {
>>@@ -5821,10 +5826,10 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
>> 	static int spinval = 0;
>> 	static int spincheck = 0;
>> 	static time_t last_spin_time = (time_t)0;
>>-	get_all_cb_t *cbp = data;
>>+	get_all_state_t *state = data;
>> 	zfs_type_t type = zfs_get_type(zhp);
>> 
>>-	if (cbp->cb_verbose) {
>>+	if (state->ga_verbose) {
>> 		if (--spincheck < 0) {
>> 			time_t now = time(NULL);
>> 			if (last_spin_time + SPINNER_TIME < now) {
>>@@ -5850,26 +5855,24 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
>> 		zfs_close(zhp);
>> 		return (0);
>> 	}
>>-	libzfs_add_handle(cbp, zhp);
>>-	assert(cbp->cb_used <= cbp->cb_alloc);
>>+	libzfs_add_handle(state->ga_cbp, zhp);
>>+	assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
>> 
>> 	return (0);
>> }
>> 
>> static void
>>-get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
>>+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
>> {
>>-	get_all_cb_t cb = { 0 };
>>-	cb.cb_verbose = verbose;
>>-	cb.cb_getone = get_one_dataset;
>>+	get_all_state_t state = {
>>+		.ga_verbose = verbose,
>>+		.ga_cbp = cbp
>>+	};
>> 
>> 	if (verbose)
>> 		set_progress_header(gettext("Reading ZFS config"));
>>-	(void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
>>+	(void) zfs_iter_root(g_zfs, get_one_dataset, &state);
>> 
>>-	*dslist = cb.cb_handles;
>>-	*count = cb.cb_used;
>>-
>> 	if (verbose)
>> 		finish_progress(gettext("done."));
>> }
>>@@ -5879,9 +5882,20 @@ get_all_datasets(zfs_handle_t ***dslist, size_t *count
>>  * similar, we have a common function with an extra parameter to determine which
>>  * mode we are using.
>>  */
>>-#define	OP_SHARE	0x1
>>-#define	OP_MOUNT	0x2
>>+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
>> 
>>+typedef struct share_mount_state {
>>+	share_mount_op_t	sm_op;
>>+	boolean_t	sm_verbose;
>>+	int	sm_flags;
>>+	char	*sm_options;
>>+	char	*sm_proto; /* only valid for OP_SHARE */
>>+	pthread_mutex_t	sm_lock; /* protects the remaining fields */
>>+	uint_t	sm_total; /* number of filesystems to process */
>>+	uint_t	sm_done; /* number of filesystems processed */
>>+	int	sm_status; /* -1 if any of the share/mount operations failed */
>>+} share_mount_state_t;
>>+
>> /*
>>  * Share or mount a dataset.
>>  */
>>@@ -6122,6 +6136,29 @@ report_mount_progress(int current, int total)
>> 		update_progress(info);
>> }
>> 
>>+/*
>>+ * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and
>>+ * updates the progress meter
>>+ */
>>+static int
>>+share_mount_one_cb(zfs_handle_t *zhp, void *arg)
>>+{
>>+	share_mount_state_t *sms = arg;
>>+	int ret;
>>+
>>+	ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto,
>>+	    B_FALSE, sms->sm_options);
>>+
>>+	pthread_mutex_lock(&sms->sm_lock);
>>+	if (ret != 0)
>>+		sms->sm_status = ret;
>>+	sms->sm_done++;
>>+	if (sms->sm_verbose)
>>+		report_mount_progress(sms->sm_done, sms->sm_total);
>>+	pthread_mutex_unlock(&sms->sm_lock);
>>+	return (ret);
>>+}
>>+
>> static void
>> append_options(char *mntopts, char *newopts)
>> {
>>@@ -6194,8 +6231,6 @@ share_mount(int op, int argc, char **argv)
>> 
>> 	/* check number of arguments */
>> 	if (do_all) {
>>-		zfs_handle_t **dslist = NULL;
>>-		size_t i, count = 0;
>> 		char *protocol = NULL;
>> 
>> 		if (op == OP_SHARE && argc > 0) {
>>@@ -6216,35 +6251,48 @@ share_mount(int op, int argc, char **argv)
>> 		}
>> 
>> 		start_progress_timer();
>>-		get_all_datasets(&dslist, &count, verbose);
>>+		get_all_cb_t cb = { 0 };
>>+		get_all_datasets(&cb, verbose);
>> 
>>-		if (count == 0)
>>+		if (cb.cb_used == 0) {
>>+			if (options != NULL)
>>+				free(options);
>> 			return (0);
>>+		}
>> 
>>-		qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
>> #ifdef illumos
>>-		sa_init_selective_arg_t sharearg;
>>-		sharearg.zhandle_arr = dslist;
>>-		sharearg.zhandle_len = count;
>>-		if ((ret = zfs_init_libshare_arg(zfs_get_handle(dslist[0]),
>>-		    SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
>>-			(void) fprintf(stderr,
>>-			    gettext("Could not initialize libshare, %d"), ret);
>>-			return (ret);
>>+		if (op == OP_SHARE) {
>>+			sa_init_selective_arg_t sharearg;
>>+			sharearg.zhandle_arr = cb.cb_handles;
>>+			sharearg.zhandle_len = cb.cb_used;
>>+			if ((ret = zfs_init_libshare_arg(g_zfs,
>>+			    SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
>>+				(void) fprintf(stderr, gettext(
>>+				    "Could not initialize libshare, %d"), ret);
>>+				return (ret);
>>+			}
>> 		}
>> #endif
>>+		share_mount_state_t share_mount_state = { 0 };
>>+		share_mount_state.sm_op = op;
>>+		share_mount_state.sm_verbose = verbose;
>>+		share_mount_state.sm_flags = flags;
>>+		share_mount_state.sm_options = options;
>>+		share_mount_state.sm_proto = protocol;
>>+		share_mount_state.sm_total = cb.cb_used;
>>+		pthread_mutex_init(&share_mount_state.sm_lock, NULL);
>> 
>>-		for (i = 0; i < count; i++) {
>>-			if (verbose)
>>-				report_mount_progress(i, count);
>>+		/*
>>+		 * libshare isn't mt-safe, so only do the operation in parallel
>>+		 * if we're mounting.
>>+		 */
>>+		zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
>>+		    share_mount_one_cb, &share_mount_state, op == OP_MOUNT);
>>+		ret = share_mount_state.sm_status;
>> 
>>-			if (share_mount_one(dslist[i], op, flags, protocol,
>>-			    B_FALSE, options) != 0)
>>-				ret = 1;
>>-			zfs_close(dslist[i]);
>>-		}
>>-
>>-		free(dslist);
>>+		for (int i = 0; i < cb.cb_used; i++)
>>+			zfs_close(cb.cb_handles[i]);
>>+		free(cb.cb_handles);
>> 	} else if (argc == 0) {
>> 		struct mnttab entry;
>> 
>>
>>Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
>>==============================================================================
>>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	Tue Feb 26 06:22:10 2019	(r344568)
>>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	Tue Feb 26 08:18:34 2019	(r344569)
>>@@ -579,12 +579,12 @@ typedef struct get_all_cb {
>> 	zfs_handle_t	**cb_handles;
>> 	size_t		cb_alloc;
>> 	size_t		cb_used;
>>-	boolean_t	cb_verbose;
>>-	int		(*cb_getone)(zfs_handle_t *, void *);
>> } get_all_cb_t;
>> 
>>+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t,
>>+    zfs_iter_f, void*, boolean_t);
>>+
>> void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
>>-int libzfs_dataset_cmp(const void *, const void *);
>> 
>> /*
>>  * Functions to create and destroy datasets.
>>
>>Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
>>==============================================================================
>>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	Tue Feb 26 06:22:10 2019	(r344568)
>>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	Tue Feb 26 08:18:34 2019	(r344569)
>>@@ -799,6 +799,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const vo
>> void
>> libzfs_mnttab_init(libzfs_handle_t *hdl)
>> {
>>+	pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL);
>> 	assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
>> 	avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
>> 	    sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
>>@@ -839,6 +840,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
>> 		free(mtn);
>> 	}
>> 	avl_destroy(&hdl->libzfs_mnttab_cache);
>>+	(void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
>> }
>> 
>> void
>>@@ -853,6 +855,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *f
>> {
>> 	mnttab_node_t find;
>> 	mnttab_node_t *mtn;
>>+	int ret = ENOENT;
>> 
>> 	if (!hdl->libzfs_mnttab_enable) {
>> 		struct mnttab srch = { 0 };
>>@@ -868,6 +871,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *f
>> 			return (ENOENT);
>> 	}
>> 
>>+	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
>> 	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
>> 		libzfs_mnttab_update(hdl);
>> 
>>@@ -875,9 +879,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char *f
>> 	mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
>> 	if (mtn) {
>> 		*entry = mtn->mtn_mt;
>>-		return (0);
>>+		ret = 0;
>> 	}
>>-	return (ENOENT);
>>+	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
>>+	return (ret);
>> }
>> 
>> void
>>@@ -886,15 +891,17 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const char *sp
>> {
>> 	mnttab_node_t *mtn;
>> 
>>-	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
>>-		return;
>>-	mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
>>-	mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
>>-	mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
>>-	mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
>>-	mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
>>-	avl_add(&hdl->libzfs_mnttab_cache, mtn);
>>-}
>>+	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
>>+	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
>>+		mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
>>+		mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
>>+		mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
>>+		mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
>>+		mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
>>+		avl_add(&hdl->libzfs_mnttab_cache, mtn);
>>+	}
>>+	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
>>+}		
>> 
>> void
>> libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
>>@@ -902,6 +909,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char
>> 	mnttab_node_t find;
>> 	mnttab_node_t *ret;
>> 
>>+	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
>> 	find.mtn_mt.mnt_special = (char *)fsname;
>> 	if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
>> 	    != NULL) {
>>@@ -912,6 +920,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const char
>> 		free(ret->mtn_mt.mnt_mntopts);
>> 		free(ret);
>> 	}
>>+	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
>> }
>> 
>> int
>>
>>Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
>>==============================================================================
>>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h	Tue Feb 26 06:22:10 2019	(r344568)
>>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h	Tue Feb 26 08:18:34 2019	(r344569)
>>@@ -22,7 +22,7 @@
>> /*
>>  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
>>  * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
>>- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
>>+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
>>  * Copyright (c) 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
>>  */
>> 
>>@@ -73,6 +73,13 @@ struct libzfs_handle {
>> 	int libzfs_storeerr; /* stuff error messages into buffer */
>> 	void *libzfs_sharehdl; /* libshare handle */
>> 	boolean_t libzfs_mnttab_enable;
>>+	/*
>>+	 * We need a lock to handle the case where parallel mount
>>+	 * threads are populating the mnttab cache simultaneously. The
>>+	 * lock only protects the integrity of the avl tree, and does
>>+	 * not protect the contents of the mnttab entries themselves.
>>+	 */
>>+	pthread_mutex_t libzfs_mnttab_cache_lock;
>> 	avl_tree_t libzfs_mnttab_cache;
>> 	int libzfs_pool_iter;
>> 	libzfs_fru_t **libzfs_fru_hash;
>>
>>Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
>>==============================================================================
>>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c	Tue Feb 26 06:22:10 2019	(r344568)
>>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c	Tue Feb 26 08:18:34 2019	(r344569)
>>@@ -26,6 +26,7 @@
>>  * Copyright 2016 Igor Kozhukhov <ikozhukhov at gmail.com>
>>  * Copyright 2017 Joyent, Inc.
>>  * Copyright 2017 RackTop Systems.
>>+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
>>  */
>> 
>> /*
>>@@ -34,25 +35,25 @@
>>  * they are used by mount and unmount and when changing a filesystem's
>>  * mountpoint.
>>  *
>>- * 	zfs_is_mounted()
>>- * 	zfs_mount()
>>- * 	zfs_unmount()
>>- * 	zfs_unmountall()
>>+ *	zfs_is_mounted()
>>+ *	zfs_mount()
>>+ *	zfs_unmount()
>>+ *	zfs_unmountall()
>>  *
>>  * This file also contains the functions used to manage sharing filesystems via
>>  * NFS and iSCSI:
>>  *
>>- * 	zfs_is_shared()
>>- * 	zfs_share()
>>- * 	zfs_unshare()
>>+ *	zfs_is_shared()
>>+ *	zfs_share()
>>+ *	zfs_unshare()
>>  *
>>- * 	zfs_is_shared_nfs()
>>- * 	zfs_is_shared_smb()
>>- * 	zfs_share_proto()
>>- * 	zfs_shareall();
>>- * 	zfs_unshare_nfs()
>>- * 	zfs_unshare_smb()
>>- * 	zfs_unshareall_nfs()
>>+ *	zfs_is_shared_nfs()
>>+ *	zfs_is_shared_smb()
>>+ *	zfs_share_proto()
>>+ *	zfs_shareall();
>>+ *	zfs_unshare_nfs()
>>+ *	zfs_unshare_smb()
>>+ *	zfs_unshareall_nfs()
>>  *	zfs_unshareall_smb()
>>  *	zfs_unshareall()
>>  *	zfs_unshareall_bypath()
>>@@ -60,8 +61,8 @@
>>  * The following functions are available for pool consumers, and will
>>  * mount/unmount and share/unshare all datasets within pool:
>>  *
>>- * 	zpool_enable_datasets()
>>- * 	zpool_disable_datasets()
>>+ *	zpool_enable_datasets()
>>+ *	zpool_disable_datasets()
>>  */
>> 
>> #include <dirent.h>
>>@@ -83,10 +84,14 @@
>> #include <libzfs.h>
>> 
>> #include "libzfs_impl.h"
>>+#include <thread_pool.h>
>> 
>> #include <libshare.h>
>> #define	MAXISALEN	257	/* based on sysinfo(2) man page */
>> 
>>+static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */
>>+
>>+static void zfs_mount_task(void *);
>> static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
>> zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
>>     zfs_share_proto_t);
>>@@ -1134,25 +1139,32 @@ remove_mountpoint(zfs_handle_t *zhp)
>> 	}
>> }
>> 
>>+/*
>>+ * Add the given zfs handle to the cb_handles array, dynamically reallocating
>>+ * the array if it is out of space
>>+ */
>> void
>> libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
>> {
>> 	if (cbp->cb_alloc == cbp->cb_used) {
>> 		size_t newsz;
>>-		void *ptr;
>>+		zfs_handle_t **newhandles;
>> 
>>-		newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
>>-		ptr = zfs_realloc(zhp->zfs_hdl,
>>-		    cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
>>-		    newsz * sizeof (void *));
>>-		cbp->cb_handles = ptr;
>>+		newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
>>+		newhandles = zfs_realloc(zhp->zfs_hdl,
>>+		    cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
>>+		    newsz * sizeof (zfs_handle_t *));
>>+		cbp->cb_handles = newhandles;
>> 		cbp->cb_alloc = newsz;
>> 	}
>> 	cbp->cb_handles[cbp->cb_used++] = zhp;
>> }
>> 
>>+/*
>>+ * Recursive helper function used during file system enumeration
>>+ */
>> static int
>>-mount_cb(zfs_handle_t *zhp, void *data)
>>+zfs_iter_cb(zfs_handle_t *zhp, void *data)
>> {
>> 	get_all_cb_t *cbp = data;
>> 
>>@@ -1178,104 +1190,362 @@ mount_cb(zfs_handle_t *zhp, void *data)
>> 	}
>> 
>> 	libzfs_add_handle(cbp, zhp);
>>-	if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
>>+	if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
>> 		zfs_close(zhp);
>> 		return (-1);
>> 	}
>> 	return (0);
>> }
>> 
>>-int
>>-libzfs_dataset_cmp(const void *a, const void *b)
>>+/*
>>+ * Sort comparator that compares two mountpoint paths. We sort these paths so
>>+ * that subdirectories immediately follow their parents. This means that we
>>+ * effectively treat the '/' character as the lowest value non-nul char.
>>+ * Since filesystems from non-global zones can have the same mountpoint
>>+ * as other filesystems, the comparator sorts global zone filesystems to
>>+ * the top of the list. This means that the global zone will traverse the
>>+ * filesystem list in the correct order and can stop when it sees the
>>+ * first zoned filesystem. In a non-global zone, only the delegated
>>+ * filesystems are seen.
>>+ *
>>+ * An example sorted list using this comparator would look like:
>>+ *
>>+ * /foo
>>+ * /foo/bar
>>+ * /foo/bar/baz
>>+ * /foo/baz
>>+ * /foo.bar
>>+ * /foo (NGZ1)
>>+ * /foo (NGZ2)
>>+ *
>>+ * The mount code depends on this ordering to deterministically iterate
>>+ * over filesystems in order to spawn parallel mount tasks.
>>+ */
>>+static int
>>+mountpoint_cmp(const void *arga, const void *argb)
>> {
>>-	zfs_handle_t **za = (zfs_handle_t **)a;
>>-	zfs_handle_t **zb = (zfs_handle_t **)b;
>>+	zfs_handle_t *const *zap = arga;
>>+	zfs_handle_t *za = *zap;
>>+	zfs_handle_t *const *zbp = argb;
>>+	zfs_handle_t *zb = *zbp;
>> 	char mounta[MAXPATHLEN];
>> 	char mountb[MAXPATHLEN];
>>+	const char *a = mounta;
>>+	const char *b = mountb;
>> 	boolean_t gota, gotb;
>>+	uint64_t zoneda, zonedb;
>> 
>>-	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
>>-		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
>>+	zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED);
>>+	zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED);
>>+	if (zoneda && !zonedb)
>>+		return (1);
>>+	if (!zoneda && zonedb)
>>+		return (-1);
>>+	gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
>>+	if (gota)
>>+		verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
>> 		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
>>-	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
>>-		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
>>+	gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
>>+	if (gotb)
>>+		verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
>> 		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
>> 
>>-	if (gota && gotb)
>>-		return (strcmp(mounta, mountb));
>>+	if (gota && gotb) {
>>+		while (*a != '\0' && (*a == *b)) {
>>+			a++;
>>+			b++;
>>+		}
>>+		if (*a == *b)
>>+			return (0);
>>+		if (*a == '\0')
>>+			return (-1);
>>+		if (*b == '\0')
>>+			return (-1);
>>+		if (*a == '/')
>>+			return (-1);
>>+		if (*b == '/')
>>+			return (-1);
>>+		return (*a < *b ? -1 : *a > *b);
>>+	}
>> 
>> 	if (gota)
>> 		return (-1);
>> 	if (gotb)
>> 		return (1);
>> 
>>-	return (strcmp(zfs_get_name(a), zfs_get_name(b)));
>>+	/*
>>+	 * If neither filesystem has a mountpoint, revert to sorting by
>>+	 * dataset name.
>>+	 */
>>+	return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
>> }
>> 
>> /*
>>+ * Return true if path2 is a child of path1
>>+ */
>>+static boolean_t
>>+libzfs_path_contains(const char *path1, const char *path2)
>>+{
>>+	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
>>+}
>>+
>>+
>>+static int
>>+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
>>+{
>>+	char parent[ZFS_MAXPROPLEN];
>>+	char child[ZFS_MAXPROPLEN];
>>+	int i;
>>+
>>+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
>>+	    sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
>>+
>>+	for (i = idx + 1; i < num_handles; i++) {
>>+		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
>>+		    sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
>>+		if (!libzfs_path_contains(parent, child))
>>+			break;
>>+	}
>>+	return (i);
>>+}
>>+
>>+typedef struct mnt_param {
>>+	libzfs_handle_t	*mnt_hdl;
>>+	tpool_t		*mnt_tp;
>>+	zfs_handle_t	**mnt_zhps; /* filesystems to mount */
>>+	size_t		mnt_num_handles;
>>+	int		mnt_idx;	/* Index of selected entry to mount */
>>+	zfs_iter_f	mnt_func;
>>+	void		*mnt_data;
>>+} mnt_param_t;
>>+
>>+/*
>>+ * Allocate and populate the parameter struct for mount function, and
>>+ * schedule mounting of the entry selected by idx.
>>+ */
>>+static void
>>+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
>>+    size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp)
>>+{
>>+	mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
>>+
>>+	mnt_param->mnt_hdl = hdl;
>>+	mnt_param->mnt_tp = tp;
>>+	mnt_param->mnt_zhps = handles;
>>+	mnt_param->mnt_num_handles = num_handles;
>>+	mnt_param->mnt_idx = idx;
>>+	mnt_param->mnt_func = func;
>>+	mnt_param->mnt_data = data;
>>+
>>+	(void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param);
>>+}
>>+
>>+/*
>>+ * This is the structure used to keep state of mounting or sharing operations
>>+ * during a call to zpool_enable_datasets().
>>+ */
>>+typedef struct mount_state {
>>+	/*
>>+	 * ms_mntstatus is set to -1 if any mount fails. While multiple threads
>>+	 * could update this variable concurrently, no synchronization is
>>+	 * needed as it's only ever set to -1.
>>+	 */
>>+	int		ms_mntstatus;
>>+	int		ms_mntflags;
>>+	const char	*ms_mntopts;
>>+} mount_state_t;
>>+
>>+static int
>>+zfs_mount_one(zfs_handle_t *zhp, void *arg)
>>+{
>>+	mount_state_t *ms = arg;
>>+	int ret = 0;
>>+
>>+	if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
>>+		ret = ms->ms_mntstatus = -1;
>>+	return (ret);
>>+}
>>+
>>+static int
>>+zfs_share_one(zfs_handle_t *zhp, void *arg)
>>+{
>>+	mount_state_t *ms = arg;
>>+	int ret = 0;
>>+
>>+	if (zfs_share(zhp) != 0)
>>+		ret = ms->ms_mntstatus = -1;
>>+	return (ret);
>>+}
>>+
>>+/*
>>+ * Thread pool function to mount one file system. On completion, it finds and
>>+ * schedules its children to be mounted. This depends on the sorting done in
>>+ * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
>>+ * each descending from the previous) will have no parallelism since we always
>>+ * have to wait for the parent to finish mounting before we can schedule
>>+ * its children.
>>+ */
>>+static void
>>+zfs_mount_task(void *arg)
>>+{
>>+	mnt_param_t *mp = arg;
>>+	int idx = mp->mnt_idx;
>>+	zfs_handle_t **handles = mp->mnt_zhps;
>>+	size_t num_handles = mp->mnt_num_handles;
>>+	char mountpoint[ZFS_MAXPROPLEN];
>>+
>>+	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
>>+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
>>+
>>+	if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
>>+		return;
>>+
>>+	/*
>>+	 * We dispatch tasks to mount filesystems with mountpoints underneath
>>+	 * this one. We do this by dispatching the next filesystem with a
>>+	 * descendant mountpoint of the one we just mounted, then skip all of
>>+	 * its descendants, dispatch the next descendant mountpoint, and so on.
>>+	 * The non_descendant_idx() function skips over filesystems that are
>>+	 * descendants of the filesystem we just dispatched.
>>+	 */
>>+	for (int i = idx + 1; i < num_handles;
>>+	    i = non_descendant_idx(handles, num_handles, i)) {
>>+		char child[ZFS_MAXPROPLEN];
>>+		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
>>+		    child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
>>+
>>+		if (!libzfs_path_contains(mountpoint, child))
>>+			break; /* not a descendant, return */
>>+		zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
>>+		    mp->mnt_func, mp->mnt_data, mp->mnt_tp);
>>+	}
>>+	free(mp);
>>+}
>>+
>>+/*
>>+ * Issue the func callback for each ZFS handle contained in the handles
>>+ * array. This function is used to mount all datasets, and so this function
>>+ * guarantees that filesystems for parent mountpoints are called before their
>>+ * children. As such, before issuing any callbacks, we first sort the array
>>+ * of handles by mountpoint.
>>+ *
>>+ * Callbacks are issued in one of two ways:
>>+ *
>>+ * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT
>>+ *    environment variable is set, then we issue callbacks sequentially.
>>+ *
>>+ * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT
>>+ *    environment variable is not set, then we use a tpool to dispatch threads
>>+ *    to mount filesystems in parallel. This function dispatches tasks to mount
>>+ *    the filesystems at the top-level mountpoints, and these tasks in turn
>>+ *    are responsible for recursively mounting filesystems in their children
>>+ *    mountpoints.
>>+ */
>>+void
>>+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
>>+    size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
>>+{
>>+	zoneid_t zoneid = getzoneid();
>>+
>>+	/*
>>+	 * The ZFS_SERIAL_MOUNT environment variable is an undocumented
>>+	 * variable that can be used as a convenience to do a/b comparison
>>+	 * of serial vs. parallel mounting.
>>+	 */
>>+	boolean_t serial_mount = !parallel ||
>>+	    (getenv("ZFS_SERIAL_MOUNT") != NULL);
>>+
>>+	/*
>>+	 * Sort the datasets by mountpoint. See mountpoint_cmp for details
>>+	 * of how these are sorted.
>>+	 */
>>+	qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
>>+
>>+	if (serial_mount) {
>>+		for (int i = 0; i < num_handles; i++) {
>>+			func(handles[i], data);
>>+		}
>>+		return;
>>+	}
>>+
>>+	/*
>>+	 * Issue the callback function for each dataset using a parallel
>>+	 * algorithm that uses a thread pool to manage threads.
>>+	 */
>>+	tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL);
>>+
>>+	/*
>>+	 * There may be multiple "top level" mountpoints outside of the pool's
>>+	 * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of
>>+	 * these.
>>+	 */
>>+	for (int i = 0; i < num_handles;
>>+	    i = non_descendant_idx(handles, num_handles, i)) {
>>+		/*
>>+		 * Since the mountpoints have been sorted so that the zoned
>>+		 * filesystems are at the end, a zoned filesystem seen from
>>+		 * the global zone means that we're done.
>>+		 */
>>+		if (zoneid == GLOBAL_ZONEID &&
>>+		    zfs_prop_get_int(handles[i], ZFS_PROP_ZONED))
>>+			break;
>>+		zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
>>+		    tp);
>>+	}
>>+
>>+	tpool_wait(tp);	/* wait for all scheduled mounts to complete */
>>+	tpool_destroy(tp);
>>+}
>>+
>>+/*
>>  * Mount and share all datasets within the given pool.  This assumes that no
>>- * datasets within the pool are currently mounted.  Because users can create
>>- * complicated nested hierarchies of mountpoints, we first gather all the
>>- * datasets and mountpoints within the pool, and sort them by mountpoint.  Once
>>- * we have the list of all filesystems, we iterate over them in order and mount
>>- * and/or share each one.
>>+ * datasets within the pool are currently mounted.
>>  */
>> #pragma weak zpool_mount_datasets = zpool_enable_datasets
>> int
>> zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
>> {
>> 	get_all_cb_t cb = { 0 };
>>-	libzfs_handle_t *hdl = zhp->zpool_hdl;
>>+	mount_state_t ms = { 0 };
>> 	zfs_handle_t *zfsp;
>>-	int i, ret = -1;
>>-	int *good;
>>+	int ret = 0;
>> 
>>-	/*
>>-	 * Gather all non-snap datasets within the pool.
>>-	 */
>>-	if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
>>+	if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
>>+	    ZFS_TYPE_DATASET)) == NULL)
>> 		goto out;
>> 
>>-	libzfs_add_handle(&cb, zfsp);
>>-	if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
>>-		goto out;
>> 	/*
>>-	 * Sort the datasets by mountpoint.
>>+	 * Gather all non-snapshot datasets within the pool. Start by adding
>>+	 * the root filesystem for this pool to the list, and then iterate
>>+	 * over all child filesystems.
>> 	 */
>>-	qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
>>-	    libzfs_dataset_cmp);
>>+	libzfs_add_handle(&cb, zfsp);
>>+	if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
>>+		goto out;
>> 
>> 	/*
>>-	 * And mount all the datasets, keeping track of which ones
>>-	 * succeeded or failed.
>>+	 * Mount all filesystems
>> 	 */
>>-	if ((good = zfs_alloc(zhp->zpool_hdl,
>>-	    cb.cb_used * sizeof (int))) == NULL)
>>-		goto out;
>>+	ms.ms_mntopts = mntopts;
>>+	ms.ms_mntflags = flags;
>>+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
>>+	    zfs_mount_one, &ms, B_TRUE);
>>+	if (ms.ms_mntstatus != 0)
>>+		ret = ms.ms_mntstatus;
>> 
>>-	ret = 0;
>>-	for (i = 0; i < cb.cb_used; i++) {
>>-		if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
>>-			ret = -1;
>>-		else
>>-			good[i] = 1;
>>-	}
>>-
>> 	/*
>>-	 * Then share all the ones that need to be shared. This needs
>>-	 * to be a separate pass in order to avoid excessive reloading
>>-	 * of the configuration. Good should never be NULL since
>>-	 * zfs_alloc is supposed to exit if memory isn't available.
>>+	 * Share all filesystems that need to be shared. This needs to be
>>+	 * a separate pass because libshare is not mt-safe, and so we need
>>+	 * to share serially.
>> 	 */
>>-	for (i = 0; i < cb.cb_used; i++) {
>>-		if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
>>-			ret = -1;
>>-	}
>>+	ms.ms_mntstatus = 0;
>>+	zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
>>+	    zfs_share_one, &ms, B_FALSE);
>>+	if (ms.ms_mntstatus != 0)
>>+		ret = ms.ms_mntstatus;
>> 
>>-	free(good);
>>-
>> out:
>>-	for (i = 0; i < cb.cb_used; i++)
>>+	for (int i = 0; i < cb.cb_used; i++)
>> 		zfs_close(cb.cb_handles[i]);
>> 	free(cb.cb_handles);
>> 
>
>This broke my systems: many filesystems fail to mount, which in turn
>causes late nullfs mounts to fail. No details until tonight.
>
>Suggest we back this out until it is properly tested.

Nested ZFS filesystems seem not to be handled properly, or are possibly no longer supported. This explains why my mail gateway also failed to mount all of the filesystems under /home, and why dovecot stopped working, which had seemed odd at the time.

The symptom is that zfs mount -a no longer mounts all filesystems; zfs mount then fails, saying the filesystem is already mounted. The workaround is to zfs umount each affected dataset by hand and then zfs mount it again.
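For what it's worth, the quoted diff shows that the new zfs_foreach_mountpoint() honours an undocumented ZFS_SERIAL_MOUNT environment variable. If I'm reading it correctly, merely having it set forces the old serial behaviour, which would at least tell us whether the parallel path is what broke things:

    # Diagnostic only, based on my reading of the diff above: the mere
    # presence of ZFS_SERIAL_MOUNT in the environment makes
    # zfs_foreach_mountpoint() fall back to serial mounting.
    env ZFS_SERIAL_MOUNT=1 zfs mount -a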

In general this has broken sites with hundreds of ZFS datasets (122 in my case). A workaround might be to script a check of each mount, unmounting and remounting where necessary; a rough sketch follows.
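Something like the following is what I have in mind. It is an untested sketch: the zfs properties used are standard, but the remount logic is only my guess at what is safe to run on an affected box.

    #!/bin/sh
    # Untested workaround sketch: find filesystem datasets that should be
    # mounted (canmount=on and a real mountpoint) but report mounted=no,
    # then clear any stale state and mount them again by hand.
    zfs list -H -t filesystem -o name,canmount,mounted,mountpoint |
    while read -r name canmount mounted mntpt; do
            [ "$canmount" = "on" ] || continue
            [ "$mounted" = "no" ] || continue
            case "$mntpt" in /*) ;; *) continue ;; esac
            echo "remounting $name on $mntpt"
            zfs umount "$name" 2>/dev/null  # ignore errors if nothing is mounted
            zfs mount "$name" || echo "mount of $name still fails" >&2
    done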

To be clear, suggesting an rc script to clean this up is sarcasm. This change needs to be backed out and properly tested before being committed again.




-- 
Pardon the typos and autocorrect, small keyboard in use.
Cheers,
Cy Schubert <Cy.Schubert at cschubert.com>
FreeBSD UNIX: <cy at FreeBSD.org> Web: http://www.FreeBSD.org

	The need of the many outweighs the greed of the few.
