kern/170914: [zfs] [patch] Import patchs related with issues 3090 and 3102.

Marcelo Araujo araujo at FreeBSD.org
Thu Aug 23 08:40:03 UTC 2012


>Number:         170914
>Category:       kern
>Synopsis:       [zfs] [patch] Import patchs related with issues 3090 and 3102.
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          update
>Submitter-Id:   current-users
>Arrival-Date:   Thu Aug 23 08:40:03 UTC 2012
>Closed-Date:
>Last-Modified:
>Originator:     Marcelo Araujo
>Release:        9-1-BETA1
>Organization:
FreeBSD
>Environment:
FreeBSD QnapAraujo 9.1-BETA1 FreeBSD 9.1-BETA1 #15: Wed Jul 11 08:36:49 PDT 2012 root at build9x64.pcbsd.org:/usr/obj/builds/amd64/pcbsd-build90/fbsd-source/9.0/sys/GENERIC amd64
>Description:
Import from Illumos-gate the patch to solve the following problems:
3090 vdev_reopen() during reguid causes vdev to be treated as corrupt
3102 vdev_uberblock_load() and vdev_validate() may read the wrong label

https://www.illumos.org/issues/3090
https://www.illumos.org/issues/3102

Commit: b1e53580146d
>How-To-Repeat:

>Fix:


Patch attached with submission follows:

Index: cddl/contrib/opensolaris/cmd/ztest/ztest.c
===================================================================
--- cddl/contrib/opensolaris/cmd/ztest/ztest.c	(revision 239609)
+++ cddl/contrib/opensolaris/cmd/ztest/ztest.c	(working copy)
@@ -364,7 +364,7 @@
 	{ ztest_spa_rename,			1,	&zopt_rarely	},
 	{ ztest_scrub,				1,	&zopt_rarely	},
 	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
-	{ ztest_vdev_attach_detach,		1,	&zopt_rarely },
+	{ ztest_vdev_attach_detach,		1,	&zopt_rarely 	},
 	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
 	{ ztest_vdev_add_remove,		1,
 	    &ztest_opts.zo_vdevtime				},
@@ -415,6 +415,13 @@
 static ztest_ds_t *ztest_ds;
 
 static mutex_t ztest_vdev_lock;
+
+/*
+ * The ztest_name_lock protects the pool and dataset namespace used by
+ * the individual tests. To modify the namespace, consumers must grab
+ * this lock as writer. Grabbing the lock as reader will ensure that the
+ * namespace does not change while the lock is held.
+ */
 static rwlock_t ztest_name_lock;
 
 static boolean_t ztest_dump_core = B_TRUE;
@@ -4860,10 +4867,16 @@
 {
 	spa_t *spa = ztest_spa;
 	uint64_t orig, load;
+	int error;
 
 	orig = spa_guid(spa);
 	load = spa_load_guid(spa);
-	if (spa_change_guid(spa) != 0)
+
+	(void) rw_wrlock(&ztest_name_lock);
+	error = spa_change_guid(spa);
+	(void) rw_unlock(&ztest_name_lock);
+
+	if (error != 0)
 		return;
 
 	if (ztest_opts.zo_verbose >= 3) {
@@ -5542,6 +5555,12 @@
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	VERIFY3U(0, ==, ztest_dataset_open(0));
 	ztest_dataset_close(0);
+
+	spa->spa_debug = B_TRUE;
+	ztest_spa = spa;
+	txg_wait_synced(spa_get_dsl(spa), 0);
+	ztest_reguid(NULL, 0);
+
 	spa_close(spa, FTAG);
 	kernel_fini();
 }
Index: cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c
===================================================================
--- cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c	(revision 239609)
+++ cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c	(working copy)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -437,8 +437,8 @@
 	uint_t i, nspares, nl2cache;
 	boolean_t config_seen;
 	uint64_t best_txg;
-	char *name, *hostname, *comment;
-	uint64_t version, guid;
+	char *name, *hostname;
+	uint64_t guid;
 	uint_t children = 0;
 	nvlist_t **child = NULL;
 	uint_t holes;
@@ -531,54 +531,46 @@
 				 *	hostid (if available)
 				 *	hostname (if available)
 				 */
-				uint64_t state;
+				uint64_t state, version, pool_txg;
+				char *comment = NULL;
 
-				verify(nvlist_lookup_uint64(tmp,
-				    ZPOOL_CONFIG_VERSION, &version) == 0);
-				if (nvlist_add_uint64(config,
-				    ZPOOL_CONFIG_VERSION, version) != 0)
-					goto nomem;
-				verify(nvlist_lookup_uint64(tmp,
-				    ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
-				if (nvlist_add_uint64(config,
-				    ZPOOL_CONFIG_POOL_GUID, guid) != 0)
-					goto nomem;
-				verify(nvlist_lookup_string(tmp,
-				    ZPOOL_CONFIG_POOL_NAME, &name) == 0);
-				if (nvlist_add_string(config,
-				    ZPOOL_CONFIG_POOL_NAME, name) != 0)
-					goto nomem;
+				version = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_VERSION);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_VERSION, version);
+				guid = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_GUID);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_GUID, guid);
+				name = fnvlist_lookup_string(tmp,
+				    ZPOOL_CONFIG_POOL_NAME);
+				fnvlist_add_string(config,
+				    ZPOOL_CONFIG_POOL_NAME, name);
 
-				/*
-				 * COMMENT is optional, don't bail if it's not
-				 * there, instead, set it to NULL.
-				 */
+				if (nvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_TXG, &pool_txg) == 0)
+					fnvlist_add_uint64(config,
+					    ZPOOL_CONFIG_POOL_TXG, pool_txg);
+
 				if (nvlist_lookup_string(tmp,
-				    ZPOOL_CONFIG_COMMENT, &comment) != 0)
-					comment = NULL;
-				else if (nvlist_add_string(config,
-				    ZPOOL_CONFIG_COMMENT, comment) != 0)
-					goto nomem;
+				   ZPOOL_CONFIG_COMMENT, &comment) == 0)
+					fnvlist_add_string(config,
+						ZPOOL_CONFIG_COMMENT, comment);
 
-				verify(nvlist_lookup_uint64(tmp,
-				    ZPOOL_CONFIG_POOL_STATE, &state) == 0);
-				if (nvlist_add_uint64(config,
-				    ZPOOL_CONFIG_POOL_STATE, state) != 0)
-					goto nomem;
+				state = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_STATE);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, state);
 
 				hostid = 0;
 				if (nvlist_lookup_uint64(tmp,
 				    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
-					if (nvlist_add_uint64(config,
-					    ZPOOL_CONFIG_HOSTID, hostid) != 0)
-						goto nomem;
-					verify(nvlist_lookup_string(tmp,
-					    ZPOOL_CONFIG_HOSTNAME,
-					    &hostname) == 0);
-					if (nvlist_add_string(config,
-					    ZPOOL_CONFIG_HOSTNAME,
-					    hostname) != 0)
-						goto nomem;
+					fnvlist_add_uint64(config,
+					    ZPOOL_CONFIG_HOSTID, hostid);
+					hostname = fnvlist_lookup_string(tmp,
+					    ZPOOL_CONFIG_HOSTNAME);
+					fnvlist_add_string(config,
+					    ZPOOL_CONFIG_HOSTNAME, hostname);
 				}
 
 				config_seen = B_TRUE;
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	(revision 239609)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	(working copy)
@@ -1329,8 +1329,9 @@
 		uint64_t aux_guid = 0;
 		nvlist_t *nvl;
 
-		if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) ==
-		    NULL) {
+		uint64_t txg = strict ? spa->spa_config_txg : -1ULL;
+
+		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (0);
@@ -1512,7 +1513,7 @@
 		    !l2arc_vdev_present(vd))
 			l2arc_add_vdev(spa, vd);
 	} else {
-		(void) vdev_validate(vd, B_TRUE);
+		(void) vdev_validate(vd, spa_last_synced_txg(spa));
 	}
 
 	/*
@@ -1971,7 +1972,7 @@
 	if (!vdev_readable(vd))
 		return (0);
 
-	if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL) {
+	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	(revision 239609)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	(working copy)
@@ -142,7 +142,7 @@
 struct uberblock;
 extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psise, uint64_t offset);
-extern nvlist_t *vdev_label_read_config(vdev_t *vd, int label);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
 extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
 
 typedef enum {
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	(revision 239609)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	(working copy)
@@ -141,6 +141,7 @@
 	vdev_t		*spa_root_vdev;		/* top-level vdev container */
 	uint64_t	spa_config_guid;	/* config pool guid */
 	uint64_t	spa_load_guid;		/* spa_load initialized guid */
+	uint64_t	spa_last_synced_guid;	/* last synced guid */
 	list_t		spa_config_dirty_list;	/* vdevs with dirty config */
 	list_t		spa_state_dirty_list;	/* vdevs with dirty state */
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 239609)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(working copy)
@@ -120,6 +120,8 @@
 
 static dsl_syncfunc_t spa_sync_version;
 static dsl_syncfunc_t spa_sync_props;
+static dsl_checkfunc_t spa_change_guid_check;
+static dsl_syncfunc_t spa_change_guid_sync;
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
@@ -683,6 +685,47 @@
 	}
 }
 
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	spa_t *spa = arg1;
+	uint64_t *newguid = arg2;
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t vdev_state;
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	vdev_state = rvd->vdev_state;
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	if (vdev_state != VDEV_STATE_HEALTHY)
+		return (ENXIO);
+
+	ASSERT3U(spa_guid(spa), !=, *newguid);
+
+	return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg1, void *arg2, dmu_tx_t *tx)
+{
+	spa_t *spa = arg1;
+	uint64_t *newguid = arg2;
+	uint64_t oldguid;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	oldguid = spa_guid(spa);
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	rvd->vdev_guid = *newguid;
+	rvd->vdev_guid_sum += (*newguid - oldguid);
+	vdev_config_dirty(rvd);
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	spa_history_log_internal(LOG_INTERNAL, spa, tx, "old=%lld new=%lld",
+	    oldguid, *newguid);
+}
+
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
@@ -695,29 +738,23 @@
 int
 spa_change_guid(spa_t *spa)
 {
-	uint64_t	oldguid, newguid;
-	uint64_t	txg;
+	int error;
+	uint64_t guid;
 
-	if (!(spa_mode_global & FWRITE))
-		return (EROFS);
+	mutex_enter(&spa_namespace_lock);
+	guid = spa_generate_guid(NULL);
 
-	txg = spa_vdev_enter(spa);
+	error = dsl_sync_task_do(spa_get_dsl(spa), spa_change_guid_check,
+	    spa_change_guid_sync, spa, &guid, 5);
 
-	if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
-		return (spa_vdev_exit(spa, NULL, txg, ENXIO));
+	if (error == 0) {
+		spa_config_sync(spa, B_FALSE, B_TRUE);
+		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
+	}
 
-	oldguid = spa_guid(spa);
-	newguid = spa_generate_guid(NULL);
-	ASSERT3U(oldguid, !=, newguid);
+	mutex_exit(&spa_namespace_lock);
 
-	spa->spa_root_vdev->vdev_guid = newguid;
-	spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
-
-	vdev_config_dirty(spa->spa_root_vdev);
-
-	spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
-
-	return (spa_vdev_exit(spa, NULL, txg, 0));
+	return (error);
 }
 
 /*
@@ -2471,7 +2508,7 @@
 
 		/*
 		 * Now that we've validated the config, check the state of the
-		 * root vdev.  If it can't be opened, it indicates one or
+		 * root vdev. If it can't be opened, it indicates one or
 		 * more toplevel vdevs are faulted.
 		 */
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
@@ -6107,6 +6144,9 @@
 				    rvd->vdev_children, txg, B_TRUE);
 		}
 
+		if (error == 0)
+			spa->spa_last_synced_guid = rvd->vdev_guid;
+
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	(revision 239609)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	(working copy)
@@ -1352,16 +1352,29 @@
 uint64_t
 spa_guid(spa_t *spa)
 {
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	uint64_t guid;
+
 	/*
 	 * If we fail to parse the config during spa_load(), we can go through
 	 * the error path (which posts an ereport) and end up here with no root
 	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
 	 * this case.
 	 */
-	if (spa->spa_root_vdev != NULL)
+	if (spa->spa_root_vdev == NULL)
+		return (spa->spa_config_guid);
+
+	guid = spa->spa_last_synced_guid != 0 ?
+		spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+	/*
+	 * Return the most recently synced out guid unless we're
+	 * in syncing context.
+	 */
+	if (dp && dsl_pool_sync_context(dp))
 		return (spa->spa_root_vdev->vdev_guid);
 	else
-		return (spa->spa_config_guid);
+		return (guid);
 }
 
 uint64_t
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	(revision 239609)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	(working copy)
@@ -433,17 +433,22 @@
 }
 
 /*
- * Returns the configuration from the label of the given vdev. If 'label' is
- * VDEV_BEST_LABEL, each label of the vdev will be read until a valid
- * configuration is found; otherwise, only the specified label will be read.
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
  */
 nvlist_t *
-vdev_label_read_config(vdev_t *vd, int label)
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *config = NULL;
 	vdev_phys_t *vp;
 	zio_t *zio;
+	uint64_t best_txg = 0;
+	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE;
 
@@ -456,8 +461,7 @@
 
 retry:
 	for (int l = 0; l < VDEV_LABELS; l++) {
-		if (label >= 0 && label < VDEV_LABELS && label != l)
-			continue;
+		nvlist_t *label = NULL;
 
 		zio = zio_root(spa, NULL, NULL, flags);
 
@@ -467,12 +471,30 @@
 
 		if (zio_wait(zio) == 0 &&
 		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
-		    &config, 0) == 0)
-			break;
+		    &label, 0) == 0) {
+			uint64_t label_txg = 0;
+			/*
+			 * Auxiliary vdevs won't have txg values in their
+			 * labels and newly added vdevs may not have been
+			 * completely initialized so just return the
+			 * configuration from the first valid label we
+			 * encounter.
+			 */
+			error = nvlist_lookup_uint64(label,
+			    ZPOOL_CONFIG_POOL_TXG, &label_txg);
+			if ((error || label_txg == 0) && !config) {
+				config = label;
+				break;
+			} else if (label_txg <= txg && label_txg > best_txg) {
+				best_txg = label_txg;
+				nvlist_free(config);
+				config = fnvlist_dup(label);
+			}
+		}
 
-		if (config != NULL) {
-			nvlist_free(config);
-			config = NULL;
+		if (label != NULL) {
+			nvlist_free(label);
+			label = NULL;
 		}
 	}
 
@@ -507,7 +529,7 @@
 	/*
 	 * Read the label, if any, and perform some basic sanity checks.
 	 */
-	if ((label = vdev_label_read_config(vd, VDEV_BEST_LABEL)) == NULL)
+	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
 		return (B_FALSE);
 
 	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
@@ -867,7 +889,6 @@
 struct ubl_cbdata {
 	uberblock_t	*ubl_ubbest;	/* Best uberblock */
 	vdev_t		*ubl_vd;	/* vdev associated with the above */
-	int		ubl_label;	/* Label associated with the above */
 };
 
 static void
@@ -886,15 +907,13 @@
 		if (ub->ub_txg <= spa->spa_load_max_txg &&
 		    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
 			/*
-			 * Keep track of the vdev and label in which this
-			 * uberblock was found. We will use this information
-			 * later to obtain the config nvlist associated with
+			 * Keep track of the vdev in which this uberblock
+			 * was found. We will use this information later
+			 * to obtain the config nvlist associated with
 			 * this uberblock.
 			 */
 			*cbp->ubl_ubbest = *ub;
 			cbp->ubl_vd = vd;
-			cbp->ubl_label = vdev_label_number(vd->vdev_psize,
-			    zio->io_offset);
 		}
 		mutex_exit(&rio->io_lock);
 	}
@@ -926,12 +945,11 @@
  * Reads the 'best' uberblock from disk along with its associated
  * configuration. First, we read the uberblock array of each label of each
  * vdev, keeping track of the uberblock with the highest txg in each array.
- * Then, we read the configuration from the same label as the best uberblock.
+ * Then, we read the configuration from the same vdev as the best uberblock.
  */
 void
 vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
 {
-	int i;
 	zio_t *zio;
 	spa_t *spa = rvd->vdev_spa;
 	struct ubl_cbdata cb;
@@ -951,13 +969,15 @@
 	zio = zio_root(spa, NULL, &cb, flags);
 	vdev_uberblock_load_impl(zio, rvd, flags, &cb);
 	(void) zio_wait(zio);
-	if (cb.ubl_vd != NULL) {
-		for (i = cb.ubl_label % 2; i < VDEV_LABELS; i += 2) {
-			*config = vdev_label_read_config(cb.ubl_vd, i);
-			if (*config != NULL)
-				break;
-		}
-	}
+
+	/*
+	 * It's possible that the best uberblock was discovered on a label
+	 * that has a configuration which was written in a future txg.
+	 * Search all labels on this vdev to find the configuration that
+	 * matches the txg for our uberblock.
+	 */
+	if (cb.ubl_vd != NULL)
+		*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 


>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list