svn commit: r356003 - in stable/12: stand/libsa/zfs sys/cddl/boot/zfs

Toomas Soome tsoome at FreeBSD.org
Sun Dec 22 08:22:03 UTC 2019


Author: tsoome
Date: Sun Dec 22 08:22:02 2019
New Revision: 356003
URL: https://svnweb.freebsd.org/changeset/base/356003

Log:
  MFC r354283, r354323, r354363, r354364, r354593, r355773, r355786:
  
  loader: we do not support booting from pool with log device
  loader: factor out label and uberblock load from vdev_probe, add MMP checks
  loader: populate nvl with data even when label_txg is 0
  loader: clean up the noise around log device
  loader: memory leak in vdev_label_read_config()
  loader: zfsimpl.c cstyle cleanup
  loader: rewrite zfs vdev initialization
  
  In some cases the pool discovery will get stuck in infinite loop while setting
  up the vdev children.
  
  To fix, we split the vdev setup into two parts, first we create vdevs based on
  configuration we do get from pool label, then, we process pool config from MOS
  and update the pool config if needed.
  
  This patch bundle is work leading to and including fix for issue when
  in some cases the pool configuration build does end up in infinite loop.
  
  PR:		241118
  Reported by:	Ryan Moeller

Modified:
  stable/12/stand/libsa/zfs/zfsimpl.c
  stable/12/sys/cddl/boot/zfs/zfsimpl.h
  stable/12/sys/cddl/boot/zfs/zfssubr.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/stand/libsa/zfs/zfsimpl.c
==============================================================================
--- stable/12/stand/libsa/zfs/zfsimpl.c	Sun Dec 22 06:56:44 2019	(r356002)
+++ stable/12/stand/libsa/zfs/zfsimpl.c	Sun Dec 22 08:22:02 2019	(r356003)
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/stat.h>
 #include <sys/stdint.h>
 #include <sys/list.h>
+#include <machine/_inttypes.h>
 
 #include "zfsimpl.h"
 #include "zfssubr.c"
@@ -246,7 +247,7 @@ nvlist_find(const unsigned char *nvlist, const char *n
 		const char *pairname;
 
 		xdr_int(&p, &namelen);
-		pairname = (const char*) p;
+		pairname = (const char *)p;
 		p += roundup(namelen, 4);
 		xdr_int(&p, &pairtype);
 
@@ -260,12 +261,12 @@ nvlist_find(const unsigned char *nvlist, const char *n
 			} else if (type == DATA_TYPE_STRING) {
 				int len;
 				xdr_int(&p, &len);
-				(*(const char**) valuep) = (const char*) p;
+				(*(const char **)valuep) = (const char *)p;
 				return (0);
-			} else if (type == DATA_TYPE_NVLIST
-				   || type == DATA_TYPE_NVLIST_ARRAY) {
-				(*(const unsigned char**) valuep) =
-					 (const unsigned char*) p;
+			} else if (type == DATA_TYPE_NVLIST ||
+			    type == DATA_TYPE_NVLIST_ARRAY) {
+				(*(const unsigned char **)valuep) =
+				    (const unsigned char *)p;
 				return (0);
 			} else {
 				return (EIO);
@@ -310,7 +311,7 @@ nvlist_check_features_for_read(const unsigned char *nv
 		found = 0;
 
 		xdr_int(&p, &namelen);
-		pairname = (const char*) p;
+		pairname = (const char *)p;
 		p += roundup(namelen, 4);
 		xdr_int(&p, &pairtype);
 
@@ -416,7 +417,7 @@ nvlist_print(const unsigned char *nvlist, unsigned int
 		const char *pairname;
 
 		xdr_int(&p, &namelen);
-		pairname = (const char*) p;
+		pairname = (const char *)p;
 		p += roundup(namelen, 4);
 		xdr_int(&p, &pairtype);
 
@@ -491,12 +492,12 @@ vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void 
 
 	/*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
 	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
-	if (rc)
-		return (rc);
-	if (bp != NULL)
-		return (zio_checksum_verify(vdev->spa, bp, buf));
+	if (rc == 0) {
+		if (bp != NULL)
+			rc = zio_checksum_verify(vdev->v_spa, bp, buf);
+	}
 
-	return (0);
+	return (rc);
 }
 
 typedef struct remap_segment {
@@ -564,6 +565,7 @@ vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *
 
 	vim->vim_havecounts =
 	    (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);
+
 	return (vim);
 }
 
@@ -774,8 +776,10 @@ static vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *rvd;
+	vdev_list_t *vlist;
 
-	STAILQ_FOREACH(rvd, &spa->spa_vdevs, v_childlink)
+	vlist = &spa->spa_root_vdev->v_children;
+	STAILQ_FOREACH(rvd, vlist, v_childlink)
 		if (rvd->v_id == vdev)
 			break;
 
@@ -838,7 +842,7 @@ static void
 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
 {
 	list_t stack;
-	spa_t *spa = vd->spa;
+	spa_t *spa = vd->v_spa;
 	zio_t *zio = arg;
 	remap_segment_t *rs;
 
@@ -896,7 +900,6 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6
 			 */
 			if (zio->io_error != 0)
 				break;
-
 			rs->rs_offset += inner_size;
 			rs->rs_asize -= inner_size;
 			rs->rs_split_offset += inner_size;
@@ -932,19 +935,20 @@ static int
 vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
     off_t offset, size_t bytes)
 {
-	zio_t zio = { 0 };
-	spa_t *spa = vdev->spa;
-	indirect_vsd_t *iv = malloc(sizeof (*iv));
+	zio_t zio;
+	spa_t *spa = vdev->v_spa;
+	indirect_vsd_t *iv;
 	indirect_split_t *first;
 	int rc = EIO;
 
+	iv = calloc(1, sizeof(*iv));
 	if (iv == NULL)
 		return (ENOMEM);
-	bzero(iv, sizeof (*iv));
 
 	list_create(&iv->iv_splits,
 	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
 
+	bzero(&zio, sizeof(zio));
 	zio.io_spa = spa;
 	zio.io_bp = (blkptr_t *)bp;
 	zio.io_data = buf;
@@ -1083,39 +1087,72 @@ vdev_create(uint64_t guid, vdev_read_t *_read)
 	vdev_t *vdev;
 	vdev_indirect_config_t *vic;
 
-	vdev = malloc(sizeof(vdev_t));
-	memset(vdev, 0, sizeof(vdev_t));
-	STAILQ_INIT(&vdev->v_children);
-	vdev->v_guid = guid;
-	vdev->v_state = VDEV_STATE_OFFLINE;
-	vdev->v_read = _read;
+	vdev = calloc(1, sizeof(vdev_t));
+	if (vdev != NULL) {
+		STAILQ_INIT(&vdev->v_children);
+		vdev->v_guid = guid;
+		vdev->v_read = _read;
 
-	vic = &vdev->vdev_indirect_config;
-	vic->vic_prev_indirect_vdev = UINT64_MAX;
-	STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
+		/*
+		 * root vdev has no read function.
+		 * We only point root vdev from spa.
+		 */
+		if (_read != NULL) {
+			vic = &vdev->vdev_indirect_config;
+			vic->vic_prev_indirect_vdev = UINT64_MAX;
+			STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
+		}
+	}
 
 	return (vdev);
 }
 
+static void
+vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist)
+{
+	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
+	uint64_t is_log;
+
+	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
+	is_log = 0;
+	(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
+	    &is_offline);
+	(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
+	    &is_removed);
+	(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
+	    &is_faulted);
+	(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
+	    NULL, &is_degraded);
+	(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
+	    NULL, &isnt_present);
+	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
+	    &is_log);
+
+	if (is_offline != 0)
+		vdev->v_state = VDEV_STATE_OFFLINE;
+	else if (is_removed != 0)
+		vdev->v_state = VDEV_STATE_REMOVED;
+	else if (is_faulted != 0)
+		vdev->v_state = VDEV_STATE_FAULTED;
+	else if (is_degraded != 0)
+		vdev->v_state = VDEV_STATE_DEGRADED;
+	else if (isnt_present != 0)
+		vdev->v_state = VDEV_STATE_CANT_OPEN;
+
+	vdev->v_islog = is_log == 1;
+}
+
 static int
-vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
-    vdev_t **vdevp, int is_newer)
+vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp)
 {
-	int rc;
-	uint64_t guid, id, ashift, asize, nparity;
-	const char *type;
+	uint64_t id, ashift, asize, nparity;
 	const char *path;
-	vdev_t *vdev, *kid;
-	const unsigned char *kids;
-	int nkids, i, is_new;
-	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
+	const char *type;
+	vdev_t *vdev;
 
-	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
-	    NULL, &guid)
-	    || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id)
-	    || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) ||
+	    nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
 	    NULL, &type)) {
-		printf("ZFS: can't find vdev details\n");
 		return (ENOENT);
 	}
 
@@ -1131,139 +1168,224 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vde
 		return (EIO);
 	}
 
-	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
+	if (strcmp(type, VDEV_TYPE_MIRROR) == 0)
+		vdev = vdev_create(guid, vdev_mirror_read);
+	else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0)
+		vdev = vdev_create(guid, vdev_raidz_read);
+	else if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
+		vdev = vdev_create(guid, vdev_replacing_read);
+	else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) {
+		vdev_indirect_config_t *vic;
 
-	nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
-			&is_offline);
-	nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
-			&is_removed);
-	nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
-			&is_faulted);
-	nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
-			&is_degraded);
-	nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
-			&isnt_present);
-
-	vdev = vdev_find(guid);
-	if (!vdev) {
-		is_new = 1;
-
-		if (!strcmp(type, VDEV_TYPE_MIRROR))
-			vdev = vdev_create(guid, vdev_mirror_read);
-		else if (!strcmp(type, VDEV_TYPE_RAIDZ))
-			vdev = vdev_create(guid, vdev_raidz_read);
-		else if (!strcmp(type, VDEV_TYPE_REPLACING))
-			vdev = vdev_create(guid, vdev_replacing_read);
-		else if (!strcmp(type, VDEV_TYPE_INDIRECT)) {
-			vdev_indirect_config_t *vic;
-
-			vdev = vdev_create(guid, vdev_indirect_read);
+		vdev = vdev_create(guid, vdev_indirect_read);
+		if (vdev != NULL) {
 			vdev->v_state = VDEV_STATE_HEALTHY;
 			vic = &vdev->vdev_indirect_config;
 
 			nvlist_find(nvlist,
-			    ZPOOL_CONFIG_INDIRECT_OBJECT, DATA_TYPE_UINT64,
+			    ZPOOL_CONFIG_INDIRECT_OBJECT,
+			    DATA_TYPE_UINT64,
 			    NULL, &vic->vic_mapping_object);
 			nvlist_find(nvlist,
-			    ZPOOL_CONFIG_INDIRECT_BIRTHS, DATA_TYPE_UINT64,
+			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
+			    DATA_TYPE_UINT64,
 			    NULL, &vic->vic_births_object);
 			nvlist_find(nvlist,
-			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV, DATA_TYPE_UINT64,
+			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+			    DATA_TYPE_UINT64,
 			    NULL, &vic->vic_prev_indirect_vdev);
-		} else
-			vdev = vdev_create(guid, vdev_disk_read);
-
-		vdev->v_id = id;
-		vdev->v_top = pvdev != NULL ? pvdev : vdev;
-		if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
-			DATA_TYPE_UINT64, NULL, &ashift) == 0) {
-			vdev->v_ashift = ashift;
-		} else {
-			vdev->v_ashift = 0;
 		}
-		if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
-		    DATA_TYPE_UINT64, NULL, &asize) == 0) {
-			vdev->v_psize = asize +
-			    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
-		}
-		if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
-			DATA_TYPE_UINT64, NULL, &nparity) == 0) {
-			vdev->v_nparity = nparity;
-		} else {
-			vdev->v_nparity = 0;
-		}
-		if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
-				DATA_TYPE_STRING, NULL, &path) == 0) {
-			if (strncmp(path, "/dev/", 5) == 0)
-				path += 5;
-			vdev->v_name = strdup(path);
-		} else {
-			char *name;
+	} else {
+		vdev = vdev_create(guid, vdev_disk_read);
+	}
 
-			if (!strcmp(type, "raidz")) {
-				if (vdev->v_nparity < 1 ||
-				    vdev->v_nparity > 3) {
-					printf("ZFS: can only boot from disk, "
-					    "mirror, raidz1, raidz2 and raidz3 "
-					    "vdevs\n");
-					return (EIO);
-				}
-				asprintf(&name, "%s%d-%jd", type,
-				    vdev->v_nparity, id);
-			} else {
-				asprintf(&name, "%s-%jd", type, id);
+	if (vdev == NULL)
+		return (ENOMEM);
+
+	vdev_set_initial_state(vdev, nvlist);
+	vdev->v_id = id;
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
+	    DATA_TYPE_UINT64, NULL, &ashift) == 0)
+		vdev->v_ashift = ashift;
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
+	    DATA_TYPE_UINT64, NULL, &asize) == 0) {
+		vdev->v_psize = asize +
+		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+	}
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
+	    DATA_TYPE_UINT64, NULL, &nparity) == 0)
+		vdev->v_nparity = nparity;
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+	    DATA_TYPE_STRING, NULL, &path) == 0) {
+		if (strncmp(path, "/dev/", 5) == 0)
+			path += 5;
+		vdev->v_name = strdup(path);
+	} else {
+		char *name;
+
+		name = NULL;
+		if (strcmp(type, "raidz") == 0) {
+			if (vdev->v_nparity < 1 ||
+			    vdev->v_nparity > 3) {
+				printf("ZFS: can only boot from disk, "
+				    "mirror, raidz1, raidz2 and raidz3 "
+				    "vdevs\n");
+				return (EIO);
 			}
-			if (name == NULL)
-				return (ENOMEM);
-			vdev->v_name = name;
+			(void) asprintf(&name, "%s%d-%" PRIu64, type,
+			    vdev->v_nparity, id);
+		} else {
+			(void) asprintf(&name, "%s-%" PRIu64, type, id);
 		}
-	} else {
-		is_new = 0;
+		vdev->v_name = name;
 	}
+	*vdevp = vdev;
+	return (0);
+}
 
-	if (is_new || is_newer) {
-		/*
-		 * This is either new vdev or we've already seen this vdev,
-		 * but from an older vdev label, so let's refresh its state
-		 * from the newer label.
-		 */
-		if (is_offline)
-			vdev->v_state = VDEV_STATE_OFFLINE;
-		else if (is_removed)
-			vdev->v_state = VDEV_STATE_REMOVED;
-		else if (is_faulted)
-			vdev->v_state = VDEV_STATE_FAULTED;
-		else if (is_degraded)
-			vdev->v_state = VDEV_STATE_DEGRADED;
-		else if (isnt_present)
-			vdev->v_state = VDEV_STATE_CANT_OPEN;
+/*
+ * Find slot for vdev. We return either NULL to signal to use
+ * STAILQ_INSERT_HEAD, or we return link element to be used with
+ * STAILQ_INSERT_AFTER.
+ */
+static vdev_t *
+vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
+{
+	vdev_t *v, *previous;
+
+	if (STAILQ_EMPTY(&top_vdev->v_children))
+		return (NULL);
+
+	previous = NULL;
+	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
+		if (v->v_id > vdev->v_id)
+			return (previous);
+
+		if (v->v_id == vdev->v_id)
+			return (v);
+
+		if (v->v_id < vdev->v_id)
+			previous = v;
 	}
+	return (previous);
+}
 
-	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
-	    &nkids, &kids);
+static size_t
+vdev_child_count(vdev_t *vdev)
+{
+	vdev_t *v;
+	size_t count;
+
+	count = 0;
+	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
+		count++;
+	}
+	return (count);
+}
+
+/*
+ * Insert vdev into top_vdev children list. List is ordered by v_id.
+ */
+static void
+vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
+{
+	vdev_t *previous;
+	size_t count;
+
 	/*
-	 * Its ok if we don't have any kids.
+	 * The top level vdev can appear in random order, depending how
+	 * the firmware is presenting the disk devices.
+	 * However, we will insert vdev to create list ordered by v_id,
+	 * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER
+	 * as STAILQ does not have insert before.
 	 */
+	previous = vdev_find_previous(top_vdev, vdev);
+
+	if (previous == NULL) {
+		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
+		count = vdev_child_count(top_vdev);
+		if (top_vdev->v_nchildren < count)
+			top_vdev->v_nchildren = count;
+		return;
+	}
+
+	if (previous->v_id == vdev->v_id)
+		return;
+
+	STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev, v_childlink);
+	count = vdev_child_count(top_vdev);
+	if (top_vdev->v_nchildren < count)
+		top_vdev->v_nchildren = count;
+}
+
+static int
+vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist)
+{
+	vdev_t *top_vdev, *vdev;
+	const unsigned char *kids;
+	int rc, nkids;
+
+	/* Get top vdev. */
+	top_vdev = vdev_find(top_guid);
+	if (top_vdev == NULL) {
+		rc = vdev_init(top_guid, nvlist, &top_vdev);
+		if (rc != 0)
+			return (rc);
+		top_vdev->v_spa = spa;
+		top_vdev->v_top = top_vdev;
+		vdev_insert(spa->spa_root_vdev, top_vdev);
+	}
+
+	/* Add children if there are any. */
+	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
+	    &nkids, &kids);
 	if (rc == 0) {
-		vdev->v_nchildren = nkids;
-		for (i = 0; i < nkids; i++) {
-			rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
-			if (rc)
+		for (int i = 0; i < nkids; i++) {
+			uint64_t guid;
+
+			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
+			    DATA_TYPE_UINT64, NULL, &guid);
+			if (rc != 0)
 				return (rc);
-			if (is_new)
-				STAILQ_INSERT_TAIL(&vdev->v_children, kid,
-						   v_childlink);
+			rc = vdev_init(guid, kids, &vdev);
+			if (rc != 0)
+				return (rc);
+
+			vdev->v_spa = spa;
+			vdev->v_top = top_vdev;
+			vdev_insert(top_vdev, vdev);
+
 			kids = nvlist_next(kids);
 		}
 	} else {
-		vdev->v_nchildren = 0;
+		rc = 0;
 	}
 
-	if (vdevp)
-		*vdevp = vdev;
-	return (0);
+	return (rc);
 }
 
+static int
+vdev_init_from_label(spa_t *spa, const unsigned char *nvlist)
+{
+	uint64_t pool_guid, top_guid;
+	const unsigned char *vdevs;
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
+	    NULL, &pool_guid) ||
+	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
+	    NULL, &top_guid) ||
+	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
+	    NULL, &vdevs)) {
+		printf("ZFS: can't find vdev details\n");
+		return (ENOENT);
+	}
+
+	return (vdev_from_nvlist(spa, top_guid, vdevs));
+}
+
 static void
 vdev_set_state(vdev_t *vdev)
 {
@@ -1271,6 +1393,10 @@ vdev_set_state(vdev_t *vdev)
 	int good_kids;
 	int bad_kids;
 
+	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
+		vdev_set_state(kid);
+	}
+
 	/*
 	 * A mirror or raidz is healthy if all its kids are healthy. A
 	 * mirror is degraded if any of its kids is healthy; a raidz
@@ -1305,6 +1431,104 @@ vdev_set_state(vdev_t *vdev)
 	}
 }
 
+static int
+vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist)
+{
+	vdev_t *vdev;
+	const unsigned char *kids;
+	int rc, nkids;
+
+	/* Update top vdev. */
+	vdev = vdev_find(top_guid);
+	if (vdev != NULL)
+		vdev_set_initial_state(vdev, nvlist);
+
+	/* Update children if there are any. */
+	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
+	    &nkids, &kids);
+	if (rc == 0) {
+		for (int i = 0; i < nkids; i++) {
+			uint64_t guid;
+
+			rc = nvlist_find(kids, ZPOOL_CONFIG_GUID,
+			    DATA_TYPE_UINT64, NULL, &guid);
+			if (rc != 0)
+				break;
+
+			vdev = vdev_find(guid);
+			if (vdev != NULL)
+				vdev_set_initial_state(vdev, kids);
+
+			kids = nvlist_next(kids);
+		}
+	} else {
+		rc = 0;
+	}
+
+	return (rc);
+}
+
+static int
+vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist)
+{
+	uint64_t pool_guid, vdev_children;
+	const unsigned char *vdevs, *kids;
+	int rc, nkids;
+
+	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
+	    NULL, &pool_guid) ||
+	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
+	    NULL, &vdev_children) ||
+	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
+	    NULL, &vdevs)) {
+		printf("ZFS: can't find vdev details\n");
+		return (ENOENT);
+	}
+
+	/* Wrong guid?! */
+	if (spa->spa_guid != pool_guid)
+		return (EIO);
+
+	spa->spa_root_vdev->v_nchildren = vdev_children;
+
+	rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
+	    &nkids, &kids);
+
+	/*
+	 * MOS config has at least one child for root vdev.
+	 */
+	if (rc != 0)
+		return (EIO);
+
+	for (int i = 0; i < nkids; i++) {
+		uint64_t guid;
+		vdev_t *vdev;
+
+		rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
+		    NULL, &guid);
+		if (rc != 0)
+			break;
+		vdev = vdev_find(guid);
+		/*
+		 * Top level vdev is missing, create it.
+		 */
+		if (vdev == NULL)
+			rc = vdev_from_nvlist(spa, guid, kids);
+		else
+			rc = vdev_update_from_nvlist(guid, kids);
+		if (rc != 0)
+			break;
+		kids = nvlist_next(kids);
+	}
+
+	/*
+	 * Re-evaluate top-level vdev state.
+	 */
+	vdev_set_state(spa->spa_root_vdev);
+
+	return (rc);
+}
+
 static spa_t *
 spa_find_by_guid(uint64_t guid)
 {
@@ -1347,7 +1571,7 @@ spa_get_primary_vdev(const spa_t *spa)
 		spa = spa_get_primary();
 	if (spa == NULL)
 		return (NULL);
-	vdev = STAILQ_FIRST(&spa->spa_vdevs);
+	vdev = spa->spa_root_vdev;
 	if (vdev == NULL)
 		return (NULL);
 	for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
@@ -1368,8 +1592,14 @@ spa_create(uint64_t guid, const char *name)
 		free(spa);
 		return (NULL);
 	}
-	STAILQ_INIT(&spa->spa_vdevs);
 	spa->spa_guid = guid;
+	spa->spa_root_vdev = vdev_create(guid, NULL);
+	if (spa->spa_root_vdev == NULL) {
+		free(spa->spa_name);
+		free(spa);
+		return (NULL);
+	}
+	spa->spa_root_vdev->v_name = strdup("root");
 	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
 
 	return (spa);
@@ -1404,9 +1634,8 @@ pager_printf(const char *fmt, ...)
 	va_list args;
 
 	va_start(args, fmt);
-	vsprintf(line, fmt, args);
+	vsnprintf(line, sizeof(line), fmt, args);
 	va_end(args);
-
 	return (pager_output(line));
 }
 
@@ -1417,14 +1646,13 @@ pager_printf(const char *fmt, ...)
 static int
 print_state(int indent, const char *name, vdev_state_t state)
 {
-	char buf[512];
 	int i;
+	char buf[512];
 
 	buf[0] = 0;
 	for (i = 0; i < indent; i++)
 		strcat(buf, "  ");
 	strcat(buf, name);
-
 	return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
 }
 
@@ -1433,6 +1661,12 @@ vdev_status(vdev_t *vdev, int indent)
 {
 	vdev_t *kid;
 	int ret;
+
+	if (vdev->v_islog) {
+		(void)pager_output("        logs\n");
+		indent++;
+	}
+
 	ret = print_state(indent, vdev->v_name, vdev->v_state);
 	if (ret != 0)
 		return (ret);
@@ -1450,6 +1684,7 @@ spa_status(spa_t *spa)
 {
 	static char bootfs[ZFS_MAXNAMELEN];
 	uint64_t rootid;
+	vdev_list_t *vlist;
 	vdev_t *vdev;
 	int good_kids, bad_kids, degraded_kids, ret;
 	vdev_state_t state;
@@ -1478,7 +1713,8 @@ spa_status(spa_t *spa)
 	good_kids = 0;
 	degraded_kids = 0;
 	bad_kids = 0;
-	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+	vlist = &spa->spa_root_vdev->v_children;
+	STAILQ_FOREACH(vdev, vlist, v_childlink) {
 		if (vdev->v_state == VDEV_STATE_HEALTHY)
 			good_kids++;
 		else if (vdev->v_state == VDEV_STATE_DEGRADED)
@@ -1496,7 +1732,8 @@ spa_status(spa_t *spa)
 	ret = print_state(0, spa->spa_name, state);
 	if (ret != 0)
 		return (ret);
-	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+
+	STAILQ_FOREACH(vdev, vlist, v_childlink) {
 		ret = vdev_status(vdev, 1);
 		if (ret != 0)
 			return (ret);
@@ -1538,71 +1775,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offs
 }
 
 static int
-vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
 {
-	vdev_t vtmp;
-	vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
-	vdev_phys_t *tmp_label;
-	spa_t *spa;
-	vdev_t *vdev, *top_vdev, *pool_vdev;
-	off_t off;
+	unsigned int seq1 = 0;
+	unsigned int seq2 = 0;
+	int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+
+	if (cmp != 0)
+		return (cmp);
+
+	cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+	if (cmp != 0)
+		return (cmp);
+
+	if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+		seq1 = MMP_SEQ(ub1);
+
+	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+		seq2 = MMP_SEQ(ub2);
+
+	return (AVL_CMP(seq1, seq2));
+}
+
+static int
+uberblock_verify(uberblock_t *ub)
+{
+	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
+		byteswap_uint64_array(ub, sizeof (uberblock_t));
+	}
+
+	if (ub->ub_magic != UBERBLOCK_MAGIC ||
+	    !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
+		return (EINVAL);
+
+	return (0);
+}
+
+static int
+vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
+    size_t size)
+{
 	blkptr_t bp;
-	const unsigned char *nvlist = NULL;
-	uint64_t val;
-	uint64_t guid;
-	uint64_t best_txg = 0;
-	uint64_t pool_txg, pool_guid;
-	const char *pool_name;
-	const unsigned char *vdevs;
-	const unsigned char *features;
-	int i, l, rc, is_newer;
-	char *upbuf;
-	const struct uberblock *up;
+	off_t off;
 
-	/*
-	 * Load the vdev label and figure out which
-	 * uberblock is most current.
-	 */
-	memset(&vtmp, 0, sizeof(vtmp));
-	vtmp.v_phys_read = _read;
-	vtmp.v_read_priv = read_priv;
-	vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
-	    (uint64_t)sizeof (vdev_label_t));
+	off = vdev_label_offset(vd->v_psize, l, offset);
 
-	/* Test for minimum pool size. */
-	if (vtmp.v_psize < SPA_MINDEVSIZE)
-		return (EIO);
+	BP_ZERO(&bp);
+	BP_SET_LSIZE(&bp, size);
+	BP_SET_PSIZE(&bp, size);
+	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
+	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 
-	tmp_label = zfs_alloc(sizeof(vdev_phys_t));
+	return (vdev_read_phys(vd, &bp, buf, off, size));
+}
 
-	for (l = 0; l < VDEV_LABELS; l++) {
-		off = vdev_label_offset(vtmp.v_psize, l,
-		    offsetof(vdev_label_t, vl_vdev_phys));
+static unsigned char *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+	vdev_phys_t *label;
+	uint64_t best_txg = 0;
+	uint64_t label_txg = 0;
+	uint64_t asize;
+	unsigned char *nvl;
+	size_t nvl_size;
+	int error;
 
-		BP_ZERO(&bp);
-		BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
-		BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
-		BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
-		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
-		DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
-		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+	label = malloc(sizeof (vdev_phys_t));
+	if (label == NULL)
+		return (NULL);
 
-		if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
-			continue;
+	nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
+	nvl = malloc(nvl_size);
+	if (nvl == NULL)
+		goto done;
 
-		if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
+	for (int l = 0; l < VDEV_LABELS; l++) {
+		const unsigned char *nvlist;
+
+		if (vdev_label_read(vd, l, label,
+		    offsetof(vdev_label_t, vl_vdev_phys),
+		    sizeof (vdev_phys_t)))
 			continue;
 
-		nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
-		if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
-		    DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
+		if (label->vp_nvlist[0] != NV_ENCODE_XDR)
 			continue;
 
-		if (best_txg <= pool_txg) {
-			uint64_t asize;
+		nvlist = (const unsigned char *) label->vp_nvlist + 4;
+		error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
+		    DATA_TYPE_UINT64, NULL, &label_txg);
+		if (error != 0 || label_txg == 0) {
+			memcpy(nvl, nvlist, nvl_size);
+			goto done;
+		}
 
-			best_txg = pool_txg;
-			memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
+		if (label_txg <= txg && label_txg > best_txg) {
+			best_txg = label_txg;
+			memcpy(nvl, nvlist, nvl_size);
 
 			/*
 			 * Use asize from pool config. We need this
@@ -1610,30 +1880,88 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
 			 */
 			if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
 			    DATA_TYPE_UINT64, NULL, &asize) == 0) {
-				vtmp.v_psize = asize +
+				vd->v_psize = asize +
 				    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 			}
 		}
 	}
 
-	zfs_free(tmp_label, sizeof (vdev_phys_t));
+	if (best_txg == 0) {
+		free(nvl);
+		nvl = NULL;
+	}
+done:
+	free(label);
+	return (nvl);
+}
 
-	if (best_txg == 0)
+static void
+vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
+{
+	uberblock_t *buf;
+
+	buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
+	if (buf == NULL)
+		return;
+
+	for (int l = 0; l < VDEV_LABELS; l++) {
+		for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+			if (vdev_label_read(vd, l, buf,
+			    VDEV_UBERBLOCK_OFFSET(vd, n),
+			    VDEV_UBERBLOCK_SIZE(vd)))
+				continue;
+			if (uberblock_verify(buf) != 0)
+				continue;
+
+			if (vdev_uberblock_compare(buf, ub) > 0)
+				*ub = *buf;
+		}
+	}
+	free(buf);
+}
+
+static int
+vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+{
+	vdev_t vtmp;
+	spa_t *spa;
+	vdev_t *vdev;
+	unsigned char *nvlist;
+	uint64_t val;
+	uint64_t guid, vdev_children;
+	uint64_t pool_txg, pool_guid;
+	const char *pool_name;
+	const unsigned char *features;
+	int rc;
+
+	/*
+	 * Load the vdev label and figure out which
+	 * uberblock is most current.
+	 */
+	memset(&vtmp, 0, sizeof(vtmp));
+	vtmp.v_phys_read = _read;
+	vtmp.v_read_priv = read_priv;
+	vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
+	    (uint64_t)sizeof (vdev_label_t));
+
+	/* Test for minimum device size. */
+	if (vtmp.v_psize < SPA_MINDEVSIZE)
 		return (EIO);
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-stable mailing list