[CFR][ZFS] vdev_geom enhancements

Justin T. Gibbs gibbs at scsiguy.com
Tue Jun 14 23:06:12 UTC 2011


Modify the geom vdev provider's open behavior so that it will
only unconditionally open a device by path if the open is part
of a pool create or device add operation, and a search of all
known geom providers' label data doesn't yield a device with
matching pool and vdev GUIDs.

This fixes a bug where the wrong disk could be associated with
a vdev's configuration data when device devfs paths change due to
insert and remove events.  While ZFS detects this kind of coding
mixup and immediately flags the device as faulted before the
confusion can cause permanent data loss, a reboot was necessary
in order to resurrect the configuration.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c:
     o When opening by GUID, require both the pool and vdev
       GUIDs to match.  While it is highly unlikely for two
       vdevs to have the same vdev GUIDs, the ZFS storage
       pool allocator only guarantees they are unique within
       a pool.
     o Modify the open behavior to:
       - Open by recorded device path with GUID matching
       - If that fails, search all geom providers for a
         device with matching GUIDs.
       - If that fails and we are opening a "new to a
         pool configuration" vdev, open by path.
       - Otherwise fail the open.

Fix race conditions in the GEOM vdev provider that
can occur when both a "reopen" is attempted due to
triggering the ZFS probe code on an I/O failure while
an asynchronous vdev removal request is generated.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c:
     Initialize the private field in vdev_geom's consumer
     objects to reference ZFS's vdev object before GEOM's
     topology lock is released.  This ensures that, should
     this consumer be orphaned before ZFS's open processing
     completes, the proper data is available to post an
     async removal request.

Export physical path information to ZFS

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c:
     Subscribe to attribute change notifications and update
     vdev physical path information (in core and on disk) when
     a GEOM::physpath event indicates they have changed.

NOTE: These diffs still contain the removal of the drop of the spa
       namespace lock during vdev_geom open.  I'm not proposing to
       commit this change now as pjd has informed me that the drop
       is required to support Zvols.  I'm still looking into how to
       resolve both the lock order reversal that can occur with the
       SCL locks here and the Zvol issue.
-------------- next part --------------
diff -u -r -x cscope.out -x out -x ctl -x compile vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c SpectraBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
--- vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	2011-02-28 13:51:28.112874995 -0700
+++ SpectraBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	2011-06-08 17:12:55.953572950 -0600
@@ -84,8 +84,52 @@
 	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
 }
 
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+	vdev_t *vd;
+	spa_t *spa;
+	char *physpath;
+	int error, physpath_len;
+
+	g_topology_assert();
+
+	if (strcmp(attr, "GEOM::physpath") != 0)
+		return;
+
+	if (g_access(cp, 1, 0, 0) != 0)
+		return;
+
+	/*
+	 * Record/Update physical path information for this device.
+	 */
+	vd = cp->private;
+	spa = vd->vdev_spa;
+	physpath_len = MAXPATHLEN;
+	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+	g_access(cp, -1, 0, 0);
+	if (error == 0) {
+		int held_lock;
+
+		held_lock = spa_config_held(spa, SCL_STATE, RW_WRITER);
+		if (held_lock == 0)
+			spa_config_enter(spa, SCL_STATE, FTAG, RW_WRITER);
+
+		if (vd->vdev_physpath != NULL)
+			spa_strfree(vd->vdev_physpath);
+		vd->vdev_physpath = spa_strdup(physpath);
+
+		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+
+		if (held_lock == 0)
+			spa_config_exit(spa, SCL_STATE, FTAG);
+	}
+	g_free(physpath);
+}
+
 static struct g_consumer *
-vdev_geom_attach(struct g_provider *pp)
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
@@ -104,6 +148,7 @@
 	if (gp == NULL) {
 		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
 		gp->orphan = vdev_geom_orphan;
+		gp->attrchanged = vdev_geom_attrchanged;
 		cp = g_new_consumer(gp);
 		if (g_attach(cp, pp) != 0) {
 			g_wither_geom(gp, ENXIO);
@@ -140,6 +185,12 @@
 			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
 		}
 	}
+
+	cp->private = vd;
+
+	/* Fetch initial physical path information for this device. */
+	vdev_geom_attrchanged(cp, "GEOM::physpath");
+	
 	return (cp);
 }
 
@@ -170,20 +221,26 @@
 	}
 }
 
-static uint64_t
-nvlist_get_guid(nvlist_t *list)
+static void
+nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
 {
 	nvpair_t *elem = NULL;
-	uint64_t value;
 
+	*vguid = 0;
+	*pguid = 0;
 	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
-		if (nvpair_type(elem) == DATA_TYPE_UINT64 &&
-		    strcmp(nvpair_name(elem), "guid") == 0) {
-			VERIFY(nvpair_value_uint64(elem, &value) == 0);
-			return (value);
+		if (nvpair_type(elem) != DATA_TYPE_UINT64)
+			continue;
+
+		if (strcmp(nvpair_name(elem), ZPOOL_CONFIG_POOL_GUID) == 0) {
+			VERIFY(nvpair_value_uint64(elem, pguid) == 0);
+		} else if (strcmp(nvpair_name(elem), ZPOOL_CONFIG_GUID) == 0) {
+			VERIFY(nvpair_value_uint64(elem, vguid) == 0);
 		}
+
+		if (*pguid != 0 && *vguid != 0)
+			break;
 	}
-	return (0);
 }
 
 static int
@@ -221,8 +278,8 @@
 	return (error);
 }
 
-static uint64_t
-vdev_geom_read_guid(struct g_consumer *cp)
+static void
+vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
 {
 	struct g_provider *pp;
 	vdev_label_t *label;
@@ -230,11 +287,12 @@
 	size_t buflen;
 	uint64_t psize;
 	off_t offset, size;
-	uint64_t guid;
 	int error, l, len;
 
 	g_topology_assert_not();
 
+	*pguid = 0;
+	*vguid = 0;
 	pp = cp->provider;
 	ZFS_LOG(1, "Reading guid from %s...", pp->name);
 
@@ -244,7 +302,6 @@
 	size = sizeof(*label) + pp->sectorsize -
 	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;
 
-	guid = 0;
 	label = kmem_alloc(size, KM_SLEEP);
 	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);
 
@@ -262,16 +319,16 @@
 		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
 			continue;
 
-		guid = nvlist_get_guid(config);
+		nvlist_get_guids(config, pguid, vguid);
 		nvlist_free(config);
-		if (guid != 0)
+		if (*pguid != 0 && *vguid != 0)
 			break;
 	}
 
 	kmem_free(label, size);
-	if (guid != 0)
-		ZFS_LOG(1, "guid for %s is %ju", pp->name, (uintmax_t)guid);
-	return (guid);
+	if (*pguid != 0 && *vguid != 0)
+		ZFS_LOG(1, "guid for %s is %ju:%ju", pp->name,
+		    (uintmax_t)*pguid, (uintmax_t)*vguid);
 }
 
 static void
@@ -283,13 +340,14 @@
 }
 
 static struct g_consumer *
-vdev_geom_attach_by_guid(uint64_t guid)
+vdev_geom_attach_by_guid(vdev_t *vd)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *zgp;
 	struct g_provider *pp;
 	struct g_consumer *cp, *zcp;
 	uint64_t pguid;
+	uint64_t vguid;
 
 	g_topology_assert();
 
@@ -314,13 +372,14 @@
 					continue;
 				}
 				g_topology_unlock();
-				pguid = vdev_geom_read_guid(zcp);
+				vdev_geom_read_guids(zcp, &pguid, &vguid);
 				g_topology_lock();
 				g_access(zcp, -1, 0, 0);
 				g_detach(zcp);
-				if (pguid != guid)
+				if (pguid != spa_guid(vd->vdev_spa) ||
+				    vguid != vd->vdev_guid)
 					continue;
-				cp = vdev_geom_attach(pp);
+				cp = vdev_geom_attach(pp, vd);
 				if (cp == NULL) {
 					printf("ZFS WARNING: Unable to attach to %s.\n",
 					    pp->name);
@@ -341,7 +400,7 @@
 }
 
 static struct g_consumer *
-vdev_geom_open_by_guid(vdev_t *vd)
+vdev_geom_open_by_guids(vdev_t *vd)
 {
 	struct g_consumer *cp;
 	char *buf;
@@ -349,8 +408,9 @@
 
 	g_topology_assert();
 
-	ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
-	cp = vdev_geom_attach_by_guid(vd->vdev_guid);
+	ZFS_LOG(1, "Searching by guid [%ju:%ju].",
+	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+	cp = vdev_geom_attach_by_guid(vd);
 	if (cp != NULL) {
 		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
 		buf = kmem_alloc(len, KM_SLEEP);
@@ -359,10 +419,12 @@
 		spa_strfree(vd->vdev_path);
 		vd->vdev_path = buf;
 
-		ZFS_LOG(1, "Attach by guid [%ju] succeeded, provider %s.",
+		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
 		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
 	} else {
-		ZFS_LOG(1, "Search by guid [%ju] failed.",
+		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
 		    (uintmax_t)vd->vdev_guid);
 	}
 
@@ -374,7 +436,8 @@
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
-	uint64_t guid;
+	uint64_t pguid;
+	uint64_t vguid;
 
 	g_topology_assert();
 
@@ -382,18 +445,21 @@
 	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
 	if (pp != NULL) {
 		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
-		cp = vdev_geom_attach(pp);
+		cp = vdev_geom_attach(pp, vd);
 		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
 		    pp->sectorsize <= VDEV_PAD_SIZE) {
 			g_topology_unlock();
-			guid = vdev_geom_read_guid(cp);
+			vdev_geom_read_guids(cp, &pguid, &vguid);
 			g_topology_lock();
-			if (guid != vd->vdev_guid) {
+			if (pguid != spa_guid(vd->vdev_spa) ||
+			    vguid != vd->vdev_guid) {
 				vdev_geom_detach(cp, 0);
 				cp = NULL;
 				ZFS_LOG(1, "guid mismatch for provider %s: "
-				    "%ju != %ju.", vd->vdev_path,
-				    (uintmax_t)vd->vdev_guid, (uintmax_t)guid);
+				    "%ju:%ju != %ju:%ju.", vd->vdev_path,
+				    (uintmax_t)spa_guid(vd->vdev_spa),
+				    (uintmax_t)vd->vdev_guid,
+				    (uintmax_t)pguid, (uintmax_t)vguid);
 			} else {
 				ZFS_LOG(1, "guid match for provider %s.",
 				    vd->vdev_path);
@@ -410,7 +476,7 @@
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	size_t bufsize;
-	int error, lock;
+	int error;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
@@ -422,34 +488,48 @@
 
 	vd->vdev_tsd = NULL;
 
-	if (mutex_owned(&spa_namespace_lock)) {
-		mutex_exit(&spa_namespace_lock);
-		lock = 1;
-	} else {
-		lock = 0;
-	}
 	DROP_GIANT();
 	g_topology_lock();
 	error = 0;
 
 	/*
-	 * If we're creating or splitting a pool, just find the GEOM provider
-	 * by its name and ignore GUID mismatches.
+	 * Try using the recorded path for this device, but only
+	 * accept it if its label data contains the expected GUIDs.
 	 */
-	if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
-	    vd->vdev_spa->spa_splitting_newspa == B_TRUE)
+	cp = vdev_geom_open_by_path(vd, 1);
+	if (cp == NULL) {
+		/*
+		 * The device at vd->vdev_path doesn't have the
+		 * expected GUIDs. The disks might have merely
+		 * moved around so try all other GEOM providers
+		 * to find one with the right GUIDs.
+		 */
+		cp = vdev_geom_open_by_guids(vd);
+	}
+
+	if (cp == NULL &&
+	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+	      vd->vdev_spa->spa_load_state == SPA_LOAD_NONE) ||
+	     vd->vdev_spa->spa_splitting_newspa == B_TRUE)) {
+		/*
+		 * We are dealing with a vdev that hasn't been previously
+		 * opened (since boot), and we are not loading an
+		 * existing pool configuration (add vdev to new or
+		 * existing pool) or we are splitting a pool.
+		 * Find the GEOM provider by its name, ignoring GUID
+		 * mismatches.
+		 *
+		 * XXPOLICY: It would be safer to only allow a device
+		 *           that is labeled but missing GUID information
+		 *           to be opened in this fashion.  This would
+		 *           require a new option to the zpool command line
+		 *           tool allowing the label information to be reset 
+		 *           on a device, and augmented error reporting
+		 *           so the user can understand why their request
+		 *           failed and the required steps to repurpose
+		 *           the device.
+		 */
 		cp = vdev_geom_open_by_path(vd, 0);
-	else {
-		cp = vdev_geom_open_by_path(vd, 1);
-		if (cp == NULL) {
-			/*
-			 * The device at vd->vdev_path doesn't have the
-			 * expected guid. The disks might have merely
-			 * moved around so try all other GEOM providers
-			 * to find one with the right guid.
-			 */
-			cp = vdev_geom_open_by_guid(vd);
-		}
 	}
 
 	if (cp == NULL) {
@@ -459,11 +539,7 @@
 	    !ISP2(cp->provider->sectorsize)) {
 		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
 		    vd->vdev_path);
-
-		g_topology_lock();
 		vdev_geom_detach(cp, 0);
-		g_topology_unlock();
-
 		error = EINVAL;
 		cp = NULL;
 	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
@@ -484,18 +560,15 @@
 			cp = NULL;
 		}
 	}
+
 	g_topology_unlock();
 	PICKUP_GIANT();
-	if (lock)
-		mutex_enter(&spa_namespace_lock);
 	if (cp == NULL) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
-
-	cp->private = vd;
-	vd->vdev_tsd = cp;
 	pp = cp->provider;
+	vd->vdev_tsd = cp;
 
 	/*
 	 * Determine the actual size of the device.
@@ -513,12 +586,6 @@
 	 */
 	vd->vdev_nowritecache = B_FALSE;
 
-	if (vd->vdev_physpath != NULL)
-		spa_strfree(vd->vdev_physpath);
-	bufsize = sizeof("/dev/") + strlen(pp->name);
-	vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
-	snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
-
 	return (0);
 }
 


More information about the freebsd-fs mailing list