svn commit: r229760 - in stable/8/sys: boot/zfs cddl/boot/zfs

Martin Matuska mm at FreeBSD.org
Sat Jan 7 02:23:59 UTC 2012


Author: mm
Date: Sat Jan  7 02:23:58 2012
New Revision: 229760
URL: http://svn.freebsd.org/changeset/base/229760

Log:
  MFC r226549,r226550,r226551,r226552,r226553,r226568
  
  MFC r226549 (pjd):
  Remove redundant size calculation.
  
  MFC r226550 (pjd):
  Initialize 'rc' properly before using it. This error could lead to infinite
  loop when data reconstruction was needed.
  
  MFC r226551 (pjd):
  Don't mark vdev as healthy too soon, so we won't try to use invalid vdevs.
  
  MFC r226552 (pjd):
  Never pass NULL block pointer when reading. This is neither expected nor
  handled by lower layers like vdev_raidz, which uses bp for checksum
  verification. This bug could lead to NULL pointer reference and resets
  during boot.
  
  MFC r226553 (pjd):
  Always pass data size for checksum verification function, as using
  physical block size declared in bp may not always be what we want.
  For example in case of gang block header physical block size declared
  in bp is much larger than SPA_GANGBLOCKSIZE (512 bytes) and checksum
  calculation failed. This bug could lead to accessing unallocated
  memory and resets/failures during boot.
  
  MFC r226568 (pjd) [1]:
  - Correctly read gang header from raidz.
  - Decompress assembled gang block data if compressed.
  - Verify checksum of a gang header.
  - Verify checksum of assembled gang block data.
  - Verify checksum of uber block.
  
  Submitted by:	avg [1]

Modified:
  stable/8/sys/boot/zfs/zfsimpl.c
  stable/8/sys/cddl/boot/zfs/zfssubr.c
Directory Properties:
  stable/8/sys/   (props changed)

Modified: stable/8/sys/boot/zfs/zfsimpl.c
==============================================================================
--- stable/8/sys/boot/zfs/zfsimpl.c	Sat Jan  7 02:09:49 2012	(r229759)
+++ stable/8/sys/boot/zfs/zfsimpl.c	Sat Jan  7 02:23:58 2012	(r229760)
@@ -347,7 +347,7 @@ vdev_read_phys(vdev_t *vdev, const blkpt
 	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
 	if (rc)
 		return (rc);
-	if (bp && zio_checksum_error(bp, buf, offset))
+	if (bp && zio_checksum_verify(bp, buf))
 		return (EIO);
 
 	return (0);
@@ -543,8 +543,6 @@ vdev_init_from_nvlist(const unsigned cha
 			vdev->v_state = VDEV_STATE_DEGRADED;
 		else if (isnt_present)
 			vdev->v_state = VDEV_STATE_CANT_OPEN;
-		else
-			vdev->v_state = VDEV_STATE_HEALTHY;
 	}
 
 	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
@@ -800,6 +798,7 @@ vdev_probe(vdev_phys_read_t *read, void 
 	BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
 	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
 	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
 	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 	if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
 		return (EIO);
@@ -912,6 +911,7 @@ vdev_probe(vdev_phys_read_t *read, void 
 	if (vdev) {
 		vdev->v_phys_read = read;
 		vdev->v_read_priv = read_priv;
+		vdev->v_state = VDEV_STATE_HEALTHY;
 	} else {
 		printf("ZFS: inconsistent nvlist contents\n");
 		return (EIO);
@@ -941,7 +941,7 @@ vdev_probe(vdev_phys_read_t *read, void 
 		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
 		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 
-		if (vdev_read_phys(vdev, NULL, upbuf, off, VDEV_UBERBLOCK_SIZE(vdev)))
+		if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
 			continue;
 
 		if (up->ub_magic != UBERBLOCK_MAGIC)
@@ -974,34 +974,39 @@ ilog2(int n)
 }
 
 static int
-zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
+zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
 {
+	blkptr_t gbh_bp;
 	zio_gbh_phys_t zio_gb;
-	vdev_t *vdev;
-	int vdevid;
-	off_t offset;
+	char *pbuf;
 	int i;
 
-	vdevid = DVA_GET_VDEV(dva);
-	offset = DVA_GET_OFFSET(dva);
-	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
-		if (vdev->v_id == vdevid)
-			break;
-	if (!vdev || !vdev->v_read)
-		return (EIO);
-	if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
+	/* Artificial BP for gang block header. */
+	gbh_bp = *bp;
+	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
+	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
+	for (i = 0; i < SPA_DVAS_PER_BP; i++)
+		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
+
+	/* Read gang header block using the artificial BP. */
+	if (zio_read(spa, &gbh_bp, &zio_gb))
 		return (EIO);
 
+	pbuf = buf;
 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
 		blkptr_t *gbp = &zio_gb.zg_blkptr[i];
 
 		if (BP_IS_HOLE(gbp))
 			continue;
-		if (zio_read(spa, gbp, buf))
+		if (zio_read(spa, gbp, pbuf))
 			return (EIO);
-		buf = (char*)buf + BP_GET_PSIZE(gbp);
+		pbuf += BP_GET_PSIZE(gbp);
 	}
- 
+
+	if (zio_checksum_verify(bp, buf))
+		return (EIO);
 	return (0);
 }
 
@@ -1024,46 +1029,41 @@ zio_read(spa_t *spa, const blkptr_t *bp,
 		if (!dva->dva_word[0] && !dva->dva_word[1])
 			continue;
 
-		if (DVA_GET_GANG(dva)) {
-			error = zio_read_gang(spa, bp, dva, buf);
-			if (error != 0)
-				continue;
-		} else {
-			vdevid = DVA_GET_VDEV(dva);
-			offset = DVA_GET_OFFSET(dva);
-			STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
-				if (vdev->v_id == vdevid)
-					break;
-			}
-			if (!vdev || !vdev->v_read)
-				continue;
+		vdevid = DVA_GET_VDEV(dva);
+		offset = DVA_GET_OFFSET(dva);
+		STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+			if (vdev->v_id == vdevid)
+				break;
+		}
+		if (!vdev || !vdev->v_read)
+			continue;
 
-			size = BP_GET_PSIZE(bp);
+		size = BP_GET_PSIZE(bp);
+		if (vdev->v_read == vdev_raidz_read) {
 			align = 1ULL << vdev->v_top->v_ashift;
 			if (P2PHASE(size, align) != 0)
 				size = P2ROUNDUP(size, align);
-			if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
-				pbuf = zfs_alloc(size);
-			else
-				pbuf = buf;
+		}
+		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
+			pbuf = zfs_alloc(size);
+		else
+			pbuf = buf;
 
+		if (DVA_GET_GANG(dva))
+			error = zio_read_gang(spa, bp, pbuf);
+		else
 			error = vdev->v_read(vdev, bp, pbuf, offset, size);
-			if (error == 0) {
-				if (cpfunc != ZIO_COMPRESS_OFF) {
-					error = zio_decompress_data(cpfunc,
-					    pbuf, BP_GET_PSIZE(bp), buf,
-					    BP_GET_LSIZE(bp));
-				} else if (size != BP_GET_PSIZE(bp)) {
-					bcopy(pbuf, buf, BP_GET_PSIZE(bp));
-				}
-			}
-			if (buf != pbuf)
-				zfs_free(pbuf, size);
-			if (error != 0)
-				continue;
+		if (error == 0) {
+			if (cpfunc != ZIO_COMPRESS_OFF)
+				error = zio_decompress_data(cpfunc, pbuf,
+				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
+			else if (size != BP_GET_PSIZE(bp))
+				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
 		}
-		error = 0;
-		break;
+		if (buf != pbuf)
+			zfs_free(pbuf, size);
+		if (error == 0)
+			break;
 	}
 	if (error != 0)
 		printf("ZFS: i/o error - all block copies unavailable\n");

Modified: stable/8/sys/cddl/boot/zfs/zfssubr.c
==============================================================================
--- stable/8/sys/cddl/boot/zfs/zfssubr.c	Sat Jan  7 02:09:49 2012	(r229759)
+++ stable/8/sys/cddl/boot/zfs/zfssubr.c	Sat Jan  7 02:23:58 2012	(r229760)
@@ -181,14 +181,17 @@ zio_checksum_label_verifier(zio_cksum_t 
 }
 
 static int
-zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
+zio_checksum_verify(const blkptr_t *bp, void *data)
 {
-	unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
-	uint64_t size = BP_GET_PSIZE(bp);
+	uint64_t size;
+	unsigned int checksum;
 	zio_checksum_info_t *ci;
 	zio_cksum_t actual_cksum, expected_cksum, verifier;
 	int byteswap;
 
+	checksum = BP_GET_CHECKSUM(bp);
+	size = BP_GET_PSIZE(bp);
+
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
 		return (EINVAL);
 	ci = &zio_checksum_table[checksum];
@@ -206,7 +209,8 @@ zio_checksum_error(const blkptr_t *bp, v
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 			zio_checksum_gang_verifier(&verifier, bp);
 		else if (checksum == ZIO_CHECKSUM_LABEL)
-			zio_checksum_label_verifier(&verifier, offset);
+			zio_checksum_label_verifier(&verifier,
+			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
 		else
 			verifier = bp->blk_cksum;
 
@@ -224,7 +228,6 @@ zio_checksum_error(const blkptr_t *bp, v
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
 	} else {
-		ASSERT(!BP_IS_GANG(bp));
 		expected_cksum = bp->blk_cksum;
 		ci->ci_func[0](data, size, &actual_cksum);
 	}
@@ -1215,15 +1218,10 @@ static void
 vdev_raidz_map_free(raidz_map_t *rm)
 {
 	int c;
-	size_t size;
 
 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 
-	size = 0;
-	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
-		size += rm->rm_col[c].rc_size;
-
 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 }
 
@@ -1245,10 +1243,10 @@ vdev_child(vdev_t *pvd, uint64_t devidx)
  * any ereports we generate can note it.
  */
 static int
-raidz_checksum_verify(const blkptr_t *bp, void *data)
+raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
 {
 
-	return (zio_checksum_error(bp, data, 0));
+	return (zio_checksum_verify(bp, data));
 }
 
 /*
@@ -1298,7 +1296,7 @@ raidz_parity_verify(raidz_map_t *rm)
  */
 static int
 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
-    off_t offset, int total_errors, int data_errors)
+    off_t offset, uint64_t bytes, int total_errors, int data_errors)
 {
 	raidz_col_t *rc;
 	void *orig[VDEV_RAIDZ_MAXPARITY];
@@ -1377,7 +1375,7 @@ vdev_raidz_combrec(raidz_map_t *rm, cons
 			 * success.
 			 */
 			code = vdev_raidz_reconstruct(rm, tgts, n);
-			if (raidz_checksum_verify(bp, data) == 0) {
+			if (raidz_checksum_verify(bp, data, bytes) == 0) {
 				for (i = 0; i < n; i++) {
 					c = tgts[i];
 					rc = &rm->rm_col[c];
@@ -1548,7 +1546,7 @@ reconstruct:
 	 */
 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
 		if (data_errors == 0) {
-			if (raidz_checksum_verify(bp, data) == 0) {
+			if (raidz_checksum_verify(bp, data, bytes) == 0) {
 				/*
 				 * If we read parity information (unnecessarily
 				 * as it happens since no reconstruction was
@@ -1593,7 +1591,7 @@ reconstruct:
 
 			code = vdev_raidz_reconstruct(rm, tgts, n);
 
-			if (raidz_checksum_verify(bp, data) == 0) {
+			if (raidz_checksum_verify(bp, data, bytes) == 0) {
 				/*
 				 * If we read more parity disks than were used
 				 * for reconstruction, confirm that the other
@@ -1633,7 +1631,9 @@ reconstruct:
 
 	n = 0;
 	for (c = 0; c < rm->rm_cols; c++) {
-		if (rm->rm_col[c].rc_tried)
+		rc = &rm->rm_col[c];
+
+		if (rc->rc_tried)
 			continue;
 
 		cvd = vdev_child(vd, rc->rc_devidx);
@@ -1665,8 +1665,8 @@ reconstruct:
 	if (total_errors > rm->rm_firstdatacol) {
 		error = EIO;
 	} else if (total_errors < rm->rm_firstdatacol &&
-	    (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors,
-	     data_errors)) != 0) {
+	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
+	     total_errors, data_errors)) != 0) {
 		/*
 		 * If we didn't use all the available parity for the
 		 * combinatorial reconstruction, verify that the remaining


More information about the svn-src-stable-8 mailing list