svn commit: r229760 - in stable/8/sys: boot/zfs cddl/boot/zfs
Martin Matuska
mm at FreeBSD.org
Sat Jan 7 02:24:00 UTC 2012
Author: mm
Date: Sat Jan 7 02:23:58 2012
New Revision: 229760
URL: http://svn.freebsd.org/changeset/base/229760
Log:
MFC r226549,r226550,r226551,r226552,r226553,r226568
MFC r226549 (pjd):
Remove redundant size calculation.
MFC r226550 (pjd):
Initialize 'rc' properly before using it. This error could lead to an infinite
loop when data reconstruction was needed.
MFC r226551 (pjd):
Don't mark a vdev as healthy too soon, so we won't try to use invalid vdevs.
MFC r226552 (pjd):
Never pass a NULL block pointer when reading. This is neither expected nor
handled by lower layers like vdev_raidz, which uses bp for checksum
verification. This bug could lead to a NULL pointer dereference and resets
during boot.
MFC r226553 (pjd):
Always pass the data size to the checksum verification function, as using
the physical block size declared in bp may not always be what we want.
For example, in the case of a gang block header, the physical block size
declared in bp is much larger than SPA_GANGBLOCKSIZE (512 bytes), so the
checksum calculation failed. This bug could lead to accessing unallocated
memory and resets/failures during boot.
MFC r226568 (pjd) [1]:
- Correctly read gang header from raidz.
- Decompress assembled gang block data if compressed.
- Verify checksum of a gang header.
- Verify checksum of assembled gang block data.
- Verify checksum of uber block.
Submitted by: avg [1]
Modified:
stable/8/sys/boot/zfs/zfsimpl.c
stable/8/sys/cddl/boot/zfs/zfssubr.c
Directory Properties:
stable/8/sys/ (props changed)
Modified: stable/8/sys/boot/zfs/zfsimpl.c
==============================================================================
--- stable/8/sys/boot/zfs/zfsimpl.c Sat Jan 7 02:09:49 2012 (r229759)
+++ stable/8/sys/boot/zfs/zfsimpl.c Sat Jan 7 02:23:58 2012 (r229760)
@@ -347,7 +347,7 @@ vdev_read_phys(vdev_t *vdev, const blkpt
rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
if (rc)
return (rc);
- if (bp && zio_checksum_error(bp, buf, offset))
+ if (bp && zio_checksum_verify(bp, buf))
return (EIO);
return (0);
@@ -543,8 +543,6 @@ vdev_init_from_nvlist(const unsigned cha
vdev->v_state = VDEV_STATE_DEGRADED;
else if (isnt_present)
vdev->v_state = VDEV_STATE_CANT_OPEN;
- else
- vdev->v_state = VDEV_STATE_HEALTHY;
}
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
@@ -800,6 +798,7 @@ vdev_probe(vdev_phys_read_t *read, void
BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
return (EIO);
@@ -912,6 +911,7 @@ vdev_probe(vdev_phys_read_t *read, void
if (vdev) {
vdev->v_phys_read = read;
vdev->v_read_priv = read_priv;
+ vdev->v_state = VDEV_STATE_HEALTHY;
} else {
printf("ZFS: inconsistent nvlist contents\n");
return (EIO);
@@ -941,7 +941,7 @@ vdev_probe(vdev_phys_read_t *read, void
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
- if (vdev_read_phys(vdev, NULL, upbuf, off, VDEV_UBERBLOCK_SIZE(vdev)))
+ if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
continue;
if (up->ub_magic != UBERBLOCK_MAGIC)
@@ -974,34 +974,39 @@ ilog2(int n)
}
static int
-zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
+zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
{
+ blkptr_t gbh_bp;
zio_gbh_phys_t zio_gb;
- vdev_t *vdev;
- int vdevid;
- off_t offset;
+ char *pbuf;
int i;
- vdevid = DVA_GET_VDEV(dva);
- offset = DVA_GET_OFFSET(dva);
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
- if (vdev->v_id == vdevid)
- break;
- if (!vdev || !vdev->v_read)
- return (EIO);
- if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
+ /* Artificial BP for gang block header. */
+ gbh_bp = *bp;
+ BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+ BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+ BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
+ BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
+ for (i = 0; i < SPA_DVAS_PER_BP; i++)
+ DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
+
+ /* Read gang header block using the artificial BP. */
+ if (zio_read(spa, &gbh_bp, &zio_gb))
return (EIO);
+ pbuf = buf;
for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
blkptr_t *gbp = &zio_gb.zg_blkptr[i];
if (BP_IS_HOLE(gbp))
continue;
- if (zio_read(spa, gbp, buf))
+ if (zio_read(spa, gbp, pbuf))
return (EIO);
- buf = (char*)buf + BP_GET_PSIZE(gbp);
+ pbuf += BP_GET_PSIZE(gbp);
}
-
+
+ if (zio_checksum_verify(bp, buf))
+ return (EIO);
return (0);
}
@@ -1024,46 +1029,41 @@ zio_read(spa_t *spa, const blkptr_t *bp,
if (!dva->dva_word[0] && !dva->dva_word[1])
continue;
- if (DVA_GET_GANG(dva)) {
- error = zio_read_gang(spa, bp, dva, buf);
- if (error != 0)
- continue;
- } else {
- vdevid = DVA_GET_VDEV(dva);
- offset = DVA_GET_OFFSET(dva);
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
- if (vdev->v_id == vdevid)
- break;
- }
- if (!vdev || !vdev->v_read)
- continue;
+ vdevid = DVA_GET_VDEV(dva);
+ offset = DVA_GET_OFFSET(dva);
+ STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+ if (vdev->v_id == vdevid)
+ break;
+ }
+ if (!vdev || !vdev->v_read)
+ continue;
- size = BP_GET_PSIZE(bp);
+ size = BP_GET_PSIZE(bp);
+ if (vdev->v_read == vdev_raidz_read) {
align = 1ULL << vdev->v_top->v_ashift;
if (P2PHASE(size, align) != 0)
size = P2ROUNDUP(size, align);
- if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
- pbuf = zfs_alloc(size);
- else
- pbuf = buf;
+ }
+ if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
+ pbuf = zfs_alloc(size);
+ else
+ pbuf = buf;
+ if (DVA_GET_GANG(dva))
+ error = zio_read_gang(spa, bp, pbuf);
+ else
error = vdev->v_read(vdev, bp, pbuf, offset, size);
- if (error == 0) {
- if (cpfunc != ZIO_COMPRESS_OFF) {
- error = zio_decompress_data(cpfunc,
- pbuf, BP_GET_PSIZE(bp), buf,
- BP_GET_LSIZE(bp));
- } else if (size != BP_GET_PSIZE(bp)) {
- bcopy(pbuf, buf, BP_GET_PSIZE(bp));
- }
- }
- if (buf != pbuf)
- zfs_free(pbuf, size);
- if (error != 0)
- continue;
+ if (error == 0) {
+ if (cpfunc != ZIO_COMPRESS_OFF)
+ error = zio_decompress_data(cpfunc, pbuf,
+ BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
+ else if (size != BP_GET_PSIZE(bp))
+ bcopy(pbuf, buf, BP_GET_PSIZE(bp));
}
- error = 0;
- break;
+ if (buf != pbuf)
+ zfs_free(pbuf, size);
+ if (error == 0)
+ break;
}
if (error != 0)
printf("ZFS: i/o error - all block copies unavailable\n");
Modified: stable/8/sys/cddl/boot/zfs/zfssubr.c
==============================================================================
--- stable/8/sys/cddl/boot/zfs/zfssubr.c Sat Jan 7 02:09:49 2012 (r229759)
+++ stable/8/sys/cddl/boot/zfs/zfssubr.c Sat Jan 7 02:23:58 2012 (r229760)
@@ -181,14 +181,17 @@ zio_checksum_label_verifier(zio_cksum_t
}
static int
-zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
+zio_checksum_verify(const blkptr_t *bp, void *data)
{
- unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
- uint64_t size = BP_GET_PSIZE(bp);
+ uint64_t size;
+ unsigned int checksum;
zio_checksum_info_t *ci;
zio_cksum_t actual_cksum, expected_cksum, verifier;
int byteswap;
+ checksum = BP_GET_CHECKSUM(bp);
+ size = BP_GET_PSIZE(bp);
+
if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
return (EINVAL);
ci = &zio_checksum_table[checksum];
@@ -206,7 +209,8 @@ zio_checksum_error(const blkptr_t *bp, v
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
zio_checksum_gang_verifier(&verifier, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
- zio_checksum_label_verifier(&verifier, offset);
+ zio_checksum_label_verifier(&verifier,
+ DVA_GET_OFFSET(BP_IDENTITY(bp)));
else
verifier = bp->blk_cksum;
@@ -224,7 +228,6 @@ zio_checksum_error(const blkptr_t *bp, v
byteswap_uint64_array(&expected_cksum,
sizeof (zio_cksum_t));
} else {
- ASSERT(!BP_IS_GANG(bp));
expected_cksum = bp->blk_cksum;
ci->ci_func[0](data, size, &actual_cksum);
}
@@ -1215,15 +1218,10 @@ static void
vdev_raidz_map_free(raidz_map_t *rm)
{
int c;
- size_t size;
for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
- size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
- size += rm->rm_col[c].rc_size;
-
zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
@@ -1245,10 +1243,10 @@ vdev_child(vdev_t *pvd, uint64_t devidx)
* any ereports we generate can note it.
*/
static int
-raidz_checksum_verify(const blkptr_t *bp, void *data)
+raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
{
- return (zio_checksum_error(bp, data, 0));
+ return (zio_checksum_verify(bp, data));
}
/*
@@ -1298,7 +1296,7 @@ raidz_parity_verify(raidz_map_t *rm)
*/
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
- off_t offset, int total_errors, int data_errors)
+ off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
raidz_col_t *rc;
void *orig[VDEV_RAIDZ_MAXPARITY];
@@ -1377,7 +1375,7 @@ vdev_raidz_combrec(raidz_map_t *rm, cons
* success.
*/
code = vdev_raidz_reconstruct(rm, tgts, n);
- if (raidz_checksum_verify(bp, data) == 0) {
+ if (raidz_checksum_verify(bp, data, bytes) == 0) {
for (i = 0; i < n; i++) {
c = tgts[i];
rc = &rm->rm_col[c];
@@ -1548,7 +1546,7 @@ reconstruct:
*/
if (total_errors <= rm->rm_firstdatacol - parity_untried) {
if (data_errors == 0) {
- if (raidz_checksum_verify(bp, data) == 0) {
+ if (raidz_checksum_verify(bp, data, bytes) == 0) {
/*
* If we read parity information (unnecessarily
* as it happens since no reconstruction was
@@ -1593,7 +1591,7 @@ reconstruct:
code = vdev_raidz_reconstruct(rm, tgts, n);
- if (raidz_checksum_verify(bp, data) == 0) {
+ if (raidz_checksum_verify(bp, data, bytes) == 0) {
/*
* If we read more parity disks than were used
* for reconstruction, confirm that the other
@@ -1633,7 +1631,9 @@ reconstruct:
n = 0;
for (c = 0; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_tried)
+ rc = &rm->rm_col[c];
+
+ if (rc->rc_tried)
continue;
cvd = vdev_child(vd, rc->rc_devidx);
@@ -1665,8 +1665,8 @@ reconstruct:
if (total_errors > rm->rm_firstdatacol) {
error = EIO;
} else if (total_errors < rm->rm_firstdatacol &&
- (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors,
- data_errors)) != 0) {
+ (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
+ total_errors, data_errors)) != 0) {
/*
* If we didn't use all the available parity for the
* combinatorial reconstruction, verify that the remaining
More information about the svn-src-all
mailing list