ZFS: i/o error - all block copies unavailable after upgrading
to r225312
Andriy Gapon
avg at FreeBSD.org
Fri Sep 16 07:20:46 UTC 2011
I've discovered another issue with both the current code and the previous
versions of the patch. So another version is at the same URL:
http://people.freebsd.org/~avg/zfs-boot-gang.diff
The problem was with gang blocks on a raidz vdev. zio_read_gang would pass a
NULL bp to vdev_raidz_read, but that function doesn't expect that and really
needs a valid bp.
Pawel, I also understood the code in zio_read that does size rounding up based
on v_ashift. That code is for vdev_raidz_read which needs the size to be
multiple of 1 << v_ashift. And, again, this would have posed a problem for
zio_read_gang, which always passed SPA_GANGBLOCKSIZE when reading a gang block
header.
So, a list of fixes and logical changes:
- correctly read gang header from raidz [*]
- decompress assembled gang block data if compressed
- verify checksum of a gang header
- verify checksum of assembled gang block data
- verify checksum of uber block
[*] - new in this version of patch.
Description of the code changes:
- remove offset parameter from zio_checksum_error
This parameter has only been used for the case of verifying checksum of a vdev
label. The offset is now passed via DVA offset field in a made up bp pointing
to the label.
zio_checksum_error now gets all checksum parameters from the bp.
- zio_read_gang new uses an artificial bp to read a gang header via zio_read
This solves all the problems with gang blocks on raidz as zio_read already has
all the code to handle raidz correctly.
- zio_read performs size rounding only if v_read == vdev_raidz_read
This is to make the intention of the code more clear. And also to slightly
optimize non-raidz cases with non-default ashift where an unnecessary
intermediate buffer would otherwise be used.
Some inline comments (marked with %):
Index: sys/cddl/boot/zfs/zfssubr.c
===================================================================
--- sys/cddl/boot/zfs/zfssubr.c (revision 225581)
+++ sys/cddl/boot/zfs/zfssubr.c (working copy)
@@ -181,14 +181,17 @@
}
static int
-zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
+zio_checksum_error(const blkptr_t *bp, void *data)
{
- unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
BP_GET_CHECKSUM(bp);
- uint64_t size = BP_GET_PSIZE(bp);
+ uint64_t size;
+ unsigned int checksum;
zio_checksum_info_t *ci;
zio_cksum_t actual_cksum, expected_cksum, verifier;
int byteswap;
+ checksum = BP_GET_CHECKSUM(bp);
+ size = BP_GET_PSIZE(bp);
+
%
% checksum type and size are always taken from the bp.
% Note that BP_IS_GANG(bp) doesn't imply that the caller wants to verify
% the gang header (as was assumed before); the caller want want to verify
% the whole assembled data as well.
%
if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
return (EINVAL);
ci = &zio_checksum_table[checksum];
@@ -206,7 +209,8 @@
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
zio_checksum_gang_verifier(&verifier, bp);
else if (checksum == ZIO_CHECKSUM_LABEL)
- zio_checksum_label_verifier(&verifier, offset);
+ zio_checksum_label_verifier(&verifier,
+ DVA_GET_OFFSET(BP_IDENTITY(bp)));
%
% label offset is taken from the bp now.
%
else
verifier = bp->blk_cksum;
@@ -224,7 +228,6 @@
byteswap_uint64_array(&expected_cksum,
sizeof (zio_cksum_t));
} else {
- ASSERT(!BP_IS_GANG(bp));
%
% the assert is no longer valid as we pass a gang block bp
% when verifying checksum of assembled data
%
expected_cksum = bp->blk_cksum;
ci->ci_func[0](data, size, &actual_cksum);
}
@@ -1248,7 +1251,7 @@
raidz_checksum_verify(const blkptr_t *bp, void *data)
{
- return (zio_checksum_error(bp, data, 0));
+ return (zio_checksum_error(bp, data));
}
/*
Index: sys/boot/zfs/zfsimpl.c
===================================================================
--- sys/boot/zfs/zfsimpl.c (revision 225581)
+++ sys/boot/zfs/zfsimpl.c (working copy)
@@ -347,7 +347,7 @@
rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
if (rc)
return (rc);
- if (bp && zio_checksum_error(bp, buf, offset))
+ if (bp && zio_checksum_error(bp, buf))
%
% the bp can still be NULL here: when raidz code reads blocks of data
% it doesn't pass any bp, because there is really no bp for that data.
% this should be the only case, in all other cases a valid bp must be provided.
%
return (EIO);
return (0);
@@ -800,6 +800,7 @@
BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
%
% as described above, the offset is now passed via DVA offset field
%
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
return (EIO);
@@ -941,7 +942,7 @@
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
- if (vdev_read_phys(vdev, NULL, upbuf, off, VDEV_UBERBLOCK_SIZE(vdev)))
+ if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
%
% pass the artificial uberblock bp to vdev_read_phys, so that it
% can call zio_checksum_error and verify the checksum
%
continue;
if (up->ub_magic != UBERBLOCK_MAGIC)
@@ -974,34 +975,39 @@
}
static int
-zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
+zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
{
+ blkptr_t gbh_bp;
zio_gbh_phys_t zio_gb;
- vdev_t *vdev;
- int vdevid;
- off_t offset;
+ char *pbuf;
int i;
- vdevid = DVA_GET_VDEV(dva);
- offset = DVA_GET_OFFSET(dva);
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
- if (vdev->v_id == vdevid)
- break;
- if (!vdev || !vdev->v_read)
+ /* Artificial BP for gang block header. */
+ gbh_bp = *bp;
+ BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+ BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+ BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
+ BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
+ for (i = 0; i < SPA_DVAS_PER_BP; i++)
+ DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
%
% the artifical bp for the gang header.
% it has PSIZE and LSIZE of SPA_GANGBLOCKSIZE.
% cheksum is set ZIO_CHECKSUM_GANG_HEADER, so that zio_checksum_error
% does the right thing.
% compression is set to ZIO_COMPRESS_OFF, so that zio_read does the right thing.
% the gang bit is cleared from the DVAs - we read only the gang header block and
% do not want to create endless recursion here.
% vdevs and offsets are preserved in the DVAs of the BP.
%
+
+ /* Read gang header block using the artificial BP. */
+ if (zio_read(spa, &gbh_bp, &zio_gb))
return (EIO);
- if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
- return (EIO);
%
% so now smarter zio_read can be used instead of dumber v_read.
% Which, again, would have crashed on NULL bp if v_read == vdev_raidz_read
%
+ pbuf = buf;
%
% keep the original buffer pointer, use pbuf to populate the buffer
%
for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
blkptr_t *gbp = &zio_gb.zg_blkptr[i];
if (BP_IS_HOLE(gbp))
continue;
- if (zio_read(spa, gbp, buf))
+ if (zio_read(spa, gbp, pbuf))
return (EIO);
- buf = (char*)buf + BP_GET_PSIZE(gbp);
+ pbuf += BP_GET_PSIZE(gbp);
}
-
+
+ if (zio_checksum_error(bp, buf))
+ return (EIO);
%
% the original pointer is used to verify checksum of the assembled data
% of the gang block.
% Note: this is where the gang bp is passed to zio_checksum_error.
%
return (0);
}
@@ -1024,46 +1030,41 @@
if (!dva->dva_word[0] && !dva->dva_word[1])
continue;
- if (DVA_GET_GANG(dva)) {
- error = zio_read_gang(spa, bp, dva, buf);
- if (error != 0)
- continue;
- } else {
- vdevid = DVA_GET_VDEV(dva);
- offset = DVA_GET_OFFSET(dva);
- STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
- if (vdev->v_id == vdevid)
- break;
- }
- if (!vdev || !vdev->v_read)
- continue;
+ vdevid = DVA_GET_VDEV(dva);
+ offset = DVA_GET_OFFSET(dva);
+ STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+ if (vdev->v_id == vdevid)
+ break;
+ }
+ if (!vdev || !vdev->v_read)
+ continue;
- size = BP_GET_PSIZE(bp);
+ size = BP_GET_PSIZE(bp);
+ if (vdev->v_read == vdev_raidz_read) {
align = 1ULL << vdev->v_top->v_ashift;
if (P2PHASE(size, align) != 0)
size = P2ROUNDUP(size, align);
%
% do size rounding up only if v_read == vdev_raidz_read,
% because only vdev_raidz_read requires that.
% vdev_read_phys requires only 512 byte granularity.
%
- if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
- pbuf = zfs_alloc(size);
- else
- pbuf = buf;
+ }
+ if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
+ pbuf = zfs_alloc(size);
+ else
+ pbuf = buf;
%
% So use temporary buffer only if either the block is compressed or
% vdev_raidz_read requires a larger buffer
%
+ if (DVA_GET_GANG(dva))
+ error = zio_read_gang(spa, bp, pbuf);
+ else
error = vdev->v_read(vdev, bp, pbuf, offset, size);
- if (error == 0) {
- if (cpfunc != ZIO_COMPRESS_OFF) {
- error = zio_decompress_data(cpfunc,
- pbuf, BP_GET_PSIZE(bp), buf,
- BP_GET_LSIZE(bp));
- } else if (size != BP_GET_PSIZE(bp)) {
- bcopy(pbuf, buf, BP_GET_PSIZE(bp));
- }
- }
- if (buf != pbuf)
- zfs_free(pbuf, size);
- if (error != 0)
- continue;
+ if (error == 0) {
+ if (cpfunc != ZIO_COMPRESS_OFF)
+ error = zio_decompress_data(cpfunc, pbuf,
+ BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
+ else if (size != BP_GET_PSIZE(bp))
+ bcopy(pbuf, buf, BP_GET_PSIZE(bp));
}
%
% Decompression is now done for the gang blocks too.
% Ditto for the use of larger buffers in the raidz case.
%
- error = 0;
- break;
+ if (buf != pbuf)
+ zfs_free(pbuf, size);
+ if (error == 0)
+ break;
}
if (error != 0)
printf("ZFS: i/o error - all block copies unavailable\n");
--
Andriy Gapon
More information about the freebsd-fs
mailing list