ZDB -Z?
Zaphod Beeblebrox
zbeeble at gmail.com
Wed Dec 10 20:58:05 UTC 2014
I tried applying the patch to 10.1 and to -CURRENT (11) ... and I get:
[2:9:309]root at test-c1:/usr/src/cddl/contrib/opensolaris/cmd/zdb> patch <~dgilbert/zdb-z-patch
Hmm... Looks like a unified diff to me...
The text leading up to this was:
--------------------------
|diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
|index c265c99..bf43ea1 100644
|--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
|+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
--------------------------
Patching file zdb.c using Plan A...
Hunk #1 succeeded at 59.
Hunk #2 succeeded at 3085 with fuzz 1 (offset 339 lines).
Hunk #3 succeeded at 3305 with fuzz 2 (offset 339 lines).
Hunk #4 succeeded at 3329 with fuzz 2 (offset 339 lines).
Hunk #5 failed at 3408.
Hunk #6 failed at 3644.
Hunk #7 failed at 3659.
Hunk #8 failed at 3718.
Hunk #9 failed at 3849.
5 out of 9 hunks failed--saving rejects to zdb.c.rej
done
... what version of FreeBSD is this patch against?
On Tue, Dec 9, 2014 at 7:23 PM, Andrew Heybey <ath at niksun.com> wrote:
> On 11/24/14 1:49 PM, Zaphod Beeblebrox wrote:
> > I'm reading about someone else's recovery of files from a damaged ZFS
> > partition. He claims to have added (possibly to opensolaris or whatnot) an
> > argument to zdb '-Z' ... which operates somewhat like -R, but which
> > highlights what parts of the region are on what physical disks, and which
> > are parity.
> >
> > Has anyone patched this into FreeBSD?
>
> Sorry for the late reply, I am behind on my mailing list reading.
>
> I assume you were looking at this post:
>
> http://mbruning.blogspot.com/2009_12_01_archive.html
>
> I was also recently trying to recover data in a ZFS pool. I made an ugly
> attempt at -Z for zdb. It will not work for anything but RAIDZ pools (I
> tried it on one containing two 6-disk raidz1 vdevs). The diff (against
> FreeBSD 10) is in this email.
>
> I copy-pasted the static function vdev_raidz_map() out of libzfs since it
> is static and not callable externally. Not very tasteful but it worked for
> me.
>
> andrew
>
> commit 86ab9e2dab7e76dcdf527d2aa6b84a2fe429ee28
> Author: Andrew Heybey <ath at niksun.com>
> Date: Tue Nov 18 15:00:57 2014 -0500
>
> zdb: Add -Z flag like
> http://mbruning.blogspot.com/2009/12/zfs-raidz-data-walk.html
>
> diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
> index c265c99..bf43ea1 100644
> --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
> +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
> @@ -59,6 +59,7 @@
> #include <sys/ddt.h>
> #include <sys/zfeature.h>
> #include <zfs_comutil.h>
> +#include <sys/vdev_raidz.h>
> #undef ZFS_MAXNAMELEN
> #undef verify
> #include <libzfs.h>
> @@ -2745,6 +2746,168 @@ zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
> }
> }
>
> +
> +typedef struct raidz_col {
> + uint64_t rc_devidx; /* child device index for I/O */
> + uint64_t rc_offset; /* device offset */
> + uint64_t rc_size; /* I/O size */
> + void *rc_data; /* I/O data */
> + void *rc_gdata; /* used to store the "good"
> version */
> + int rc_error; /* I/O error for this device */
> + uint8_t rc_tried; /* Did we attempt this I/O column?
> */
> + uint8_t rc_skipped; /* Did we skip this I/O column? */
> +} raidz_col_t;
> +
> +typedef struct raidz_map {
> + uint64_t rm_cols; /* Regular column count */
> + uint64_t rm_scols; /* Count including skipped columns
> */
> + uint64_t rm_bigcols; /* Number of oversized columns */
> + uint64_t rm_asize; /* Actual total I/O size */
> + uint64_t rm_missingdata; /* Count of missing data devices */
> + uint64_t rm_missingparity; /* Count of missing parity devices
> */
> + uint64_t rm_firstdatacol; /* First data column/parity count
> */
> + uint64_t rm_nskip; /* Skipped sectors for padding */
> + uint64_t rm_skipstart; /* Column index of padding start */
> + void *rm_datacopy; /* rm_asize-buffer of copied data
> */
> + uintptr_t rm_reports; /* # of referencing checksum
> reports */
> + uint8_t rm_freed; /* map no longer has referencing
> ZIO */
> + uint8_t rm_ecksuminjected; /* checksum error was injected */
> + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
> +} raidz_map_t;
> +
> +/*
> + * Divides the IO evenly across all child vdevs; usually, dcols is
> + * the number of children in the target vdev.
> + *
> + * copy-pasted from vdev_raidz in the ZFS sources
> + */
> +raidz_map_t*
> +vdev_raidz_map(uint64_t size, uint64_t offset, uint64_t unit_shift,
> + uint64_t dcols, uint64_t nparity)
> +{
> + raidz_map_t* rm;
> + /* The starting RAIDZ (parent) vdev sector of the block. */
> + uint64_t b = offset >> unit_shift;
> + /* The zio's size in units of the vdev's minimum sector size. */
> + uint64_t s = size >> unit_shift;
> + /* The first column for this stripe. */
> + uint64_t f = b % dcols;
> + /* The starting byte offset on each child vdev. */
> + uint64_t o = (b / dcols) << unit_shift;
> + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
> +
> + /*
> + * "Quotient": The number of data sectors for this stripe on all
> but
> + * the "big column" child vdevs that also contain "remainder" data.
> + */
> + q = s / (dcols - nparity);
> +
> + /*
> + * "Remainder": The number of partial stripe data sectors in this
> I/O.
> + * This will add a sector to some, but not all, child vdevs.
> + */
> + r = s - q * (dcols - nparity);
> +
> + /* The number of "big columns" - those which contain remainder
> data. */
> + bc = (r == 0 ? 0 : r + nparity);
> +
> + /*
> + * The total number of data and parity sectors associated with
> + * this I/O.
> + */
> + tot = s + nparity * (q + (r == 0 ? 0 : 1));
> +
> + /* acols: The columns that will be accessed. */
> + /* scols: The columns that will be accessed or skipped. */
> + if (q == 0) {
> + /* Our I/O request doesn't span all child vdevs. */
> + acols = bc;
> + scols = MIN(dcols, roundup(bc, nparity + 1));
> + } else {
> + acols = dcols;
> + scols = dcols;
> + }
> +
> + rm = umem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
> +
> + rm->rm_cols = acols;
> + rm->rm_scols = scols;
> + rm->rm_bigcols = bc;
> + rm->rm_skipstart = bc;
> + rm->rm_missingdata = 0;
> + rm->rm_missingparity = 0;
> + rm->rm_firstdatacol = nparity;
> + rm->rm_datacopy = NULL;
> + rm->rm_reports = 0;
> + rm->rm_freed = 0;
> + rm->rm_ecksuminjected = 0;
> +
> + asize = 0;
> +
> + for (c = 0; c < scols; c++) {
> + col = f + c;
> + coff = o;
> + if (col >= dcols) {
> + col -= dcols;
> + coff += 1ULL << unit_shift;
> + }
> + rm->rm_col[c].rc_devidx = col;
> + rm->rm_col[c].rc_offset = coff;
> + rm->rm_col[c].rc_data = NULL;
> + rm->rm_col[c].rc_gdata = NULL;
> + rm->rm_col[c].rc_error = 0;
> + rm->rm_col[c].rc_tried = 0;
> + rm->rm_col[c].rc_skipped = 0;
> +
> + if (c >= acols)
> + rm->rm_col[c].rc_size = 0;
> + else if (c < bc)
> + rm->rm_col[c].rc_size = (q + 1) << unit_shift;
> + else
> + rm->rm_col[c].rc_size = q << unit_shift;
> +
> + asize += rm->rm_col[c].rc_size;
> + }
> +
> + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
> + rm->rm_nskip = roundup(tot, nparity + 1) - tot;
> +
> + /*
> + * If all data stored spans all columns, there's a danger that
> parity
> + * will always be on the same device and, since parity isn't read
> + * during normal operation, that that device's I/O bandwidth won't
> be
> + * used effectively. We therefore switch the parity every 1MB.
> + *
> + * ... at least that was, ostensibly, the theory. As a practical
> + * matter unless we juggle the parity between all devices evenly,
> we
> + * won't see any benefit. Further, occasional writes that aren't a
> + * multiple of the LCM of the number of children and the minimum
> + * stripe width are sufficient to avoid pessimal behavior.
> + * Unfortunately, this decision created an implicit on-disk format
> + * requirement that we need to support for all eternity, but only
> + * for single-parity RAID-Z.
> + *
> + * If we intend to skip a sector in the zeroth column for padding
> + * we must make sure to note this swap. We will never intend to
> + * skip the first column since at least one data and one parity
> + * column must appear in each row.
> + */
> + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
> + devidx = rm->rm_col[0].rc_devidx;
> + o = rm->rm_col[0].rc_offset;
> + rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
> + rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
> + rm->rm_col[1].rc_devidx = devidx;
> + rm->rm_col[1].rc_offset = o;
> +
> + if (rm->rm_skipstart == 0)
> + rm->rm_skipstart = 1;
> + }
> +
> + return (rm);
> +}
> +
> +
> /*
> * There are two acceptable formats:
> * leaf_name - For example: c1t0d0 or /tmp/ztest.0a
> @@ -2803,8 +2966,10 @@ name:
> }
>
> /*
> - * Read a block from a pool and print it out. The syntax of the
> - * block descriptor is:
> + * Read a block from a pool and print it out, or (if Zflag is true)
> + * print out where the block is found on the constituents of the vdev.
> + *
> + * The syntax of the block descriptor is:
> *
> * pool:vdev_specifier:offset:size[:flags]
> *
> @@ -2825,7 +2990,7 @@ name:
> * * = not yet implemented
> */
> static void
> -zdb_read_block(char *thing, spa_t *spa)
> +zdb_read_block(char *thing, spa_t *spa, boolean_t Zflag)
> {
> blkptr_t blk, *bp = &blk;
> dva_t *dva = bp->blk_dva;
> @@ -2904,6 +3069,22 @@ zdb_read_block(char *thing, spa_t *spa)
> psize = size;
> lsize = size;
>
> + if (Zflag) {
> + raidz_map_t* rm;
> + rm = vdev_raidz_map(psize, offset, vd->vdev_ashift,
> + vd->vdev_children, vd->vdev_nparity);
> + (void) printf("columns %lu bigcols %lu asize %lu firstdatacol %lu\n",
> + rm->rm_cols, rm->rm_bigcols, rm->rm_asize,
> + rm->rm_firstdatacol);
> + for (int c = 0; c < rm->rm_scols; ++c) {
> + raidz_col_t* rc = &rm->rm_col[c];
> + (void) printf("devidx %lu offset 0x%lx size 0x%lx\n",
> + rc->rc_devidx, rc->rc_offset, rc->rc_size);
> + }
> + umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
> + return;
> + }
> +
> pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
> lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
>
> @@ -3124,7 +3305,7 @@ main(int argc, char **argv)
>
> dprintf_setup(&argc, argv);
>
> - while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) {
> + while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:PZ")) != -1) {
> switch (c) {
> case 'b':
> case 'c':
> @@ -3139,6 +3320,7 @@ main(int argc, char **argv)
> case 'D':
> case 'R':
> case 'S':
> + case 'Z':
> dump_opt[c]++;
> dump_all = 0;
> break;
> @@ -3197,6 +3379,9 @@ main(int argc, char **argv)
> if (dump_all)
> verbose = MAX(verbose, 1);
>
> + if (dump_opt['Z'])
> + dump_opt['R'] = 1;
> +
> for (c = 0; c < 256; c++) {
> if (dump_all && !strchr("elAFLRSXP", c))
> dump_opt[c] = 1;
> @@ -3325,7 +3510,7 @@ main(int argc, char **argv)
> flagbits['r'] = ZDB_FLAG_RAW;
>
> for (i = 0; i < argc; i++)
> - zdb_read_block(argv[i], spa);
> + zdb_read_block(argv[i], spa, dump_opt['Z']);
> }
>
> (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
>
>
More information about the freebsd-fs mailing list