ZDB -Z?

Zaphod Beeblebrox zbeeble at gmail.com
Wed Dec 10 20:58:05 UTC 2014


I tried applying the patch to 10.1 and to -CURRENT (11) ... and I get:

[2:9:309]root at test-c1:/usr/src/cddl/contrib/opensolaris/cmd/zdb> patch <~dgilbert/zdb-z-patch
Hmm...  Looks like a unified diff to me...
The text leading up to this was:
--------------------------
|diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
|index c265c99..bf43ea1 100644
|--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
|+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
--------------------------
Patching file zdb.c using Plan A...
Hunk #1 succeeded at 59.
Hunk #2 succeeded at 3085 with fuzz 1 (offset 339 lines).
Hunk #3 succeeded at 3305 with fuzz 2 (offset 339 lines).
Hunk #4 succeeded at 3329 with fuzz 2 (offset 339 lines).
Hunk #5 failed at 3408.
Hunk #6 failed at 3644.
Hunk #7 failed at 3659.
Hunk #8 failed at 3718.
Hunk #9 failed at 3849.
5 out of 9 hunks failed--saving rejects to zdb.c.rej
done

... what version of FreeBSD is this patch against?

On Tue, Dec 9, 2014 at 7:23 PM, Andrew Heybey <ath at niksun.com> wrote:

> On 11/24/14 1:49 PM, Zaphod Beeblebrox wrote:
> > I'm reading about someone else's recovery of files from a damaged ZFS
> > partition.  He claims to have added (possibly to opensolaris or whatnot) an
> > argument to zdb '-Z' ... which operates somewhat like -R, but which
> > highlights what parts of the region are on what physical disks, and which
> > are parity.
> >
> > Has anyone patched this into FreeBSD?
>
> Sorry for the late reply, I am behind on my mailing list reading.
>
> I assume you were looking at this post:
>
> http://mbruning.blogspot.com/2009_12_01_archive.html
>
> I was also recently trying to recover data in a ZFS pool.  I made an ugly
> attempt at -Z for zdb.  It will not work for anything but RAIDZ pools (I
> tried it on one containing two 6-disk raidz1 vdevs).  The diff (against
> FreeBSD 10) is in this email.
>
> I copy-pasted vdev_raidz_map() out of the ZFS vdev_raidz code since it is a
> static function and not callable externally.  Not very tasteful, but it
> worked for me.
>
> andrew
>
> commit 86ab9e2dab7e76dcdf527d2aa6b84a2fe429ee28
> Author: Andrew Heybey <ath at niksun.com>
> Date:   Tue Nov 18 15:00:57 2014 -0500
>
>     zdb: Add -Z flag like http://mbruning.blogspot.com/2009/12/zfs-raidz-data-walk.html
>
> diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
> index c265c99..bf43ea1 100644
> --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
> +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
> @@ -59,6 +59,7 @@
>  #include <sys/ddt.h>
>  #include <sys/zfeature.h>
>  #include <zfs_comutil.h>
> +#include <sys/vdev_raidz.h>
>  #undef ZFS_MAXNAMELEN
>  #undef verify
>  #include <libzfs.h>
> @@ -2745,6 +2746,168 @@ zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
>         }
>  }
>
> +
> +typedef struct raidz_col {
> +       uint64_t rc_devidx;             /* child device index for I/O */
> +       uint64_t rc_offset;             /* device offset */
> +       uint64_t rc_size;               /* I/O size */
> +       void *rc_data;                  /* I/O data */
> +       void *rc_gdata;                 /* used to store the "good" version */
> +       int rc_error;                   /* I/O error for this device */
> +       uint8_t rc_tried;               /* Did we attempt this I/O column? */
> +       uint8_t rc_skipped;             /* Did we skip this I/O column? */
> +} raidz_col_t;
> +
> +typedef struct raidz_map {
> +       uint64_t rm_cols;               /* Regular column count */
> +       uint64_t rm_scols;              /* Count including skipped columns */
> +       uint64_t rm_bigcols;            /* Number of oversized columns */
> +       uint64_t rm_asize;              /* Actual total I/O size */
> +       uint64_t rm_missingdata;        /* Count of missing data devices */
> +       uint64_t rm_missingparity;      /* Count of missing parity devices */
> +       uint64_t rm_firstdatacol;       /* First data column/parity count */
> +       uint64_t rm_nskip;              /* Skipped sectors for padding */
> +       uint64_t rm_skipstart;          /* Column index of padding start */
> +       void *rm_datacopy;              /* rm_asize-buffer of copied data */
> +       uintptr_t rm_reports;           /* # of referencing checksum reports */
> +       uint8_t rm_freed;               /* map no longer has referencing ZIO */
> +       uint8_t rm_ecksuminjected;      /* checksum error was injected */
> +       raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
> +} raidz_map_t;
> +
> +/*
> + * Divides the IO evenly across all child vdevs; usually, dcols is
> + * the number of children in the target vdev.
> + *
> + * copy-pasted from vdev_raidz in the ZFS sources
> + */
> +raidz_map_t*
> +vdev_raidz_map(uint64_t size, uint64_t offset, uint64_t unit_shift,
> +              uint64_t dcols, uint64_t nparity)
> +{
> +       raidz_map_t* rm;
> +       /* The starting RAIDZ (parent) vdev sector of the block. */
> +       uint64_t b = offset >> unit_shift;
> +       /* The zio's size in units of the vdev's minimum sector size. */
> +       uint64_t s = size >> unit_shift;
> +       /* The first column for this stripe. */
> +       uint64_t f = b % dcols;
> +       /* The starting byte offset on each child vdev. */
> +       uint64_t o = (b / dcols) << unit_shift;
> +       uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
> +
> +       /*
> +        * "Quotient": The number of data sectors for this stripe on all but
> +        * the "big column" child vdevs that also contain "remainder" data.
> +        */
> +       q = s / (dcols - nparity);
> +
> +       /*
> +        * "Remainder": The number of partial stripe data sectors in this I/O.
> +        * This will add a sector to some, but not all, child vdevs.
> +        */
> +       r = s - q * (dcols - nparity);
> +
> +       /* The number of "big columns" - those which contain remainder data. */
> +       bc = (r == 0 ? 0 : r + nparity);
> +
> +       /*
> +        * The total number of data and parity sectors associated with
> +        * this I/O.
> +        */
> +       tot = s + nparity * (q + (r == 0 ? 0 : 1));
> +
> +       /* acols: The columns that will be accessed. */
> +       /* scols: The columns that will be accessed or skipped. */
> +       if (q == 0) {
> +               /* Our I/O request doesn't span all child vdevs. */
> +               acols = bc;
> +               scols = MIN(dcols, roundup(bc, nparity + 1));
> +       } else {
> +               acols = dcols;
> +               scols = dcols;
> +       }
> +
> +       rm = umem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
> +
> +       rm->rm_cols = acols;
> +       rm->rm_scols = scols;
> +       rm->rm_bigcols = bc;
> +       rm->rm_skipstart = bc;
> +       rm->rm_missingdata = 0;
> +       rm->rm_missingparity = 0;
> +       rm->rm_firstdatacol = nparity;
> +       rm->rm_datacopy = NULL;
> +       rm->rm_reports = 0;
> +       rm->rm_freed = 0;
> +       rm->rm_ecksuminjected = 0;
> +
> +       asize = 0;
> +
> +       for (c = 0; c < scols; c++) {
> +               col = f + c;
> +               coff = o;
> +               if (col >= dcols) {
> +                       col -= dcols;
> +                       coff += 1ULL << unit_shift;
> +               }
> +               rm->rm_col[c].rc_devidx = col;
> +               rm->rm_col[c].rc_offset = coff;
> +               rm->rm_col[c].rc_data = NULL;
> +               rm->rm_col[c].rc_gdata = NULL;
> +               rm->rm_col[c].rc_error = 0;
> +               rm->rm_col[c].rc_tried = 0;
> +               rm->rm_col[c].rc_skipped = 0;
> +
> +               if (c >= acols)
> +                       rm->rm_col[c].rc_size = 0;
> +               else if (c < bc)
> +                       rm->rm_col[c].rc_size = (q + 1) << unit_shift;
> +               else
> +                       rm->rm_col[c].rc_size = q << unit_shift;
> +
> +               asize += rm->rm_col[c].rc_size;
> +       }
> +
> +       rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
> +       rm->rm_nskip = roundup(tot, nparity + 1) - tot;
> +
> +       /*
> +        * If all data stored spans all columns, there's a danger that parity
> +        * will always be on the same device and, since parity isn't read
> +        * during normal operation, that that device's I/O bandwidth won't be
> +        * used effectively. We therefore switch the parity every 1MB.
> +        *
> +        * ... at least that was, ostensibly, the theory. As a practical
> +        * matter unless we juggle the parity between all devices evenly, we
> +        * won't see any benefit. Further, occasional writes that aren't a
> +        * multiple of the LCM of the number of children and the minimum
> +        * stripe width are sufficient to avoid pessimal behavior.
> +        * Unfortunately, this decision created an implicit on-disk format
> +        * requirement that we need to support for all eternity, but only
> +        * for single-parity RAID-Z.
> +        *
> +        * If we intend to skip a sector in the zeroth column for padding
> +        * we must make sure to note this swap. We will never intend to
> +        * skip the first column since at least one data and one parity
> +        * column must appear in each row.
> +        */
> +       if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
> +               devidx = rm->rm_col[0].rc_devidx;
> +               o = rm->rm_col[0].rc_offset;
> +               rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
> +               rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
> +               rm->rm_col[1].rc_devidx = devidx;
> +               rm->rm_col[1].rc_offset = o;
> +
> +               if (rm->rm_skipstart == 0)
> +                       rm->rm_skipstart = 1;
> +       }
> +
> +       return (rm);
> +}
> +
> +
>  /*
>   * There are two acceptable formats:
>   *     leaf_name         - For example: c1t0d0 or /tmp/ztest.0a
> @@ -2803,8 +2966,10 @@ name:
>  }
>
>  /*
> - * Read a block from a pool and print it out.  The syntax of the
> - * block descriptor is:
> + * Read a block from a pool and print it out, or (if Zflag is true)
> + * print out where the block is found on the constituents of the vdev.
> + *
> + * The syntax of the block descriptor is:
>   *
>   *     pool:vdev_specifier:offset:size[:flags]
>   *
> @@ -2825,7 +2990,7 @@ name:
>   *              * = not yet implemented
>   */
>  static void
> -zdb_read_block(char *thing, spa_t *spa)
> +zdb_read_block(char *thing, spa_t *spa, boolean_t Zflag)
>  {
>         blkptr_t blk, *bp = &blk;
>         dva_t *dva = bp->blk_dva;
> @@ -2904,6 +3069,22 @@ zdb_read_block(char *thing, spa_t *spa)
>         psize = size;
>         lsize = size;
>
> +       if (Zflag) {
> +               raidz_map_t* rm;
> +               rm = vdev_raidz_map(psize, offset, vd->vdev_ashift,
> +                                   vd->vdev_children, vd->vdev_nparity);
> +               (void) printf("columns %lu bigcols %lu asize %lu firstdatacol %lu\n",
> +                             rm->rm_cols, rm->rm_bigcols, rm->rm_asize,
> +                             rm->rm_firstdatacol);
> +               for (int c = 0; c < rm->rm_scols; ++c) {
> +                       raidz_col_t* rc = &rm->rm_col[c];
> +                       (void) printf("devidx %lu offset 0x%lx size 0x%lx\n",
> +                                     rc->rc_devidx, rc->rc_offset, rc->rc_size);
> +               }
> +               umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
> +               return;
> +       }
> +
>         pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
>         lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
>
> @@ -3124,7 +3305,7 @@ main(int argc, char **argv)
>
>         dprintf_setup(&argc, argv);
>
> -       while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) {
> +       while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:PZ")) != -1) {
>                 switch (c) {
>                 case 'b':
>                 case 'c':
> @@ -3139,6 +3320,7 @@ main(int argc, char **argv)
>                 case 'D':
>                 case 'R':
>                 case 'S':
> +               case 'Z':
>                         dump_opt[c]++;
>                         dump_all = 0;
>                         break;
> @@ -3197,6 +3379,9 @@ main(int argc, char **argv)
>         if (dump_all)
>                 verbose = MAX(verbose, 1);
>
> +       if (dump_opt['Z'])
> +           dump_opt['R'] = 1;
> +
>         for (c = 0; c < 256; c++) {
>                 if (dump_all && !strchr("elAFLRSXP", c))
>                         dump_opt[c] = 1;
> @@ -3325,7 +3510,7 @@ main(int argc, char **argv)
>                 flagbits['r'] = ZDB_FLAG_RAW;
>
>                 for (i = 0; i < argc; i++)
> -                       zdb_read_block(argv[i], spa);
> +                       zdb_read_block(argv[i], spa, dump_opt['Z']);
>         }
>
>         (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
>
>
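
For anyone trying to follow what the copied vdev_raidz_map() actually
computes, here is a minimal standalone sketch (not part of the patch above)
of the same quotient/remainder arithmetic.  The geometry (6-disk raidz1,
ashift=9) and the offset/size values are made-up examples, and the 1MB
parity rotation that the real code applies to single-parity raidz is
omitted:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int
main(void)
{
        uint64_t dcols = 6;          /* children in the raidz vdev (example) */
        uint64_t nparity = 1;        /* raidz1 (example) */
        uint64_t unit_shift = 9;     /* vdev ashift (example) */
        uint64_t offset = 0x400000;  /* DVA offset within the vdev (example) */
        uint64_t size = 0x20000;     /* allocated size of the block (example) */

        uint64_t b = offset >> unit_shift;      /* starting parent sector */
        uint64_t s = size >> unit_shift;        /* I/O size in sectors */
        uint64_t f = b % dcols;                 /* first column of the stripe */
        uint64_t o = (b / dcols) << unit_shift; /* starting offset on each child */

        uint64_t q = s / (dcols - nparity);       /* data sectors per small column */
        uint64_t r = s - q * (dcols - nparity);   /* leftover data sectors */
        uint64_t bc = (r == 0 ? 0 : r + nparity); /* number of "big" columns */
        uint64_t acols = (q == 0 ? bc : dcols);   /* columns actually accessed */

        for (uint64_t c = 0; c < acols; c++) {
                uint64_t col = f + c;
                uint64_t coff = o;
                uint64_t csize = (c < bc ? q + 1 : q) << unit_shift;

                if (col >= dcols) {              /* wrap around to the next row */
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                printf("col %" PRIu64 ": devidx %" PRIu64
                    " offset 0x%" PRIx64 " size 0x%" PRIx64 "%s\n",
                    c, col, coff, csize,
                    c < nparity ? " (parity)" : " (data)");
        }
        return (0);
}

With those example values (a 128K block at DVA offset 4M on a 6-wide raidz1
with 512-byte sectors) it should print one line per column, the first being
parity, which is essentially the same table the -Z code above prints from
the real raidz_map_t.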

