[PATCH] Illumos #883 ZIL reuse during remount can lead to data
corruption
Pawel Jakub Dawidek
pjd at FreeBSD.org
Wed Jul 27 09:09:23 UTC 2011
On Tue, Jul 26, 2011 at 02:31:32PM +0200, Martin Matuska wrote:
> Illumos developers have fixed a problem in ZIL that may lead to data
> corruption during remount.
>
> A detailed description of this problem is available at:
> https://www.illumos.org/issues/883
>
> In my opinion this issue applies to FreeBSD, too, and we should fix this
> ASAP.
> Please review my patch (1:1 merge from illumos, no changes).
>
> References:
> illumos-gate revision: 13380:161b964a0e10
> https://github.com/illumos/illumos-gate/commit/c9ba2a43cb76c223d115e021fdabd2c066e020ed
This looks like a very important fix. There were several reports of
people not being able to import their pools because of space map
corruption and it seems like this will fix the underlying problem.
I'd really like to see this committed and merged to stable/8.
> Index: cddl/contrib/opensolaris/cmd/ztest/ztest.c
> ===================================================================
> --- cddl/contrib/opensolaris/cmd/ztest/ztest.c (revision 224409)
> +++ cddl/contrib/opensolaris/cmd/ztest/ztest.c (working copy)
> @@ -205,6 +205,7 @@
> */
> typedef struct ztest_ds {
> objset_t *zd_os;
> + rwlock_t zd_zilog_lock;
> zilog_t *zd_zilog;
> uint64_t zd_seq;
> ztest_od_t *zd_od; /* debugging aid */
> @@ -238,6 +239,7 @@
> ztest_func_t ztest_zap;
> ztest_func_t ztest_zap_parallel;
> ztest_func_t ztest_zil_commit;
> +ztest_func_t ztest_zil_remount;
> ztest_func_t ztest_dmu_read_write_zcopy;
> ztest_func_t ztest_dmu_objset_create_destroy;
> ztest_func_t ztest_dmu_prealloc;
> @@ -273,6 +275,7 @@
> { ztest_zap_parallel, 100, &zopt_always },
> { ztest_split_pool, 1, &zopt_always },
> { ztest_zil_commit, 1, &zopt_incessant },
> + { ztest_zil_remount, 1, &zopt_sometimes },
> { ztest_dmu_read_write_zcopy, 1, &zopt_often },
> { ztest_dmu_objset_create_destroy, 1, &zopt_often },
> { ztest_dsl_prop_get_set, 1, &zopt_often },
> @@ -986,6 +989,7 @@
> zd->zd_seq = 0;
> dmu_objset_name(os, zd->zd_name);
>
> + VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
> VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
>
> for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
> @@ -1965,6 +1969,8 @@
> if (ztest_random(2) == 0)
> io_type = ZTEST_IO_WRITE_TAG;
>
> + (void) rw_rdlock(&zd->zd_zilog_lock);
> +
> switch (io_type) {
>
> case ZTEST_IO_WRITE_TAG:
> @@ -2000,6 +2006,8 @@
> break;
> }
>
> + (void) rw_unlock(&zd->zd_zilog_lock);
> +
> umem_free(data, blocksize);
> }
>
> @@ -2054,6 +2062,8 @@
> {
> zilog_t *zilog = zd->zd_zilog;
>
> + (void) rw_rdlock(&zd->zd_zilog_lock);
> +
> zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
>
> /*
> @@ -2065,9 +2075,34 @@
> ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
> zd->zd_seq = zilog->zl_commit_lr_seq;
> mutex_exit(&zilog->zl_lock);
> +
> + (void) rw_unlock(&zd->zd_zilog_lock);
> }
>
> /*
> + * This function is designed to simulate the operations that occur during a
> + * mount/unmount operation. We hold the dataset across these operations in an
> + * attempt to expose any implicit assumptions about ZIL management.
> + */
> +/* ARGSUSED */
> +void
> +ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
> +{
> + objset_t *os = zd->zd_os;
> +
> + (void) rw_wrlock(&zd->zd_zilog_lock);
> +
> + /* zfsvfs_teardown() */
> + zil_close(zd->zd_zilog);
> +
> + /* zfsvfs_setup() */
> + VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
> + zil_replay(os, zd, ztest_replay_vector);
> +
> + (void) rw_unlock(&zd->zd_zilog_lock);
> +}
> +
> +/*
> * Verify that we can't destroy an active pool, create an existing pool,
> * or create a pool with a bad vdev spec.
> */
> Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
> ===================================================================
> --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 224409)
> +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (working copy)
> @@ -20,6 +20,7 @@
> */
> /*
> * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
> + * Copyright (c) 2011 by Delphix. All rights reserved.
> */
>
> /* Portions Copyright 2010 Robert Milkowski */
> @@ -567,7 +568,7 @@
>
> if (!list_is_empty(&zilog->zl_lwb_list)) {
> ASSERT(zh->zh_claim_txg == 0);
> - ASSERT(!keep_first);
> + VERIFY(!keep_first);
> while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
> list_remove(&zilog->zl_lwb_list, lwb);
> if (lwb->lwb_buf != NULL)
> @@ -1668,20 +1669,9 @@
> void
> zil_free(zilog_t *zilog)
> {
> - lwb_t *head_lwb;
> -
> zilog->zl_stop_sync = 1;
>
> - /*
> - * After zil_close() there should only be one lwb with a buffer.
> - */
> - head_lwb = list_head(&zilog->zl_lwb_list);
> - if (head_lwb) {
> - ASSERT(head_lwb == list_tail(&zilog->zl_lwb_list));
> - list_remove(&zilog->zl_lwb_list, head_lwb);
> - zio_buf_free(head_lwb->lwb_buf, head_lwb->lwb_sz);
> - kmem_cache_free(zil_lwb_cache, head_lwb);
> - }
> + ASSERT(list_is_empty(&zilog->zl_lwb_list));
> list_destroy(&zilog->zl_lwb_list);
>
> avl_destroy(&zilog->zl_vdev_tree);
> @@ -1721,6 +1711,10 @@
> {
> zilog_t *zilog = dmu_objset_zil(os);
>
> + ASSERT(zilog->zl_clean_taskq == NULL);
> + ASSERT(zilog->zl_get_data == NULL);
> + ASSERT(list_is_empty(&zilog->zl_lwb_list));
> +
> zilog->zl_get_data = get_data;
> zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
> 2, 2, TASKQ_PREPOPULATE);
> @@ -1734,7 +1728,7 @@
> void
> zil_close(zilog_t *zilog)
> {
> - lwb_t *tail_lwb;
> + lwb_t *lwb;
> uint64_t txg = 0;
>
> zil_commit(zilog, 0); /* commit all itx */
> @@ -1746,9 +1740,9 @@
> * destroy the zl_clean_taskq.
> */
> mutex_enter(&zilog->zl_lock);
> - tail_lwb = list_tail(&zilog->zl_lwb_list);
> - if (tail_lwb != NULL)
> - txg = tail_lwb->lwb_max_txg;
> + lwb = list_tail(&zilog->zl_lwb_list);
> + if (lwb != NULL)
> + txg = lwb->lwb_max_txg;
> mutex_exit(&zilog->zl_lock);
> if (txg)
> txg_wait_synced(zilog->zl_dmu_pool, txg);
> @@ -1756,6 +1750,19 @@
> taskq_destroy(zilog->zl_clean_taskq);
> zilog->zl_clean_taskq = NULL;
> zilog->zl_get_data = NULL;
> +
> + /*
> + * We should have only one LWB left on the list; remove it now.
> + */
> + mutex_enter(&zilog->zl_lock);
> + lwb = list_head(&zilog->zl_lwb_list);
> + if (lwb != NULL) {
> + ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
> + list_remove(&zilog->zl_lwb_list, lwb);
> + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
> + kmem_cache_free(zil_lwb_cache, lwb);
> + }
> + mutex_exit(&zilog->zl_lock);
> }
>
> /*
--
Pawel Jakub Dawidek http://www.wheelsystems.com
FreeBSD committer http://www.FreeBSD.org
Am I Evil? Yes, I Am! http://yomoli.com
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 196 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/zfs-devel/attachments/20110727/9f645753/attachment.pgp
More information about the zfs-devel
mailing list