svn commit: r268720 - in head: cddl/contrib/opensolaris/cmd/zpool sys/cddl/contrib/opensolaris/uts/common/fs/zfs

Xin LI delphij at FreeBSD.org
Tue Jul 15 22:44:05 UTC 2014


Author: delphij
Date: Tue Jul 15 22:44:04 2014
New Revision: 268720
URL: http://svnweb.freebsd.org/changeset/base/268720

Log:
  MFV r268714:
  
  Improve extreme rewind import.
  
  When doing an "extreme rewind" import ("zpool import -XF"), we attempt
  to verify all data in the pool, essentially scrubbing the entire pool.
  The problem is that spa_load_verify_cb() issues an unbounded number of
  concurrent scrub i/os.  This can lead to all of memory being used for
  these zio's, wedging the system. Like normal scrub, we need to put a
  cap on the number of outstanding i/os, and have the traverse thread
  block when we reach this cap.
  
  For this purpose the cap can be very large (10,000) to optimize the
  elevator algorithm.  Three kernel tunables have been added:
  
  	vfs.zfs.spa_load_verify_maxinflight
  	vfs.zfs.spa_load_verify_metadata
  	vfs.zfs.spa_load_verify_data
  
  The latter two tunables control whether metadata and/or user data are
  verified when doing extreme rewind.
  
  Make 'zpool import -T' imply scrub.
  
  Make zpool import -T <txg> accept hexadecimal values for the txg when
  prefixed with 0x.
  
  Skip txg's for which there is no uberblock when doing extreme rewind.
  
  Avoid reading all user data twice by skipping data prefetches when doing
  extreme rewinds, as the verification does not access the data via the ARC.
  
  Illumos issues:
    4970 need controls on i/o issued by zpool import -XF
    4971 zpool import -T should accept hex values
    4972 zpool import -T implies extreme rewind, and thus a scrub
    4973 spa_load_retry retries the same txg
    4974 spa_load_verify() reads all data twice
  
  MFC after:	2 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c	Tue Jul 15 22:34:54 2014	(r268719)
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c	Tue Jul 15 22:44:04 2014	(r268720)
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
  * Copyright (c) 2012 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
  * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
@@ -2033,7 +2033,7 @@ zpool_do_import(int argc, char **argv)
 			break;
 		case 'T':
 			errno = 0;
-			txg = strtoull(optarg, &endptr, 10);
+			txg = strtoull(optarg, &endptr, 0);
 			if (errno != 0 || *endptr != '\0') {
 				(void) fprintf(stderr,
 				    gettext("invalid txg value\n"));

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Tue Jul 15 22:34:54 2014	(r268719)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Tue Jul 15 22:44:04 2014	(r268720)
@@ -1873,6 +1873,7 @@ spa_load_verify_done(zio_t *zio)
 	spa_load_error_t *sle = zio->io_private;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 	int error = zio->io_error;
+	spa_t *spa = zio->io_spa;
 
 	if (error) {
 		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
@@ -1882,23 +1883,65 @@ spa_load_verify_done(zio_t *zio)
 			atomic_add_64(&sle->sle_data_count, 1);
 	}
 	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
 }
 
+/*
+ * Maximum number of concurrent scrub i/os to create while verifying
+ * a pool while importing it.
+ */
+int spa_load_verify_maxinflight = 10000;
+boolean_t spa_load_verify_metadata = B_TRUE;
+boolean_t spa_load_verify_data = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
+    &spa_load_verify_maxinflight, 0,
+    "Maximum number of concurrent scrub I/Os to create while verifying a "
+    "pool while importing it");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
+    &spa_load_verify_metadata, 0,
+    "Check metadata on import?");
+ 
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
+    &spa_load_verify_data, 0,
+    "Check user data on import?");
+ 
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
-	if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
-		zio_t *rio = arg;
-		size_t size = BP_GET_PSIZE(bp);
-		void *data = zio_data_buf_alloc(size);
-
-		zio_nowait(zio_read(rio, spa, bp, data, size,
-		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
-		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
-		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
-	}
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+	/*
+	 * Note: normally this routine will not be called if
+	 * spa_load_verify_metadata is not set.  However, it may be useful
+	 * to manually set the flag after the traversal has begun.
+	 */
+	if (!spa_load_verify_metadata)
+		return (0);
+	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+		return (0);
+
+	zio_t *rio = arg;
+	size_t size = BP_GET_PSIZE(bp);
+	void *data = zio_data_buf_alloc(size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+	spa->spa_scrub_inflight++;
+	mutex_exit(&spa->spa_scrub_lock);
+
+	zio_nowait(zio_read(rio, spa, bp, data, size,
+	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 	return (0);
 }
 
@@ -1909,7 +1952,7 @@ spa_load_verify(spa_t *spa)
 	spa_load_error_t sle = { 0 };
 	zpool_rewind_policy_t policy;
 	boolean_t verify_ok = B_FALSE;
-	int error;
+	int error = 0;
 
 	zpool_get_rewind_policy(spa->spa_config, &policy);
 
@@ -1919,8 +1962,11 @@ spa_load_verify(spa_t *spa)
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
-	error = traverse_pool(spa, spa->spa_verify_min_txg,
-	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+	if (spa_load_verify_metadata) {
+		error = traverse_pool(spa, spa->spa_verify_min_txg,
+		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+		    spa_load_verify_cb, rio);
+	}
 
 	(void) zio_wait(rio);
 
@@ -2795,7 +2841,7 @@ spa_load_retry(spa_t *spa, spa_load_stat
 	spa_unload(spa);
 	spa_deactivate(spa);
 
-	spa->spa_load_max_txg--;
+	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
 	spa_activate(spa, mode);
 	spa_async_suspend(spa);
@@ -2825,6 +2871,8 @@ spa_load_best(spa_t *spa, spa_load_state
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
+		if (max_request != UINT64_MAX)
+			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,


More information about the svn-src-head mailing list