svn commit: r353621 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys

Andriy Gapon avg at FreeBSD.org
Wed Oct 16 07:09:01 UTC 2019


Author: avg
Date: Wed Oct 16 07:09:00 2019
New Revision: 353621
URL: https://svnweb.freebsd.org/changeset/base/353621

Log:
  MFV r353619: 9691 fat zap should prefetch when iterating
  
  illumos/illumos-gate@52abb70e073c2a88808c0d66fd810ba8c5080572
  https://github.com/illumos/illumos-gate/commit/52abb70e073c2a88808c0d66fd810ba8c5080572
  
  https://www.illumos.org/issues/9691
    When iterating over a ZAP object, we're almost always certain to
    iterate over the entire object. If there are multiple leaf blocks, we
    can realize a performance win by issuing reads for all the leaf blocks
    in parallel when the iteration begins.
    For example, if we have 10,000 snapshots, "zfs destroy -nv
    pool/fs@1%9999" can take 30 minutes when the cache is cold. This
    change provides a >3x performance improvement, by issuing the reads
    for all ~64 blocks of each ZAP object in parallel.
  
  Author: Matthew Ahrens <mahrens at delphix.com>
  Obtained from:	illumos
  MFC after:	2 weeks

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c	Wed Oct 16 07:04:01 2019	(r353620)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt_zap.c	Wed Oct 16 07:09:00 2019	(r353621)
@@ -21,6 +21,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -113,7 +114,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_
 	zap_attribute_t za;
 	int error;
 
-	zap_cursor_init_serialized(&zc, os, object, *walk);
+	if (*walk == 0) {
+		/*
+		 * We don't want to prefetch the entire ZAP object, because
+		 * it can be enormous.  Also the primary use of DDT iteration
+		 * is for scrubbing, in which case we will be issuing many
+		 * scrub i/os for each ZAP block that we read in, so
+		 * reading the ZAP is unlikely to be the bottleneck.
+		 */
+		zap_cursor_init_noprefetch(&zc, os, object);
+	} else {
+		zap_cursor_init_serialized(&zc, os, object, *walk);
+	}
 	if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
 		uchar_t cbuf[sizeof (dde->dde_phys) + 1];
 		uint64_t csize = za.za_num_integers;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Wed Oct 16 07:04:01 2019	(r353620)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Wed Oct 16 07:09:00 2019	(r353621)
@@ -80,6 +80,13 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_per
  */
 int zfs_object_remap_one_indirect_delay_ticks = 0;
 
+/*
+ * Limit the amount we can prefetch with one call to this amount.  This
+ * helps to limit the amount of memory that can be used by prefetching.
+ * Larger objects should be prefetched a bit at a time.
+ */
+uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "unallocated"		},
 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "object directory"		},
@@ -641,6 +648,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t le
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
+
+	/*
+	 * See comment before the definition of dmu_prefetch_max.
+	 */
+	len = MIN(len, dmu_prefetch_max);
 
 	/*
 	 * XXX - Note, if the dnode for the requested object is not

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h	Wed Oct 16 07:04:01 2019	(r353620)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h	Wed Oct 16 07:09:00 2019	(r353621)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
@@ -349,6 +349,7 @@ typedef struct zap_cursor {
 	uint64_t zc_serialized;
 	uint64_t zc_hash;
 	uint32_t zc_cd;
+	boolean_t zc_prefetch;
 } zap_cursor_t;
 
 typedef struct {
@@ -375,6 +376,8 @@ typedef struct {
  * zapobj.  You must _fini the cursor when you are done with it.
  */
 void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
+void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
+    uint64_t zapobj);
 void zap_cursor_fini(zap_cursor_t *zc);
 
 /*

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	Wed Oct 16 07:04:01 2019	(r353620)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c	Wed Oct 16 07:09:00 2019	(r353621)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -49,6 +49,36 @@
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries.  There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS.  Typically they would iterate over everything, but we
+ * don't have control of that.  E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs.  The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall.  /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
+ *    get the first 100 and then wait for the user to hit "next page" (which
+ *    they may never do).
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ *    the zfs-specific implementation of the directory's st_size (which is
+ *    the number of entries).
+ */
+boolean_t zap_iterate_prefetch = B_TRUE;
+
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
 extern inline zap_phys_t *zap_f_phys(zap_t *zap);
@@ -1168,6 +1198,20 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap
 
 	/* retrieve the next entry at or after zc_hash/zc_cd */
 	/* if no entry, return ENOENT */
+
+	/*
+	 * If we are reading from the beginning, we're almost
+	 * certain to iterate over the entire ZAP object.  If there are
+	 * multiple leaf blocks (freeblk > 2), prefetch the whole
+	 * object, so that we read the leaf blocks concurrently.
+	 * (Unless noprefetch was requested via zap_cursor_init_noprefetch()).
+	 */
+	if (zc->zc_hash == 0 && zap_iterate_prefetch &&
+	    zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
+		dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+		    zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
+		    ZIO_PRIORITY_ASYNC_READ);
+	}
 
 	if (zc->zc_leaf &&
 	    (ZAP_HASH_IDX(zc->zc_hash,

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	Wed Oct 16 07:04:01 2019	(r353620)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c	Wed Oct 16 07:09:00 2019	(r353621)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
@@ -1394,9 +1394,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const
  * Routines for iterating over the attributes.
  */
 
-void
-zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
-    uint64_t serialized)
+static void
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized, boolean_t prefetch)
 {
 	zc->zc_objset = os;
 	zc->zc_zap = NULL;
@@ -1405,12 +1405,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t 
 	zc->zc_serialized = serialized;
 	zc->zc_hash = 0;
 	zc->zc_cd = 0;
+	zc->zc_prefetch = prefetch;
 }
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+    uint64_t serialized)
+{
+	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
+}
 
+/*
+ * Initialize a cursor at the beginning of the ZAP object.  The entire
+ * ZAP object will be prefetched.
+ */
 void
 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
-	zap_cursor_init_serialized(zc, os, zapobj, 0);
+	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
+void
+zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
 }
 
 void


More information about the svn-src-all mailing list