svn commit: r318946 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/di...

Andriy Gapon avg at FreeBSD.org
Fri May 26 12:13:30 UTC 2017


Author: avg
Date: Fri May 26 12:13:27 2017
New Revision: 318946
URL: https://svnweb.freebsd.org/changeset/base/318946

Log:
  8021 ARC buf data scatter-ization
  8100 8021 seems to cause random BAD TRAP: type=d (#gp General protection)
  
  illumos/illumos-gate at 770499e185d15678ccb0be57ebc626ad18d93383
  https://github.com/illumos/illumos-gate/commit/770499e185d15678ccb0be57ebc626ad18d93383
  
  https://www.illumos.org/issues/8021
    The ARC buf data project (known simply as "ABD" since its genesis in the ZoL
    community) changes the way the ARC allocates `b_pdata` memory from using linear
    `void *` buffers to using scatter/gather lists of fixed-size 1KB chunks. This
    improves ZFS's performance by helping to defragment the address space occupied
    by the ARC, in particular for cases where compressed ARC is enabled. It could
    also ease future work to allocate pages directly from `segkpm` for
    minimal-overhead memory allocations, bypassing the `kmem` subsystem.
    This is essentially the same change as the one which recently landed in ZFS on
    Linux, although they made some platform-specific changes while adapting this
    work to their codebase:
    1. Implemented the equivalent of the `segkpm` suggestion for future work
    mentioned above to bypass issues that they've had with the Linux kernel memory
    allocator.
    2. Changed the internal representation of the ABD's scatter/gather list so it
    could be used to pass I/O directly into Linux block device drivers. (This
    feature is not available in the illumos block device interface yet.)
  
  https://www.illumos.org/issues/8100
    My Supermicro system is getting random BAD TRAP: type=d (#gp General
    protection) panics at about the stage where ZFS filesystems are mounted -
    usually the console login prompt is already present but the services are
    still starting.
    After backing out 8021, the boot completes and no panics occur.
    The machine does dump; however, savecore fails:
    savecore: bad magic number baddcafe
    I can get more data out with boot -k, if needed.
    # psrinfo -vp
    The physical processor has 4 cores and 8 virtual processors (0-7)
      The core has 2 virtual processors (0 4)
      The core has 2 virtual processors (1 5)
      The core has 2 virtual processors (2 6)
      The core has 2 virtual processors (3 7)
        x86 (GenuineIntel 306C3 family 6 model 60 step 3 clock 3500 MHz)
          Intel(r) Xeon(r) CPU E3-1246 v3 @ 3.50GHz
  
    # prtconf -m
    32657
  
    $ zpool status
      pool: rpool
     state: ONLINE
      scan: none requested
    config:
  
            NAME        STATE     READ WRITE CKSUM
            rpool       ONLINE       0     0     0
              raidz1-0  ONLINE       0     0     0
                c3t0d0  ONLINE       0     0     0
                c3t1d0  ONLINE       0     0     0
  
  Reviewed by: Matthew Ahrens mahrens at delphix.com
  Reviewed by: George Wilson george.wilson at delphix.com
  Reviewed by: Paul Dagnelie pcd at delphix.com
  Reviewed by: John Kennedy john.kennedy at delphix.com
  Reviewed by: Prakash Surya prakash.surya at delphix.com
  Reviewed by: Prashanth Sreenivasa pks at delphix.com
  Reviewed by: Pavel Zakharov pavel.zakharov at delphix.com
  Reviewed by: Chris Williamson chris.williamson at delphix.com
  Approved by: Richard Lowe <richlowe at richlowe.net>
  Author: Dan Kimmel <dan.kimmel at delphix.com>

Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/zdb/zdb_il.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/lib/libzfs/common/libzfs_sendrecv.c

Changes in other areas also in this revision:
Added:
  vendor-sys/illumos/dist/uts/common/fs/zfs/abd.c   (contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/abd.h   (contents, props changed)
Modified:
  vendor-sys/illumos/dist/common/zfs/zfs_fletcher.c
  vendor-sys/illumos/dist/common/zfs/zfs_fletcher.h
  vendor-sys/illumos/dist/uts/common/Makefile.files
  vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/blkptr.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/edonr_zfs.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/lz4.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sha256.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/skein_zfs.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/ddt.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_checksum.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_compress.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_cache.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio_checksum.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio_compress.c

Modified: vendor/illumos/dist/cmd/zdb/zdb.c
==============================================================================
--- vendor/illumos/dist/cmd/zdb/zdb.c	Fri May 26 12:08:38 2017	(r318945)
+++ vendor/illumos/dist/cmd/zdb/zdb.c	Fri May 26 12:13:27 2017	(r318946)
@@ -60,6 +60,7 @@
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include <sys/zfeature.h>
+#include <sys/abd.h>
 #include <zfs_comutil.h>
 #undef verify
 #include <libzfs.h>
@@ -2535,7 +2536,7 @@ zdb_blkptr_done(zio_t *zio)
 	zdb_cb_t *zcb = zio->io_private;
 	zbookmark_phys_t *zb = &zio->io_bookmark;
 
-	zio_data_buf_free(zio->io_data, zio->io_size);
+	abd_free(zio->io_abd);
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
@@ -2601,7 +2602,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog
 	if (!BP_IS_EMBEDDED(bp) &&
 	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
 		size_t size = BP_GET_PSIZE(bp);
-		void *data = zio_data_buf_alloc(size);
+		abd_t *abd = abd_alloc(size, B_FALSE);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
@@ -2614,7 +2615,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog
 		spa->spa_scrub_inflight++;
 		mutex_exit(&spa->spa_scrub_lock);
 
-		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		zio_nowait(zio_read(NULL, spa, bp, abd, size,
 		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
@@ -3394,6 +3395,13 @@ name:
 	return (NULL);
 }
 
+/* ARGSUSED */
+static int
+random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
+{
+	return (random_get_pseudo_bytes(buf, len));
+}
+
 /*
  * Read a block from a pool and print it out.  The syntax of the
  * block descriptor is:
@@ -3425,7 +3433,8 @@ zdb_read_block(char *thing, spa_t *spa)
 	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
 	zio_t *zio;
 	vdev_t *vd;
-	void *pbuf, *lbuf, *buf;
+	abd_t *pabd;
+	void *lbuf, *buf;
 	char *s, *p, *dup, *vdev, *flagstr;
 	int i, error;
 
@@ -3496,7 +3505,7 @@ zdb_read_block(char *thing, spa_t *spa)
 	psize = size;
 	lsize = size;
 
-	pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
 	BP_ZERO(bp);
@@ -3524,15 +3533,15 @@ zdb_read_block(char *thing, spa_t *spa)
 		/*
 		 * Treat this as a normal block read.
 		 */
-		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
 	} else {
 		/*
 		 * Treat this as a vdev child I/O.
 		 */
-		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
-		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@@ -3555,21 +3564,21 @@ zdb_read_block(char *thing, spa_t *spa)
 		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
-		bcopy(pbuf, pbuf2, psize);
+		abd_copy_to_buf(pbuf2, pabd, psize);
 
-		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
-		    SPA_MAXBLOCKSIZE - psize) == 0);
+		VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
+		    random_get_pseudo_bytes_cb, NULL));
 
-		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
-		    SPA_MAXBLOCKSIZE - psize) == 0);
+		VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+		    SPA_MAXBLOCKSIZE - psize));
 
 		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
 		    lsize -= SPA_MINBLOCKSIZE) {
 			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
-				if (zio_decompress_data(c, pbuf, lbuf,
-				    psize, lsize) == 0 &&
-				    zio_decompress_data(c, pbuf2, lbuf2,
-				    psize, lsize) == 0 &&
+				if (zio_decompress_data(c, pabd,
+				    lbuf, psize, lsize) == 0 &&
+				    zio_decompress_data_buf(c, pbuf2,
+				    lbuf2, psize, lsize) == 0 &&
 				    bcmp(lbuf, lbuf2, lsize) == 0)
 					break;
 			}
@@ -3588,7 +3597,7 @@ zdb_read_block(char *thing, spa_t *spa)
 		buf = lbuf;
 		size = lsize;
 	} else {
-		buf = pbuf;
+		buf = abd_to_buf(pabd);
 		size = psize;
 	}
 
@@ -3606,7 +3615,7 @@ zdb_read_block(char *thing, spa_t *spa)
 		zdb_dump_block(thing, buf, size, flags);
 
 out:
-	umem_free(pbuf, SPA_MAXBLOCKSIZE);
+	abd_free(pabd);
 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
 	free(dup);
 }

Modified: vendor/illumos/dist/cmd/zdb/zdb_il.c
==============================================================================
--- vendor/illumos/dist/cmd/zdb/zdb_il.c	Fri May 26 12:08:38 2017	(r318945)
+++ vendor/illumos/dist/cmd/zdb/zdb_il.c	Fri May 26 12:13:27 2017	(r318946)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  */
 
 /*
@@ -41,6 +41,7 @@
 #include <sys/resource.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
+#include <sys/abd.h>
 
 extern uint8_t dump_opt[256];
 
@@ -117,13 +118,27 @@ zil_prt_rec_rename(zilog_t *zilog, int t
 }
 
 /* ARGSUSED */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+	char *cdata = data;
+	for (int i = 0; i < len; i++) {
+		if (isprint(*cdata))
+			(void) printf("%c ", *cdata);
+		else
+			(void) printf("%2X", *cdata);
+		cdata++;
+	}
+	return (0);
+}
+
+/* ARGSUSED */
 static void
 zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
 {
-	char *data, *dlimit;
+	abd_t *data;
 	blkptr_t *bp = &lr->lr_blkptr;
 	zbookmark_phys_t zb;
-	char buf[SPA_MAXBLOCKSIZE];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int error;
 
@@ -144,7 +159,6 @@ zil_prt_rec_write(zilog_t *zilog, int tx
 		if (BP_IS_HOLE(bp)) {
 			(void) printf("\t\t\tLSIZE 0x%llx\n",
 			    (u_longlong_t)BP_GET_LSIZE(bp));
-			bzero(buf, sizeof (buf));
 			(void) printf("%s<hole>\n", prefix);
 			return;
 		}
@@ -157,28 +171,26 @@ zil_prt_rec_write(zilog_t *zilog, int tx
 		    lr->lr_foid, ZB_ZIL_LEVEL,
 		    lr->lr_offset / BP_GET_LSIZE(bp));
 
+		data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
 		error = zio_wait(zio_read(NULL, zilog->zl_spa,
-		    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+		    bp, data, BP_GET_LSIZE(bp), NULL, NULL,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 		if (error)
-			return;
-		data = buf;
+			goto out;
 	} else {
-		data = (char *)(lr + 1);
+		/* data is stored after the end of the lr_write record */
+		data = abd_alloc(lr->lr_length, B_FALSE);
+		abd_copy_from_buf(data, lr + 1, lr->lr_length);
 	}
 
-	dlimit = data + MIN(lr->lr_length,
-	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
-
 	(void) printf("%s", prefix);
-	while (data < dlimit) {
-		if (isprint(*data))
-			(void) printf("%c ", *data);
-		else
-			(void) printf("%2X", *data);
-		data++;
-	}
+	(void) abd_iterate_func(data,
+	    0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+	    zil_prt_rec_write_cb, NULL);
 	(void) printf("\n");
+
+out:
+	abd_free(data);
 }
 
 /* ARGSUSED */

Modified: vendor/illumos/dist/cmd/ztest/ztest.c
==============================================================================
--- vendor/illumos/dist/cmd/ztest/ztest.c	Fri May 26 12:08:38 2017	(r318945)
+++ vendor/illumos/dist/cmd/ztest/ztest.c	Fri May 26 12:13:27 2017	(r318946)
@@ -111,6 +111,7 @@
 #include <sys/refcount.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_userhold.h>
+#include <sys/abd.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
@@ -188,6 +189,7 @@ extern uint64_t metaslab_df_alloc_thresh
 extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
+extern boolean_t zfs_abd_scatter_enabled;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -5048,7 +5050,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	enum zio_checksum checksum = spa_dedup_checksum(spa);
 	dmu_buf_t *db;
 	dmu_tx_t *tx;
-	void *buf;
+	abd_t *abd;
 	blkptr_t blk;
 	int copies = 2 * ZIO_DEDUPDITTO_MIN;
 
@@ -5128,14 +5130,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	 * Damage the block.  Dedup-ditto will save us when we read it later.
 	 */
 	psize = BP_GET_PSIZE(&blk);
-	buf = zio_buf_alloc(psize);
-	ztest_pattern_set(buf, psize, ~pattern);
+	abd = abd_alloc_linear(psize, B_TRUE);
+	ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
 
 	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
-	    buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+	    abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
 
-	zio_buf_free(buf, psize);
+	abd_free(abd);
 
 	(void) rw_unlock(&ztest_name_lock);
 }
@@ -5418,6 +5420,12 @@ ztest_resume_thread(void *arg)
 		 */
 		if (ztest_random(10) == 0)
 			zfs_compressed_arc_enabled = ztest_random(2);
+
+		/*
+		 * Periodically change the zfs_abd_scatter_enabled setting.
+		 */
+		if (ztest_random(10) == 0)
+			zfs_abd_scatter_enabled = ztest_random(2);
 	}
 	return (NULL);
 }

Modified: vendor/illumos/dist/lib/libzfs/common/libzfs_sendrecv.c
==============================================================================
--- vendor/illumos/dist/lib/libzfs/common/libzfs_sendrecv.c	Fri May 26 12:08:38 2017	(r318945)
+++ vendor/illumos/dist/lib/libzfs/common/libzfs_sendrecv.c	Fri May 26 12:13:27 2017	(r318946)
@@ -192,19 +192,19 @@ dump_record(dmu_replay_record_t *drr, vo
 {
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
-	fletcher_4_incremental_native(drr,
+	(void) fletcher_4_incremental_native(drr,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
 	if (drr->drr_type != DRR_BEGIN) {
 		ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
 		    drr_checksum.drr_checksum));
 		drr->drr_u.drr_checksum.drr_checksum = *zc;
 	}
-	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
-	    sizeof (zio_cksum_t), zc);
+	(void) fletcher_4_incremental_native(
+	    &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc);
 	if (write(outfd, drr, sizeof (*drr)) == -1)
 		return (errno);
 	if (payload_len != 0) {
-		fletcher_4_incremental_native(payload, payload_len, zc);
+		(void) fletcher_4_incremental_native(payload, payload_len, zc);
 		if (write(outfd, payload, payload_len) == -1)
 			return (errno);
 	}
@@ -2093,9 +2093,9 @@ recv_read(libzfs_handle_t *hdl, int fd, 
 
 	if (zc) {
 		if (byteswap)
-			fletcher_4_incremental_byteswap(buf, ilen, zc);
+			(void) fletcher_4_incremental_byteswap(buf, ilen, zc);
 		else
-			fletcher_4_incremental_native(buf, ilen, zc);
+			(void) fletcher_4_incremental_native(buf, ilen, zc);
 	}
 	return (0);
 }
@@ -3649,7 +3649,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, c
 		 * recv_read() above; do it again correctly.
 		 */
 		bzero(&zcksum, sizeof (zio_cksum_t));
-		fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
+		(void) fletcher_4_incremental_byteswap(&drr,
+		    sizeof (drr), &zcksum);
 		flags->byteswap = B_TRUE;
 
 		drr.drr_type = BSWAP_32(drr.drr_type);


More information about the svn-src-all mailing list