svn commit: r274337 - in head: cddl/contrib/opensolaris/cmd/zdb cddl/contrib/opensolaris/cmd/zfs cddl/contrib/opensolaris/cmd/zpool cddl/contrib/opensolaris/cmd/zstreamdump cddl/contrib/opensolaris...

Xin LI delphij at FreeBSD.org
Mon Nov 10 08:20:29 UTC 2014


Author: delphij
Date: Mon Nov 10 08:20:21 2014
New Revision: 274337
URL: https://svnweb.freebsd.org/changeset/base/274337

Log:
  MFV r274273:
  
  ZFS large block support.
  
  Please note that booting from datasets that have recordsize greater
  than 128KB is not supported (but it's Okay to enable the feature on
  the pool).  This *may* remain unchanged because of memory constraint.
  
  Limited safety belt is provided for mounted root filesystem but use
  caution is advised.
  
  Illumos issue:
      5027 zfs large block support
  
  MFC after:	1 month

Modified:
  head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
  head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
  head/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
  head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
  head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
  head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
  head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
  head/sys/boot/zfs/zfsimpl.c
  head/sys/cddl/boot/zfs/zfsimpl.h
  head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
  head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
  head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
  head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
  head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/cddl/contrib/opensolaris/cmd/zfs/   (props changed)
  head/cddl/contrib/opensolaris/lib/libzfs/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -2147,6 +2147,8 @@ dump_label(const char *dev)
 	(void) close(fd);
 }
 
+static uint64_t num_large_blocks;
+
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
@@ -2159,6 +2161,8 @@ dump_one_dir(const char *dsname, void *a
 		(void) printf("Could not open %s, error %d\n", dsname, error);
 		return (0);
 	}
+	if (dmu_objset_ds(os)->ds_large_blocks)
+		num_large_blocks++;
 	dump_dir(os);
 	dmu_objset_disown(os, FTAG);
 	fuid_table_destroy();
@@ -2169,7 +2173,7 @@ dump_one_dir(const char *dsname, void *a
 /*
  * Block statistics.
  */
-#define	PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
+#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
 	uint64_t zb_asize;
 	uint64_t zb_lsize;
@@ -2234,7 +2238,15 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_count++;
-		zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
+
+		/*
+		 * The histogram is only big enough to record blocks up to
+		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+		 * "other", bucket.
+		 */
+		int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+		zb->zb_psize_histogram[idx]++;
 
 		zb->zb_gangs += BP_COUNT_GANG(bp);
 
@@ -2946,6 +2958,7 @@ dump_zpool(spa_t *spa)
 		dump_metaslab_groups(spa);
 
 	if (dump_opt['d'] || dump_opt['i']) {
+		uint64_t refcount;
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
 			dump_bpobj(&spa->spa_deferred_bpobj,
@@ -2965,8 +2978,21 @@ dump_zpool(spa_t *spa)
 		}
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+		(void) feature_get_refcount(spa,
+		    &spa_feature_table[SPA_FEATURE_LARGE_BLOCKS], &refcount);
+		if (num_large_blocks != refcount) {
+			(void) printf("large_blocks feature refcount mismatch: "
+			    "expected %lld != actual %lld\n",
+			    (longlong_t)num_large_blocks,
+			    (longlong_t)refcount);
+			rc = 2;
+		} else {
+			(void) printf("Verified large_blocks feature refcount "
+			    "is correct (%llu)\n", (longlong_t)refcount);
+		}
 	}
-	if (dump_opt['b'] || dump_opt['c'])
+	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
 	if (rc == 0)

Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs.8	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs.8	Mon Nov 10 08:20:21 2014	(r274337)
@@ -30,7 +30,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 30, 2014
+.Dd November 10, 2014
 .Dt ZFS 8
 .Os
 .Sh NAME
@@ -179,12 +179,12 @@
 .Ar bookmark
 .Nm
 .Cm send
-.Op Fl DnPpRve
+.Op Fl DnPpRveL
 .Op Fl i Ar snapshot | Fl I Ar snapshot
 .Ar snapshot
 .Nm
 .Cm send
-.Op Fl e
+.Op Fl eL
 .Op Fl i Ar snapshot Ns | Ns bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Nm
@@ -1187,6 +1187,12 @@ systems is strongly discouraged, and may
 .Pp
 The size specified must be a power of two greater than or equal to 512 and less
 than or equal to 128 Kbytes.
+If the
+.Sy large_blocks
+feature is enabled on the pool, the size may be up to 1 Mbyte.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags.
 .Pp
 Changing the file system's
 .Sy recordsize
@@ -2477,7 +2483,7 @@ feature.
 .It Xo
 .Nm
 .Cm send
-.Op Fl DnPpRve
+.Op Fl DnPpRveL
 .Op Fl i Ar snapshot | Fl I Ar snapshot
 .Ar snapshot
 .Xc
@@ -2549,6 +2555,22 @@ be used regardless of the dataset's
 property, but performance will be much better if the filesystem uses a
 dedup-capable checksum (eg.
 .Sy sha256 ) .
+.It Fl L
+Generate a stream which may contain blocks larger than 128KB.
+This flag
+has no effect if the
+.Sy large_blocks
+pool feature is disabled, or if the
+.Sy recordsize
+property of this filesystem has never been set above 128KB.
+The receiving system must have the
+.Sy large_blocks
+pool feature enabled as well.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags and the
+.Sy large_blocks
+feature.
 .It Fl e
 Generate a more compact stream by using WRITE_EMBEDDED records for blocks
 which are stored more compactly on disk by the
@@ -2596,7 +2618,7 @@ on future versions of
 .It Xo
 .Nm
 .Cm send
-.Op Fl e
+.Op Fl eL
 .Op Fl i Ar snapshot Ns | Ns Ar bookmark
 .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
 .Xc
@@ -2622,6 +2644,22 @@ specified as the last component of the n
 If the incremental target is a clone, the incremental source can
 be the origin snapshot, or an earlier snapshot in the origin's filesystem,
 or the origin's origin, etc.
+.It Fl L
+Generate a stream which may contain blocks larger than 128KB.
+This flag
+has no effect if the
+.Sy large_blocks
+pool feature is disabled, or if the
+.Sy recordsize
+property of this filesystem has never been set above 128KB.
+The receiving system must have the
+.Sy large_blocks
+pool feature enabled as well.
+See
+.Xr zpool-features 7
+for details on ZFS feature flags and the
+.Sy large_blocks
+feature.
 .It Fl e
 Generate a more compact stream by using WRITE_EMBEDDED records for blocks
 which are stored more compactly on disk by the

Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -274,9 +274,9 @@ get_usage(zfs_help_t idx)
 	case HELP_ROLLBACK:
 		return (gettext("\trollback [-rRf] <snapshot>\n"));
 	case HELP_SEND:
-		return (gettext("\tsend [-DnPpRve] [-[iI] snapshot] "
+		return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
 		    "<snapshot>\n"
-		    "\tsend [-e] [-i snapshot|bookmark] "
+		    "\tsend [-Le] [-i snapshot|bookmark] "
 		    "<filesystem|volume|snapshot>\n"));
 	case HELP_SET:
 		return (gettext("\tset <property=value> "
@@ -3709,7 +3709,7 @@ zfs_do_send(int argc, char **argv)
 	boolean_t extraverbose = B_FALSE;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
+	while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
 		switch (c) {
 		case 'i':
 			if (fromname)
@@ -3744,6 +3744,9 @@ zfs_do_send(int argc, char **argv)
 		case 'n':
 			flags.dryrun = B_TRUE;
 			break;
+		case 'L':
+			flags.largeblock = B_TRUE;
+			break;
 		case 'e':
 			flags.embed_data = B_TRUE;
 			break;
@@ -3800,6 +3803,8 @@ zfs_do_send(int argc, char **argv)
 		if (zhp == NULL)
 			return (1);
 
+		if (flags.largeblock)
+			lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
 		if (flags.embed_data)
 			lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
 

Modified: head/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7	Mon Nov 10 08:20:21 2014	(r274337)
@@ -23,7 +23,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 1, 2014
+.Dd November 10, 2014
 .Dt ZPOOL-FEATURES 7
 .Os
 .Sh NAME
@@ -427,6 +427,33 @@ This feature becomes
 as soon as it is enabled and will
 never return to being
 .Sy enabled .
+.It Sy large_blocks
+.Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_block"
+.It GUID Ta org.open-zfs:large_block
+.It READ\-ONLY COMPATIBLE Ta no
+.It DEPENDENCIES Ta extensible_dataset
+.El
+.Pp
+The
+.Sy large_block
+feature allows the record size on a dataset to be
+set larger than 128KB.
+.Pp
+This feature becomes
+.Sy active
+once a
+.Sy recordsize
+property has been set larger than 128KB, and will return to being 
+.Sy enabled
+once all filesystems that have ever had their recordsize larger than 128KB
+are destroyed.
+.Pp
+Please note that booting from datasets that have recordsize greater than
+128KB is
+.Em NOT
+supported by the
+.Fx
+boot loader.
 .El
 .Sh SEE ALSO
 .Xr zpool 8

Modified: head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -54,7 +54,6 @@ uint64_t total_stream_len = 0;
 FILE *send_stream = 0;
 boolean_t do_byteswap = B_FALSE;
 boolean_t do_cksum = B_TRUE;
-#define	INITIAL_BUFLEN (1<<20)
 
 static void
 usage(void)
@@ -67,6 +66,18 @@ usage(void)
 	exit(1);
 }
 
+static void *
+safe_malloc(size_t size)
+{
+	void *rv = malloc(size);
+	if (rv == NULL) {
+		(void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
+		    size);
+		abort();
+	}
+	return (rv);
+}
+
 /*
  * ssread - send stream read.
  *
@@ -158,7 +169,7 @@ print_block(char *buf, int length)
 int
 main(int argc, char *argv[])
 {
-	char *buf = malloc(INITIAL_BUFLEN);
+	char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
 	uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
 	uint64_t total_records = 0;
 	dmu_replay_record_t thedrr;
@@ -307,9 +318,9 @@ main(int argc, char *argv[])
 				nvlist_t *nv;
 				int sz = drr->drr_payloadlen;
 
-				if (sz > INITIAL_BUFLEN) {
+				if (sz > SPA_MAXBLOCKSIZE) {
 					free(buf);
-					buf = malloc(sz);
+					buf = safe_malloc(sz);
 				}
 				(void) ssread(buf, sz, &zc);
 				if (ferror(send_stream))

Modified: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -987,9 +987,15 @@ ztest_spa_get_ashift() {
 static int
 ztest_random_blocksize(void)
 {
-	// Choose a block size >= the ashift.
-	uint64_t block_shift =
-	    ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
+	uint64_t block_shift;
+	/*
+	 * Choose a block size >= the ashift.
+	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+	 */
+	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+		maxbs = 20;
+	block_shift = ztest_random(maxbs - ztest_spa_get_ashift() + 1);
 	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
 }
 
@@ -4789,7 +4795,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
-	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
+	int bshift = SPA_OLD_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
 	int iters = 1000;
 	int maxfaults;
 	int mirror_save;

Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h	Mon Nov 10 08:20:21 2014	(r274337)
@@ -609,6 +609,9 @@ typedef struct sendflags {
 	/* show progress (ie. -v) */
 	boolean_t progress;
 
+	/* large blocks (>128K) are permitted */
+	boolean_t largeblock;
+
 	/* WRITE_EMBEDDED records of type DATA are permitted */
 	boolean_t embed_data;
 } sendflags_t;

Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -1080,21 +1080,36 @@ zfs_valid_proplist(libzfs_handle_t *hdl,
 			break;
 		}
 
-		case ZFS_PROP_RECORDSIZE:
 		case ZFS_PROP_VOLBLOCKSIZE:
-			/* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
+		case ZFS_PROP_RECORDSIZE:
+		{
+			int maxbs = SPA_MAXBLOCKSIZE;
+			if (zhp != NULL) {
+				maxbs = zpool_get_prop_int(zhp->zpool_hdl,
+				    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
+			}
+			/*
+			 * Volumes are limited to a volblocksize of 128KB,
+			 * because they typically service workloads with
+			 * small random writes, which incur a large performance
+			 * penalty with large blocks.
+			 */
+			if (prop == ZFS_PROP_VOLBLOCKSIZE)
+				maxbs = SPA_OLD_MAXBLOCKSIZE;
+			/*
+			 * The value must be a power of two between
+			 * SPA_MINBLOCKSIZE and maxbs.
+			 */
 			if (intval < SPA_MINBLOCKSIZE ||
-			    intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
+			    intval > maxbs || !ISP2(intval)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "'%s' must be power of 2 from %u "
-				    "to %uk"), propname,
-				    (uint_t)SPA_MINBLOCKSIZE,
-				    (uint_t)SPA_MAXBLOCKSIZE >> 10);
+				    "'%s' must be power of 2 from 512B "
+				    "to %uKB"), propname, maxbs >> 10);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
-
+		}
 		case ZFS_PROP_MLSLABEL:
 		{
 #ifdef sun
@@ -1471,7 +1486,9 @@ zfs_setprop_error(libzfs_handle_t *hdl, 
 		break;
 
 	case ERANGE:
-		if (prop == ZFS_PROP_COMPRESSION) {
+	case EDOM:
+		if (prop == ZFS_PROP_COMPRESSION ||
+		    prop == ZFS_PROP_RECORDSIZE) {
 			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property setting is not allowed on "
 			    "bootable datasets"));
@@ -3197,9 +3214,7 @@ zfs_create(libzfs_handle_t *hdl, const c
 		case EDOM:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume block size must be power of 2 from "
-			    "%u to %uk"),
-			    (uint_t)SPA_MINBLOCKSIZE,
-			    (uint_t)SPA_MAXBLOCKSIZE >> 10);
+			    "512B to 128KB"));
 
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 

Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -215,7 +215,7 @@ static void *
 cksummer(void *arg)
 {
 	dedup_arg_t *dda = arg;
-	char *buf = malloc(1<<20);
+	char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
 	dmu_replay_record_t thedrr;
 	dmu_replay_record_t *drr = &thedrr;
 	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@@ -280,9 +280,9 @@ cksummer(void *arg)
 			    DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
 				int sz = drr->drr_payloadlen;
 
-				if (sz > 1<<20) {
-					free(buf);
-					buf = malloc(sz);
+				if (sz > SPA_MAXBLOCKSIZE) {
+					buf = zfs_realloc(dda->dedup_hdl, buf,
+					    SPA_MAXBLOCKSIZE, sz);
 				}
 				(void) ssread(buf, sz, ofp);
 				if (ferror(stdin))
@@ -815,7 +815,7 @@ typedef struct send_dump_data {
 	char prevsnap[ZFS_MAXNAMELEN];
 	uint64_t prevsnap_obj;
 	boolean_t seenfrom, seento, replicate, doall, fromorigin;
-	boolean_t verbose, dryrun, parsable, progress, embed_data;
+	boolean_t verbose, dryrun, parsable, progress, embed_data, large_block;
 	int outfd;
 	boolean_t err;
 	nvlist_t *fss;
@@ -1163,6 +1163,8 @@ dump_snapshot(zfs_handle_t *zhp, void *a
 		}
 
 		enum lzc_send_flags flags = 0;
+		if (sdd->large_block)
+			flags |= LZC_SEND_FLAG_LARGE_BLOCK;
 		if (sdd->embed_data)
 			flags |= LZC_SEND_FLAG_EMBED_DATA;
 
@@ -1511,6 +1513,7 @@ zfs_send(zfs_handle_t *zhp, const char *
 	sdd.parsable = flags->parsable;
 	sdd.progress = flags->progress;
 	sdd.dryrun = flags->dryrun;
+	sdd.large_block = flags->largeblock;
 	sdd.embed_data = flags->embed_data;
 	sdd.filter_cb = filter_func;
 	sdd.filter_cb_arg = cb_arg;
@@ -2545,7 +2548,7 @@ static int
 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 {
 	dmu_replay_record_t *drr;
-	void *buf = malloc(1<<20);
+	void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,

Modified: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -502,6 +502,10 @@ lzc_get_holds(const char *snapname, nvli
  *
  * "fd" is the file descriptor to write the send stream to.
  *
+ * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
+ * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
+ * records with drr_blksz > 128K.
+ *
  * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
  * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
  * which the receiving system must support (as indicated by support
@@ -518,6 +522,8 @@ lzc_send(const char *snapname, const cha
 	fnvlist_add_int32(args, "fd", fd);
 	if (from != NULL)
 		fnvlist_add_string(args, "fromsnap", from);
+	if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+		fnvlist_add_boolean(args, "largeblockok");
 	if (flags & LZC_SEND_FLAG_EMBED_DATA)
 		fnvlist_add_boolean(args, "embedok");
 	err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);

Modified: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h	Mon Nov 10 08:20:21 2014	(r274337)
@@ -54,7 +54,8 @@ int lzc_release(nvlist_t *, nvlist_t **)
 int lzc_get_holds(const char *, nvlist_t **);
 
 enum lzc_send_flags {
-	LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+	LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
+	LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
 };
 
 int lzc_send(const char *, const char *, int, enum lzc_send_flags);

Modified: head/sys/boot/zfs/zfsimpl.c
==============================================================================
--- head/sys/boot/zfs/zfsimpl.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/boot/zfs/zfsimpl.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -57,6 +57,7 @@ static const char *features_for_read[] =
 	"com.delphix:hole_birth",
 	"com.delphix:extensible_dataset",
 	"com.delphix:embedded_data",
+	"org.open-zfs:large_blocks",
 	NULL
 };
 
@@ -1222,6 +1223,11 @@ dnode_read(const spa_t *spa, const dnode
 	int nlevels = dnode->dn_nlevels;
 	int i, rc;
 
+	if (bsize > SPA_MAXBLOCKSIZE) {
+		printf("ZFS: I/O error - blocks larger than 128K are not supported\n");
+		return (EIO);
+	}
+
 	/*
 	 * Note: bsize may not be a power of two here so we need to do an
 	 * actual divide rather than a bitshift.

Modified: head/sys/cddl/boot/zfs/zfsimpl.h
==============================================================================
--- head/sys/cddl/boot/zfs/zfsimpl.h	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/boot/zfs/zfsimpl.h	Mon Nov 10 08:20:21 2014	(r274337)
@@ -113,17 +113,14 @@
 #define	BSWAP_64(x)	((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * Note: the boot loader can't actually read blocks larger than 128KB,
+ * due to lack of memory.  Therefore its SPA_MAXBLOCKSIZE is still 128KB.
  */
 #define	SPA_MINBLOCKSHIFT	9
 #define	SPA_MAXBLOCKSHIFT	17
 #define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
 #define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
 
-#define	SPA_BLOCKSIZES		(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
-
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)

Modified: head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -56,7 +56,8 @@ valid_char(char c, boolean_t after_colon
 {
 	return ((c >= 'a' && c <= 'z') ||
 	    (c >= '0' && c <= '9') ||
-	    c == (after_colon ? '_' : '.'));
+	    (after_colon && c == '_') ||
+	    (!after_colon && (c == '.' || c == '-')));
 }
 
 /*
@@ -220,4 +221,13 @@ zpool_feature_init(void)
 	    "com.delphix:embedded_data", "embedded_data",
 	    "Blocks which compress very well use even less space.",
 	    B_FALSE, B_TRUE, B_TRUE, NULL);
+
+	static const spa_feature_t large_blocks_deps[] = {
+		SPA_FEATURE_EXTENSIBLE_DATASET,
+		SPA_FEATURE_NONE
+	};
+	zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+	    "org.open-zfs:large_blocks", "large_blocks",
+	    "Support for blocks larger than 128KB.", B_FALSE, B_FALSE, B_FALSE,
+	    large_blocks_deps);
 }

Modified: head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	Mon Nov 10 08:20:21 2014	(r274337)
@@ -50,6 +50,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_EMBEDDED_DATA,
 	SPA_FEATURE_BOOKMARKS,
 	SPA_FEATURE_FS_SS_LIMIT,
+	SPA_FEATURE_LARGE_BLOCKS,
 	SPA_FEATURES
 } spa_feature_t;
 

Modified: head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -409,8 +409,8 @@ zfs_prop_init(void)
 
 	/* inherit number properties */
 	zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
-	    SPA_MAXBLOCKSIZE, PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+	    SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+	    ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
 
 	/* hidden properties */
 	zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,

Modified: head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -127,6 +127,8 @@ zpool_prop_init(void)
 	/* hidden properties */
 	zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+	zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
 }
 
 /*

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -43,7 +43,7 @@ bpobj_alloc_empty(objset_t *os, int bloc
 		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 			ASSERT0(dp->dp_empty_bpobj);
 			dp->dp_empty_bpobj =
-			    bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
 			VERIFY(zap_add(os,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
@@ -398,7 +398,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint6
 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
 	if (bpo->bpo_phys->bpo_subobjs == 0) {
 		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
-		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+		    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+		    DMU_OT_NONE, 0, tx);
 	}
 
 	dmu_object_info_t doi;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -65,7 +65,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
 	bptree_phys_t *bt;
 
 	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
-	    SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
 	    sizeof (bptree_phys_t), tx);
 
 	/*

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -2022,10 +2022,8 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake,
 		return (SET_ERROR(ENOTSUP));
 	if (blksz == 0)
 		blksz = SPA_MINBLOCKSIZE;
-	if (blksz > SPA_MAXBLOCKSIZE)
-		blksz = SPA_MAXBLOCKSIZE;
-	else
-		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -255,6 +255,14 @@ logbias_changed_cb(void *arg, uint64_t n
 		zil_set_logbias(os->os_zil, newval);
 }
 
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+
+	os->os_recordsize = newval;
+}
+
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
@@ -384,6 +392,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat
 				    ZFS_PROP_REDUNDANT_METADATA),
 				    redundant_metadata_changed_cb, os);
 			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+				    recordsize_changed_cb, os);
+			}
 		}
 		if (err != 0) {
 			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
@@ -642,6 +655,9 @@ dmu_objset_evict(objset_t *os)
 			VERIFY0(dsl_prop_unregister(ds,
 			    zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
 			    redundant_metadata_changed_cb, os));
+			VERIFY0(dsl_prop_unregister(ds,
+			    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+			    recordsize_changed_cb, os));
 		}
 		VERIFY0(dsl_prop_unregister(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -227,11 +227,12 @@ dump_write(dmu_sendarg_t *dsp, dmu_objec
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
 	drrw->drr_toguid = dsp->dsa_toguid;
-	if (BP_IS_EMBEDDED(bp)) {
+	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
 		/*
-		 * There's no pre-computed checksum of embedded BP's, so
-		 * (like fletcher4-checkummed blocks) userland will have
-		 * to compute a dedup-capable checksum itself.
+		 * There's no pre-computed checksum for partial-block
+		 * writes or embedded BP's, so (like
+		 * fletcher4-checkummed blocks) userland will have to
+		 * compute a dedup-capable checksum itself.
 		 */
 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
 	} else {
@@ -393,6 +394,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t 
 	drro->drr_compress = dnp->dn_compress;
 	drro->drr_toguid = dsp->dsa_toguid;
 
+	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 		return (SET_ERROR(EINTR));
 
@@ -512,6 +517,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, co
 		uint32_t aflags = ARC_WAIT;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
+		uint64_t offset;
 
 		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 		ASSERT0(zb->zb_level);
@@ -532,8 +538,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, co
 			}
 		}
 
-		err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
-		    blksz, bp, abuf->b_data);
+		offset = zb->zb_blkid * blksz;
+
+		if (!(dsp->dsa_featureflags &
+		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+		    blksz > SPA_OLD_MAXBLOCKSIZE) {
+			char *buf = abuf->b_data;
+			while (blksz > 0 && err == 0) {
+				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+				err = dump_write(dsp, type, zb->zb_object,
+				    offset, n, NULL, buf);
+				offset += n;
+				buf += n;
+				blksz -= n;
+			}
+		} else {
+			err = dump_write(dsp, type, zb->zb_object,
+			    offset, blksz, bp, abuf->b_data);
+		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
 	}
 
@@ -548,9 +570,9 @@ static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
     zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
 #ifdef illumos
-    int outfd, vnode_t *vp, offset_t *off)
+    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 #else
-    int outfd, struct file *fp, offset_t *off)
+    boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	objset_t *os;
@@ -586,6 +608,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp,
 	}
 #endif
 
+	if (large_block_ok && ds->ds_large_blocks)
+		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
 	if (embedok &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -682,10 +706,11 @@ out:
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
-    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
+    int outfd, vnode_t *vp, offset_t *off)
 #else
-    boolean_t embedok, int outfd, struct file *fp, offset_t *off)
+    int outfd, struct file *fp, offset_t *off)
 #endif
 {
 	dsl_pool_t *dp;
@@ -720,18 +745,19 @@ dmu_send_obj(const char *pool, uint64_t 
 		zb.zbm_guid = fromds->ds_phys->ds_guid;
 		is_clone = (fromds->ds_dir != ds->ds_dir);
 		dsl_dataset_rele(fromds, FTAG);
-		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, fp, off);
 	} else {
-		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, fp, off);
 	}
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+dmu_send(const char *tosnap, const char *fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
 #ifdef illumos
     int outfd, vnode_t *vp, offset_t *off)
 #else
@@ -802,11 +828,11 @@ dmu_send(const char *tosnap, const char 
 			dsl_pool_rele(dp, FTAG);
 			return (err);
 		}
-		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, fp, off);
 	} else {
-		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
-		    outfd, fp, off);
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, fp, off);
 	}
 	if (owned)
 		dsl_dataset_disown(ds, FTAG);
@@ -1006,6 +1032,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 		return (SET_ERROR(ENOTSUP));
 
+	/*
+	 * The receiving code doesn't know how to translate large blocks
+	 * to smaller ones, so the pool must have the LARGE_BLOCKS
+	 * feature enabled if the stream has LARGE_BLOCKS.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SET_ERROR(ENOTSUP));
+
 	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
@@ -1131,6 +1166,13 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t 
 	}
 	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
 
+	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+	    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !newds->ds_large_blocks) {
+		dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
+		newds->ds_large_blocks = B_TRUE;
+	}
+
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
 	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
 
@@ -1283,6 +1325,7 @@ restore_read(struct restorearg *ra, int 
 
 	/* some things will require 8-byte alignment, so everything must */
 	ASSERT0(len % 8);
+	ASSERT3U(len, <=, ra->bufsize);
 
 	while (done < len) {
 		ssize_t resid;
@@ -1420,7 +1463,7 @@ restore_object(struct restorearg *ra, ob
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
@@ -1693,7 +1736,7 @@ restore_spill(struct restorearg *ra, obj
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-	    drrs->drr_length > SPA_MAXBLOCKSIZE)
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
 		return (SET_ERROR(EINVAL));
 
 	data = restore_read(ra, drrs->drr_length, NULL);
@@ -1781,7 +1824,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, 
 	ra.td = curthread;
 	ra.fp = fp;
 	ra.voff = *voffp;
-	ra.bufsize = 1<<20;
+	ra.bufsize = SPA_MAXBLOCKSIZE;
 	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
 
 	/* these were verified in dmu_recv_begin */

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -224,7 +224,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 		return;
 
 	min_bs = SPA_MINBLOCKSHIFT;
-	max_bs = SPA_MAXBLOCKSHIFT;
+	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
@@ -293,6 +293,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 			 */
 			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
+		} else {
+			/*
+			 * The blocksize can increase up to the recordsize,
+			 * or if it is already more than the recordsize,
+			 * up to the next power of 2.
+			 */
+			min_bs = highbit64(dn->dn_datablksz - 1);
+			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 		}
 
 		/*
@@ -751,11 +759,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o
 		bp = &dn->dn_phys->dn_blkptr[0];
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
-			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
 		else
-			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_towrite += MZAP_MAX_BLKSZ;
 		if (!BP_IS_HOLE(bp))
-			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tounref += MZAP_MAX_BLKSZ;
 		return;
 	}
 
@@ -1549,18 +1557,18 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t
 
 	/* If blkptr doesn't exist then add space to towrite */
 	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
-		txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+		txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 	} else {
 		blkptr_t *bp;
 
 		bp = &dn->dn_phys->dn_spill;
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 		    bp, bp->blk_birth))
-			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
 		else
-			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+			txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
 		if (!BP_IS_HOLE(bp))
-			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+			txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
 	}
 }
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Mon Nov 10 00:07:06 2014	(r274336)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Mon Nov 10 08:20:21 2014	(r274337)
@@ -513,10 +513,10 @@ dnode_allocate(dnode_t *dn, dmu_object_t
 {
 	int i;
 
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (blocksize == 0)
 		blocksize = 1 << zfs_default_bs;
-	else if (blocksize > SPA_MAXBLOCKSIZE)
-		blocksize = SPA_MAXBLOCKSIZE;
 	else
 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -597,7 +597,8 @@ dnode_reallocate(dnode_t *dn, dmu_object
 	int nblkptr;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list