svn commit: r317414 - in head: cddl/contrib/opensolaris/cmd/zfs cddl/contrib/opensolaris/cmd/zstreamdump cddl/contrib/opensolaris/lib/libzfs/common cddl/contrib/opensolaris/lib/libzfs_core/common s...
Josh Paetzel
jpaetzel at FreeBSD.org
Tue Apr 25 17:57:46 UTC 2017
Author: jpaetzel
Date: Tue Apr 25 17:57:43 2017
New Revision: 317414
URL: https://svnweb.freebsd.org/changeset/base/317414
Log:
MFV 316894
7252 7628 compressed zfs send / receive
illumos/illumos-gate at 5602294fda888d923d57a78bafdaf48ae6223dea
https://github.com/illumos/illumos-gate/commit/5602294fda888d923d57a78bafdaf48ae6223dea
https://www.illumos.org/issues/7252
This feature includes code to allow a system with compressed ARC enabled to
send data in its compressed form straight out of the ARC, and receive data in
its compressed form directly into the ARC.
https://www.illumos.org/issues/7628
We should have longer, more readable versions of the ZFS send / recv options.
7628 create long versions of ZFS send / receive options
Reviewed by: George Wilson <george.wilson at delphix.com>
Reviewed by: John Kennedy <john.kennedy at delphix.com>
Reviewed by: Matthew Ahrens <mahrens at delphix.com>
Reviewed by: Paul Dagnelie <pcd at delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov at delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy at delphix.com>
Reviewed by: David Quigley <dpquigl at davequigley.com>
Reviewed by: Thomas Caputi <tcaputi at datto.com>
Approved by: Dan McDonald <danmcd at omniti.com>
Author: Dan Kimmel <dan.kimmel at delphix.com>
Modified:
head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
head/cddl/contrib/opensolaris/ (props changed)
head/cddl/contrib/opensolaris/cmd/zfs/ (props changed)
head/cddl/contrib/opensolaris/lib/libzfs/ (props changed)
head/sys/cddl/contrib/opensolaris/ (props changed)
Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs.8
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs.8 Tue Apr 25 17:46:44 2017 (r317413)
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs.8 Tue Apr 25 17:57:43 2017 (r317414)
@@ -180,12 +180,12 @@
.Ar bookmark
.Nm
.Cm send
-.Op Fl DnPpRveL
+.Op Fl DLPRcenpv
.Op Fl i Ar snapshot | Fl I Ar snapshot
.Ar snapshot
.Nm
.Cm send
-.Op Fl eL
+.Op Fl Lce
.Op Fl i Ar snapshot Ns | Ns bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Nm
@@ -2535,7 +2535,7 @@ feature.
.It Xo
.Nm
.Cm send
-.Op Fl DnPpRveL
+.Op Fl DLPRcenpv
.Op Fl i Ar snapshot | Fl I Ar snapshot
.Ar snapshot
.Xc
@@ -2580,7 +2580,7 @@ The incremental
source may be specified as with the
.Fl i
option.
-.It Fl R
+.It Fl R, -replicate
Generate a replication stream package, which will replicate the specified
filesystem, and all descendent file systems, up to the named snapshot. When
received, all properties, snapshots, descendent file systems, and clones are
@@ -2598,7 +2598,7 @@ is received. If the
.Fl F
flag is specified when this stream is received, snapshots and file systems that
do not exist on the sending side are destroyed.
-.It Fl D
+.It Fl D, -dedup
Generate a deduplicated stream. Blocks which would have been sent multiple
times in the send stream will only be sent once. The receiving system must
also support this feature to receive a deduplicated stream. This flag can
@@ -2607,7 +2607,7 @@ be used regardless of the dataset's
property, but performance will be much better if the filesystem uses a
dedup-capable checksum (eg.
.Sy sha256 ) .
-.It Fl L
+.It Fl L, -large-block
Generate a stream which may contain blocks larger than 128KB.
This flag
has no effect if the
@@ -2623,7 +2623,7 @@ See
for details on ZFS feature flags and the
.Sy large_blocks
feature.
-.It Fl e
+.It Fl e, -embed
Generate a more compact stream by using WRITE_EMBEDDED records for blocks
which are stored more compactly on disk by the
.Sy embedded_data
@@ -2646,11 +2646,25 @@ See
for details on ZFS feature flags and the
.Sy embedded_data
feature.
-.It Fl p
+.It Fl c, -compressed
+Generate a more compact stream by using compressed WRITE records for blocks
+which are compressed on disk and in memory (see the
+.Sy compression property for details). If the
+.Sy lz4_compress
+feature is active on the sending system, then the receiving system must have that
+feature enabled as well. If the
+.Sy large_blocks
+feature is enabled on the sending system but the
+.Fl L
+option is not supplied in conjunction with
+.Fl c
+then the data will be decompressed before sending so it can be split
+into smaller block sizes.
+.It Fl p, -props
Include the dataset's properties in the stream. This flag is implicit when
.Fl R
is specified. The receiving system must also support this feature.
-.It Fl n
+.It Fl n, -dryrun
Do a dry-run ("No-op") send. Do not generate any actual send data. This is
useful in conjunction with the
.Fl v
@@ -2660,9 +2674,9 @@ flags to determine what data will be sen
In this case, the verbose output will be written to
standard output (contrast with a non-dry-run, where the stream is written
to standard output and the verbose output goes to standard error).
-.It Fl P
+.It Fl P, -parsable
Print machine-parsable verbose information about the stream package generated.
-.It Fl v
+.It Fl v, -verbose
Print verbose information about the stream package generated.
This information includes a per-second report of how much data has been sent.
.El
@@ -2673,7 +2687,7 @@ on future versions of
.It Xo
.Nm
.Cm send
-.Op Fl eL
+.Op Fl Lce
.Op Fl i Ar snapshot Ns | Ns Ar bookmark
.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot
.Xc
@@ -2699,7 +2713,7 @@ specified as the last component of the n
If the incremental target is a clone, the incremental source can
be the origin snapshot, or an earlier snapshot in the origin's filesystem,
or the origin's origin, etc.
-.It Fl L
+.It Fl L, -large-block
Generate a stream which may contain blocks larger than 128KB.
This flag
has no effect if the
@@ -2715,7 +2729,22 @@ See
for details on ZFS feature flags and the
.Sy large_blocks
feature.
-.It Fl e
+.It Fl c, -compressed
+Generate a more compact stream by using compressed WRITE records for blocks
+which are compressed on disk and in memory (see the
+.Sy compression
+property for details). If the
+.Sy lz4_compress
+feature is active on the sending system, then the receiving system must have
+that feature enabled as well. If the
+.Sy large_blocks
+feature is enabled on the sending system but the
+.Fl L
+option is not supplied in conjunction with
+.Fl c
+then the data will be decompressed before sending so it can be split
+into smaller block sizes.
+.It Fl e, -embed
Generate a more compact stream by using WRITE_EMBEDDED records for blocks
which are stored more compactly on disk by the
.Sy embedded_data
Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c Tue Apr 25 17:46:44 2017 (r317413)
+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c Tue Apr 25 17:57:43 2017 (r317414)
@@ -35,6 +35,7 @@
#include <assert.h>
#include <ctype.h>
#include <errno.h>
+#include <getopt.h>
#include <libgen.h>
#include <libintl.h>
#include <libuutil.h>
@@ -278,7 +279,7 @@ get_usage(zfs_help_t idx)
case HELP_ROLLBACK:
return (gettext("\trollback [-rRf] <snapshot>\n"));
case HELP_SEND:
- return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
+ return (gettext("\tsend [-DnPpRvLec] [-[iI] snapshot] "
"<snapshot>\n"
"\tsend [-Le] [-i snapshot|bookmark] "
"<filesystem|volume|snapshot>\n"
@@ -3771,8 +3772,23 @@ zfs_do_send(int argc, char **argv)
nvlist_t *dbgnv = NULL;
boolean_t extraverbose = B_FALSE;
+ struct option long_options[] = {
+ {"replicate", no_argument, NULL, 'R'},
+ {"props", no_argument, NULL, 'p'},
+ {"parsable", no_argument, NULL, 'P'},
+ {"dedup", no_argument, NULL, 'D'},
+ {"verbose", no_argument, NULL, 'v'},
+ {"dryrun", no_argument, NULL, 'n'},
+ {"large-block", no_argument, NULL, 'L'},
+ {"embed", no_argument, NULL, 'e'},
+ {"resume", required_argument, NULL, 't'},
+ {"compressed", no_argument, NULL, 'c'},
+ {0, 0, 0, 0}
+ };
+
/* check options */
- while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) {
+ while ((c = getopt_long(argc, argv, ":i:I:RbDpvnPLet:c", long_options,
+ NULL)) != -1) {
switch (c) {
case 'i':
if (fromname)
@@ -3816,12 +3832,17 @@ zfs_do_send(int argc, char **argv)
case 't':
resume_token = optarg;
break;
+ case 'c':
+ flags.compress = B_TRUE;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
usage(B_FALSE);
break;
case '?':
+ /*FALLTHROUGH*/
+ default:
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
usage(B_FALSE);
@@ -3892,6 +3913,8 @@ zfs_do_send(int argc, char **argv)
lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags.embed_data)
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (flags.compress)
+ lzc_flags |= LZC_SEND_FLAG_COMPRESS;
if (fromname != NULL &&
(fromname[0] == '#' || fromname[0] == '@')) {
Modified: head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c Tue Apr 25 17:46:44 2017 (r317413)
+++ head/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c Tue Apr 25 17:57:43 2017 (r317414)
@@ -25,8 +25,8 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <ctype.h>
@@ -39,6 +39,7 @@
#include <sys/dmu.h>
#include <sys/zfs_ioctl.h>
+#include <sys/zio.h>
#include <zfs_fletcher.h>
/*
@@ -251,6 +252,7 @@ main(int argc, char *argv[])
(void) fprintf(stderr, "invalid option '%c'\n",
optopt);
usage();
+ break;
}
}
@@ -453,38 +455,50 @@ main(int argc, char *argv[])
drrw->drr_object = BSWAP_64(drrw->drr_object);
drrw->drr_type = BSWAP_32(drrw->drr_type);
drrw->drr_offset = BSWAP_64(drrw->drr_offset);
- drrw->drr_length = BSWAP_64(drrw->drr_length);
+ drrw->drr_logical_size =
+ BSWAP_64(drrw->drr_logical_size);
drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
drrw->drr_key.ddk_prop =
BSWAP_64(drrw->drr_key.ddk_prop);
+ drrw->drr_compressed_size =
+ BSWAP_64(drrw->drr_compressed_size);
}
+
+ uint64_t payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+
/*
* If this is verbose and/or dump output,
* print info on the modified block
*/
if (verbose) {
(void) printf("WRITE object = %llu type = %u "
- "checksum type = %u\n"
- " offset = %llu length = %llu "
+ "checksum type = %u compression type = %u\n"
+ " offset = %llu logical_size = %llu "
+ "compressed_size = %llu "
+ "payload_size = %llu "
"props = %llx\n",
(u_longlong_t)drrw->drr_object,
drrw->drr_type,
drrw->drr_checksumtype,
+ drrw->drr_compressiontype,
(u_longlong_t)drrw->drr_offset,
- (u_longlong_t)drrw->drr_length,
+ (u_longlong_t)drrw->drr_logical_size,
+ (u_longlong_t)drrw->drr_compressed_size,
+ (u_longlong_t)payload_size,
(u_longlong_t)drrw->drr_key.ddk_prop);
}
+
/*
* Read the contents of the block in from STDIN to buf
*/
- (void) ssread(buf, drrw->drr_length, &zc);
+ (void) ssread(buf, payload_size, &zc);
/*
* If in dump mode
*/
if (dump) {
- print_block(buf, drrw->drr_length);
+ print_block(buf, payload_size);
}
- total_write_size += drrw->drr_length;
+ total_write_size += payload_size;
break;
case DRR_WRITE_BYREF:
Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h Tue Apr 25 17:46:44 2017 (r317413)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h Tue Apr 25 17:57:43 2017 (r317414)
@@ -616,6 +616,9 @@ typedef struct sendflags {
/* WRITE_EMBEDDED records of type DATA are permitted */
boolean_t embed_data;
+
+ /* compressed WRITE records are permitted */
+ boolean_t compress;
} sendflags_t;
typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c Tue Apr 25 17:46:44 2017 (r317413)
+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_sendrecv.c Tue Apr 25 17:57:43 2017 (r317414)
@@ -354,8 +354,10 @@ cksummer(void *arg)
{
struct drr_write *drrw = &drr->drr_u.drr_write;
dataref_t dataref;
+ uint64_t payload_size;
- (void) ssread(buf, drrw->drr_length, ofp);
+ payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ (void) ssread(buf, payload_size, ofp);
/*
* Use the existing checksum if it's dedup-capable,
@@ -369,7 +371,7 @@ cksummer(void *arg)
zio_cksum_t tmpsha256;
SHA256Init(&ctx);
- SHA256Update(&ctx, buf, drrw->drr_length);
+ SHA256Update(&ctx, buf, payload_size);
SHA256Final(&tmpsha256, &ctx);
drrw->drr_key.ddk_cksum.zc_word[0] =
BE_64(tmpsha256.zc_word[0]);
@@ -399,7 +401,7 @@ cksummer(void *arg)
wbr_drrr->drr_object = drrw->drr_object;
wbr_drrr->drr_offset = drrw->drr_offset;
- wbr_drrr->drr_length = drrw->drr_length;
+ wbr_drrr->drr_length = drrw->drr_logical_size;
wbr_drrr->drr_toguid = drrw->drr_toguid;
wbr_drrr->drr_refguid = dataref.ref_guid;
wbr_drrr->drr_refobject =
@@ -421,7 +423,7 @@ cksummer(void *arg)
goto out;
} else {
/* block not previously seen */
- if (dump_record(drr, buf, drrw->drr_length,
+ if (dump_record(drr, buf, payload_size,
&stream_cksum, outfd) != 0)
goto out;
}
@@ -924,7 +926,7 @@ typedef struct send_dump_data {
uint64_t prevsnap_obj;
boolean_t seenfrom, seento, replicate, doall, fromorigin;
boolean_t verbose, dryrun, parsable, progress, embed_data, std_out;
- boolean_t large_block;
+ boolean_t large_block, compress;
int outfd;
boolean_t err;
nvlist_t *fss;
@@ -940,7 +942,7 @@ typedef struct send_dump_data {
static int
estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
- boolean_t fromorigin, uint64_t *sizep)
+ boolean_t fromorigin, enum lzc_send_flags flags, uint64_t *sizep)
{
zfs_cmd_t zc = { 0 };
libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -953,6 +955,7 @@ estimate_ioctl(zfs_handle_t *zhp, uint64
zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
zc.zc_fromobj = fromsnap_obj;
zc.zc_guid = 1; /* estimate flag */
+ zc.zc_flags = flags;
if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) {
char errbuf[1024];
@@ -1192,6 +1195,7 @@ dump_snapshot(zfs_handle_t *zhp, void *a
progress_arg_t pa = { 0 };
pthread_t tid;
char *thissnap;
+ enum lzc_send_flags flags = 0;
int err;
boolean_t isfromsnap, istosnap, fromorigin;
boolean_t exclude = B_FALSE;
@@ -1220,6 +1224,13 @@ dump_snapshot(zfs_handle_t *zhp, void *a
if (istosnap)
sdd->seento = B_TRUE;
+ if (sdd->large_block)
+ flags |= LZC_SEND_FLAG_LARGE_BLOCK;
+ if (sdd->embed_data)
+ flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (sdd->compress)
+ flags |= LZC_SEND_FLAG_COMPRESS;
+
if (!sdd->doall && !isfromsnap && !istosnap) {
if (sdd->replicate) {
char *snapname;
@@ -1266,7 +1277,7 @@ dump_snapshot(zfs_handle_t *zhp, void *a
if (sdd->verbose) {
uint64_t size = 0;
(void) estimate_ioctl(zhp, sdd->prevsnap_obj,
- fromorigin, &size);
+ fromorigin, flags, &size);
send_print_verbose(fout, zhp->zfs_name,
sdd->prevsnap[0] ? sdd->prevsnap : NULL,
@@ -1291,12 +1302,6 @@ dump_snapshot(zfs_handle_t *zhp, void *a
}
}
- enum lzc_send_flags flags = 0;
- if (sdd->large_block)
- flags |= LZC_SEND_FLAG_LARGE_BLOCK;
- if (sdd->embed_data)
- flags |= LZC_SEND_FLAG_EMBED_DATA;
-
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, flags, sdd->debugnv);
@@ -1602,8 +1607,12 @@ zfs_send_resume(libzfs_handle_t *hdl, se
fromguid = 0;
(void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
+ if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok"))
+ lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+ if (flags->compress || nvlist_exists(resume_nvl, "compressok"))
+ lzc_flags |= LZC_SEND_FLAG_COMPRESS;
if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
@@ -1636,7 +1645,8 @@ zfs_send_resume(libzfs_handle_t *hdl, se
if (flags->verbose) {
uint64_t size = 0;
- error = lzc_send_space(zhp->zfs_name, fromname, &size);
+ error = lzc_send_space(zhp->zfs_name, fromname,
+ lzc_flags, &size);
if (error == 0)
size = MAX(0, (int64_t)(size - bytes));
send_print_verbose(stderr, zhp->zfs_name, fromname,
@@ -1866,6 +1876,7 @@ zfs_send(zfs_handle_t *zhp, const char *
sdd.dryrun = flags->dryrun;
sdd.large_block = flags->largeblock;
sdd.embed_data = flags->embed_data;
+ sdd.compress = flags->compress;
sdd.filter_cb = filter_func;
sdd.filter_cb_arg = cb_arg;
if (debugnvp)
@@ -2960,11 +2971,17 @@ recv_skip(libzfs_handle_t *hdl, int fd,
case DRR_WRITE:
if (byteswap) {
- drr->drr_u.drr_write.drr_length =
- BSWAP_64(drr->drr_u.drr_write.drr_length);
+ drr->drr_u.drr_write.drr_logical_size =
+ BSWAP_64(
+ drr->drr_u.drr_write.drr_logical_size);
+ drr->drr_u.drr_write.drr_compressed_size =
+ BSWAP_64(
+ drr->drr_u.drr_write.drr_compressed_size);
}
+ uint64_t payload_size =
+ DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write);
(void) recv_read(hdl, fd, buf,
- drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
+ payload_size, B_FALSE, NULL);
break;
case DRR_SPILL:
if (byteswap) {
Modified: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c Tue Apr 25 17:46:44 2017 (r317413)
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c Tue Apr 25 17:57:43 2017 (r317414)
@@ -534,6 +534,8 @@ lzc_send_resume(const char *snapname, co
fnvlist_add_boolean(args, "largeblockok");
if (flags & LZC_SEND_FLAG_EMBED_DATA)
fnvlist_add_boolean(args, "embedok");
+ if (flags & LZC_SEND_FLAG_COMPRESS)
+ fnvlist_add_boolean(args, "compressok");
if (resumeobj != 0 || resumeoff != 0) {
fnvlist_add_uint64(args, "resume_object", resumeobj);
fnvlist_add_uint64(args, "resume_offset", resumeoff);
@@ -559,7 +561,8 @@ lzc_send_resume(const char *snapname, co
* an equivalent snapshot.
*/
int
-lzc_send_space(const char *snapname, const char *from, uint64_t *spacep)
+lzc_send_space(const char *snapname, const char *from,
+ enum lzc_send_flags flags, uint64_t *spacep)
{
nvlist_t *args;
nvlist_t *result;
@@ -568,6 +571,12 @@ lzc_send_space(const char *snapname, con
args = fnvlist_alloc();
if (from != NULL)
fnvlist_add_string(args, "from", from);
+ if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+ fnvlist_add_boolean(args, "largeblockok");
+ if (flags & LZC_SEND_FLAG_EMBED_DATA)
+ fnvlist_add_boolean(args, "embedok");
+ if (flags & LZC_SEND_FLAG_COMPRESS)
+ fnvlist_add_boolean(args, "compressok");
err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
nvlist_free(args);
if (err == 0)
Modified: head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
==============================================================================
--- head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h Tue Apr 25 17:46:44 2017 (r317413)
+++ head/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h Tue Apr 25 17:57:43 2017 (r317414)
@@ -62,13 +62,14 @@ int lzc_get_holds(const char *, nvlist_t
enum lzc_send_flags {
LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
- LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
+ LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1,
+ LZC_SEND_FLAG_COMPRESS = 1 << 2
};
int lzc_send(const char *, const char *, int, enum lzc_send_flags);
int lzc_send_resume(const char *, const char *, int,
enum lzc_send_flags, uint64_t, uint64_t);
-int lzc_send_space(const char *, const char *, uint64_t *);
+int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *);
struct dmu_replay_record;
Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Tue Apr 25 17:46:44 2017 (r317413)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Tue Apr 25 17:57:43 2017 (r317414)
@@ -77,10 +77,10 @@
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal arc algorithms for
+ * uses method 1, while the internal ARC algorithms for
* adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
- * arc list locks.
+ * ARC list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
@@ -93,21 +93,12 @@
* buf_hash_remove() expects the appropriate hash mutex to be
* already held before it is invoked.
*
- * Each arc state also has a mutex which is used to protect the
+ * Each ARC state also has a mutex which is used to protect the
* buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an arc list lock you
+ * obtain a hash table lock while holding an ARC list lock you
* must use: mutex_tryenter() to avoid deadlock. Also note that
* the active state mutex must be held before the ghost state mutex.
*
- * Arc buffers may have an associated eviction callback function.
- * This function will be invoked prior to removing the buffer (e.g.
- * in arc_do_user_evicts()). Note however that the data associated
- * with the buffer may be evicted prior to the callback. The callback
- * must be made with *no locks held* (to prevent deadlock). Additionally,
- * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_clear_callback()
- * and arc_do_user_evicts().
- *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
@@ -136,67 +127,81 @@
* are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
- * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
* also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
- * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
- * consumer, and always contains uncompressed data. The ARC will provide
- * references to this data and will keep it cached until it is no longer in
- * use. Typically, the arc will try to cache only the L1ARC's physical data
- * block and will aggressively evict any arc_buf_t that is no longer referenced.
- * The amount of memory consumed by the arc_buf_t's can be seen via the
- * "overhead_size" kstat.
- *
*
- * arc_buf_hdr_t
- * +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- * l2arc_buf_hdr_t| |
- * | |
- * +-----------+
- * l1arc_buf_hdr_t| |
- * | | arc_buf_t
- * | b_buf +------------>+---------+ arc_buf_t
- * | | |b_next +---->+---------+
- * | b_pdata +-+ |---------| |b_next +-->NULL
- * +-----------+ | | | +---------+
- * | |b_data +-+ | |
- * | +---------+ | |b_data +-+
- * +->+------+ | +---------+ |
- * (potentially) | | | |
- * compressed | | | |
- * data +------+ | v
- * +->+------+ +------+
- * uncompressed | | | |
- * data | | | |
- * +------+ +------+
- *
- * The L1ARC's data pointer, however, may or may not be uncompressed. The
- * ARC has the ability to store the physical data (b_pdata) associated with
- * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
- * physical block, it will match its on-disk compression characteristics.
- * If the block on-disk is compressed, then the physical data block
- * in the cache will also be compressed and vice-versa. This behavior
- * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pdata) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
* compressed ARC functionality is disabled, the b_pdata will point to an
* uncompressed version of the on-disk data.
*
+ * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
+ * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
+ * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer. The ARC will provide references to this data and will keep it
+ * cached until it is no longer in use. The ARC caches only the L1ARC's physical
+ * data block and will evict any arc_buf_t that is no longer referenced. The
+ * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
+ * "overhead_size" kstat.
+ *
+ * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
+ * compressed form. The typical case is that consumers will want uncompressed
+ * data, and when that happens a new data buffer is allocated where the data is
+ * decompressed for them to use. Currently the only consumer who wants
+ * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
+ * exists on disk. When this happens, the arc_buf_t's data buffer is shared
+ * with the arc_buf_hdr_t.
+ *
+ * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
+ * first one is owned by a compressed send consumer (and therefore references
+ * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
+ * used by any other consumer (and has its own uncompressed copy of the data
+ * buffer).
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | fields |
+ * | common to |
+ * | L1- and |
+ * | L2ARC |
+ * +-----------+
+ * | l2arc_buf_hdr_t
+ * | |
+ * +-----------+
+ * | l1arc_buf_hdr_t
+ * | | arc_buf_t
+ * | b_buf +------------>+-----------+ arc_buf_t
+ * | b_pdata +-+ |b_next +---->+-----------+
+ * +-----------+ | |-----------| |b_next +-->NULL
+ * | |b_comp = T | +-----------+
+ * | |b_data +-+ |b_comp = F |
+ * | +-----------+ | |b_data +-+
+ * +->+------+ | +-----------+ |
+ * compressed | | | |
+ * data | |<--------------+ | uncompressed
+ * +------+ compressed, | data
+ * shared +-->+------+
+ * data | |
+ * | |
+ * +------+
+ *
* When a consumer reads a block, the ARC must first look to see if the
- * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
- * then an additional arc_buf_t is allocated and the uncompressed data is
- * bcopied from the existing arc_buf_t. If the hdr is cached but does not
- * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
- * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
- * b_pdata is not compressed, then the block is shared with the newly
- * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
- * in the arc buffer chain. Sharing the block reduces the memory overhead
- * required when the hdr is caching uncompressed blocks or the compressed
- * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
+ * arc_buf_t and either copies uncompressed data into a new data buffer from an
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
+ * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
+ * hdr is compressed and the desired compression characteristics of the
+ * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
+ * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
+ * the last buffer in the hdr's b_buf list, however a shared compressed buf can
+ * be anywhere in the hdr's list.
*
* The diagram below shows an example of an uncompressed ARC hdr that is
- * sharing its data with an arc_buf_t:
+ * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
+ * the last element in the buf list):
*
* arc_buf_hdr_t
* +-----------+
@@ -225,20 +230,24 @@
* | +------+ |
* +---------------------------------+
*
- * Writing to the arc requires that the ARC first discard the b_pdata
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pdata
* since the physical block is about to be rewritten. The new data contents
- * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
- * performs the write, it may compress the data before writing it to disk.
- * The ARC will be called with the transformed data and will bcopy the
- * transformed on-disk block into a newly allocated b_pdata.
+ * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
+ * it may compress the data before writing it to disk. The ARC will be called
+ * with the transformed data and will bcopy the transformed on-disk block into
+ * a newly allocated b_pdata. Writes are always done into buffers which have
+ * either been loaned (and hence are new and don't have other readers) or
+ * buffers which have been released (and hence have their own hdr, if there
+ * were originally other readers of the buf's original hdr). This ensures that
+ * the ARC only needs to update a single buf and its hdr after a write occurs.
*
* When the L2ARC is in use, it will also take advantage of the b_pdata. The
* L2ARC will always write the contents of b_pdata to the L2ARC. This means
- * that when compressed arc is enabled that the L2ARC blocks are identical
+ * that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
* L2ARC to determine if the contents are valid. However, if the compressed
- * arc is disabled, then the L2ARC's block must be transformed to look
+ * ARC is disabled, then the L2ARC's block must be transformed to look
* like the physical block in the main data pool before comparing the
* checksum and determining its validity.
*/
@@ -910,6 +919,7 @@ struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
arc_buf_t *acb_buf;
+ boolean_t acb_compressed;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
};
@@ -961,7 +971,7 @@ typedef struct l1arc_buf_hdr {
zio_cksum_t *b_freeze_cksum;
#ifdef ZFS_DEBUG
/*
- * used for debugging wtih kmem_flags - by allocating and freeing
+ * Used for debugging with kmem_flags - by allocating and freeing
* b_thawed when the buffer is thawed, we get a record of the stack
* trace that thawed it.
*/
@@ -1172,6 +1182,8 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_AR
HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
+#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
/*
* Other sizes
@@ -1332,7 +1344,7 @@ static kmutex_t l2arc_free_on_write_mtx;
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_hdr_t *l2rcb_hdr; /* read buffer */
+ arc_buf_hdr_t *l2rcb_hdr; /* read header */
blkptr_t l2rcb_bp; /* original blkptr */
zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
@@ -1682,6 +1694,31 @@ retry:
}
}
+/*
+ * This is the size that the buf occupies in memory. If the buf is compressed,
+ * it will correspond to the compressed size. You should use this method of
+ * getting the buf size unless you explicitly need the logical size.
+ */
+int32_t
+arc_buf_size(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
+}
+
+int32_t
+arc_buf_lsize(arc_buf_t *buf)
+{
+ return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+enum zio_compress
+arc_get_compression(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
+}
+
#define ARC_MINTIME (hz>>4) /* 62 ms */
static inline boolean_t
@@ -1690,9 +1727,21 @@ arc_buf_is_shared(arc_buf_t *buf)
boolean_t shared = (buf->b_data != NULL &&
buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ IMPLY(shared, ARC_BUF_SHARED(buf));
+ IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
+
+ /*
+ * It would be nice to assert arc_can_share() too, but the "hdr isn't
+ * already being shared" requirement prevents us from doing that.
+ */
+
return (shared);
}
+/*
+ * Free the checksum associated with this header. If there is no checksum, this
+ * is a no-op.
+ */
static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
@@ -1705,6 +1754,25 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
+/*
+ * Return true iff at least one of the bufs on hdr is not compressed.
+ */
+static boolean_t
+arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
+{
+ for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
+ if (!ARC_BUF_COMPRESSED(b)) {
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
+ * matches the checksum that is stored in the hdr. If there is no checksum,
+ * or if the buf is compressed, this is a no-op.
+ */
static void
arc_cksum_verify(arc_buf_t *buf)
{
@@ -1714,6 +1782,12 @@ arc_cksum_verify(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
@@ -1721,7 +1795,8 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
@@ -1795,6 +1870,12 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, z
return (valid_cksum);
}
+/*
+ * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
+ * checksum and attaches it to the buf's hdr so that we can ensure that the buf
+ * isn't modified later on. If buf is compressed or there is already a checksum
+ * on the hdr, this is a no-op (we only checksum uncompressed bufs).
+ */
static void
arc_cksum_compute(arc_buf_t *buf)
{
@@ -1804,14 +1885,21 @@ arc_cksum_compute(arc_buf_t *buf)
return;
ASSERT(HDR_HAS_L1HDR(hdr));
+
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ ASSERT(arc_hdr_has_uncompressed_buf(hdr));
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
+
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
KM_SLEEP);
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
hdr->b_l1hdr.b_freeze_cksum);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
@@ -1855,7 +1943,7 @@ arc_buf_watch(arc_buf_t *buf)
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
+ ctl.prwatch.pr_size = arc_buf_size(buf);
ctl.prwatch.pr_wflags = WA_WRITE;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
@@ -1877,6 +1965,12 @@ arc_buf_type(arc_buf_hdr_t *hdr)
return (type);
}
+boolean_t
+arc_is_metadata(arc_buf_t *buf)
+{
+ return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
+}
+
static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
@@ -1898,12 +1992,19 @@ arc_buf_thaw(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (hdr->b_l1hdr.b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (HDR_IO_IN_PROGRESS(hdr))
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ arc_cksum_verify(buf);
+
+ /*
+ * Compressed buffers do not manipulate the b_freeze_cksum or
+ * allocate b_thawed.
+ */
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
}
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1934,6 +2035,12 @@ arc_buf_freeze(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
@@ -1942,7 +2049,6 @@ arc_buf_freeze(arc_buf_t *buf)
hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf);
mutex_exit(hash_lock);
-
}
/*
@@ -1999,47 +2105,157 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr,
}
}
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list