svn commit: r302991 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zdb vendor/illumos/dist/cmd/ztest
Andriy Gapon
avg at FreeBSD.org
Mon Jul 18 06:57:25 UTC 2016
Author: avg
Date: Mon Jul 18 06:57:24 2016
New Revision: 302991
URL: https://svnweb.freebsd.org/changeset/base/302991
Log:
6950 ARC should cache compressed data
illumos/illumos-gate@dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2
https://github.com/illumos/illumos-gate/commit/dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2
https://www.illumos.org/issues/6950
When reading compressed data from disk, the ARC should keep the compressed
block cached and only decompress it when consumers access the block. The
uncompressed data should be short-lived, allowing the ARC to cache a much larger
amount of data. The DMU would also maintain a smaller cache of uncompressed
blocks to minimize the impact of decompressing frequently accessed blocks.
Reviewed by: Prakash Surya <prakash.surya at delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel at delphix.com>
Reviewed by: Matt Ahrens <mahrens at delphix.com>
Reviewed by: Paul Dagnelie <pcd at delphix.com>
Reviewed by: Don Brady <don.brady at intel.com>
Reviewed by: Richard Elling <Richard.Elling at RichardElling.com>
Approved by: Richard Lowe <richlowe at richlowe.net>
Author: George Wilson <george.wilson at delphix.com>
Modified:
vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_diff.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
vendor-sys/illumos/dist/uts/common/fs/zfs/refcount.c
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/refcount.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_checksum.h
vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zio_checksum.c
Changes in other areas also in this revision:
Modified:
vendor/illumos/dist/cmd/zdb/zdb.c
vendor/illumos/dist/cmd/ztest/ztest.c
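The read-path behavior described in the log message can be pictured with the following illustrative pseudo-C (the helpers marked "hypothetical" are not from the patch; the real logic lives in arc_read() and the buffer allocation paths in the diff below):

static int
arc_read_sketch(arc_buf_hdr_t *hdr, arc_buf_t **bufp)
{
	/* Illustrative sketch only -- not code from this commit. */
	if (hdr->b_l1hdr.b_buf != NULL) {
		/* another consumer already has an uncompressed copy */
		*bufp = copy_existing_buf(hdr);		/* hypothetical */
	} else {
		/* decompress the cached on-disk bytes for this consumer */
		*bufp = alloc_uncompressed_buf(hdr);	/* hypothetical */
		(void) zio_decompress_data(HDR_GET_COMPRESS(hdr),
		    hdr->b_l1hdr.b_pdata, (*bufp)->b_data,
		    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
	}
	return (0);
}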
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Mon Jul 18 06:47:08 2016 (r302990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Mon Jul 18 06:57:24 2016 (r302991)
@@ -120,9 +120,134 @@
* - ARC header release, as it removes from L2ARC buflists
*/
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer, and always contains uncompressed data. The ARC will provide
+ * references to this data and will keep it cached until it is no longer in
+ * use. Typically, the arc will try to cache only the L1ARC's physical data
+ * block and will aggressively evict any arc_buf_t that is no longer referenced.
+ * The amount of memory consumed by the arc_buf_t's can be seen via the
+ * "overhead_size" kstat.
+ *
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |             arc_buf_t
+ *                | b_buf     +------------>+---------+     arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                | b_pdata   +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                   (potentially) |      |             |               |
+ *                      compressed |      |             |               |
+ *                            data +------+             |               v
+ *                                                      +->+------+  +------+
+ *                                            uncompressed |      |  |      |
+ *                                                data     |      |  |      |
+ *                                                         +------+  +------+
+ *
+ * The L1ARC's data pointer, however, may or may not be uncompressed. The
+ * ARC has the ability to store the physical data (b_pdata) associated with
+ * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
+ * physical block, it will match its on-disk compression characteristics.
+ * If the block on-disk is compressed, then the physical data block
+ * in the cache will also be compressed and vice-versa. This behavior
+ * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
+ * then an additional arc_buf_t is allocated and the uncompressed data is
+ * bcopied from the existing arc_buf_t. If the hdr is cached but does not
+ * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
+ * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
+ * b_pdata is not compressed, then the block is shared with the newly
+ * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
+ * in the arc buffer chain. Sharing the block reduces the memory overhead
+ * required when the hdr is caching uncompressed blocks or the compressed
+ * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t:
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |             arc_buf_t (shared)
+ *                | b_buf     +------------>+---------+     arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                | b_pdata   +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                                 |      |             |               |
+ *                   uncompressed  |      |             |               |
+ *                           data  +------+             |               |
+ *                                    ^                 +->+------+     |
+ *                                    |      uncompressed  |      |     |
+ *                                    |          data      |      |     |
+ *                                    |                    +------+     |
+ *                                    +---------------------------------+
+ *
+ * Writing to the arc requires that the ARC first discard the b_pdata
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
+ * performs the write, it may compress the data before writing it to disk.
+ * The ARC will be called with the transformed data and will bcopy the
+ * transformed on-disk block into a newly allocated b_pdata.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pdata. The
+ * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * that when compressed arc is enabled, the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * arc is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ */
+
#include <sys/spa.h>
#include <sys/zio.h>
+#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
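To complement the comment block above, here is an equally rough sketch of the write path it describes (illustrative only; arc_hdr_alloc_pdata()/arc_hdr_free_pdata() are declared later in this diff, and the real work happens in arc_release() and the arc_write_*() callbacks):

static void
arc_rewrite_sketch(arc_buf_t *buf, zio_t *zio)
{
	/* Illustrative sketch only -- not code from this commit. */
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/* 1. drop the stale on-disk copy before the block is rewritten */
	if (hdr->b_l1hdr.b_pdata != NULL)
		arc_hdr_free_pdata(hdr);

	/* 2. the consumer dirties buf->b_data (always uncompressed) */
	/* 3. the ZIO pipeline may compress the data as it writes it out */

	/*
	 * 4. the ARC is handed the transformed (on-disk) bytes and copies
	 * them into a freshly allocated b_pdata, so the hdr again caches
	 * exactly what is on disk.
	 */
	arc_hdr_alloc_pdata(hdr);
	bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, HDR_GET_PSIZE(hdr));
}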
@@ -151,10 +276,6 @@ static kcondvar_t arc_reclaim_thread_cv;
static boolean_t arc_reclaim_thread_exit;
static kcondvar_t arc_reclaim_waiters_cv;
-static kmutex_t arc_user_evicts_lock;
-static kcondvar_t arc_user_evicts_cv;
-static boolean_t arc_user_evicts_thread_exit;
-
uint_t arc_reduce_dnlc_percent = 3;
/*
@@ -230,9 +351,10 @@ uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
-int zfs_disable_dup_eviction = 0;
int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
@@ -273,7 +395,7 @@ typedef struct arc_state {
/*
* total amount of evictable data in this state
*/
- uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+ refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
/*
* total amount of data in this state; this includes: evictable,
* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
@@ -338,6 +460,26 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
/*
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+ * Note that the compressed bytes may match the uncompressed bytes
+ * if the block is either not compressed or compressed arc is disabled.
+ */
+ kstat_named_t arcstat_compressed_size;
+ /*
+ * Uncompressed size of the data stored in b_pdata. If compressed
+ * arc is disabled then this value will be identical to the stat
+ * above.
+ */
+ kstat_named_t arcstat_uncompressed_size;
+ /*
+ * Number of bytes stored in all the arc_buf_t's. This is classified
+ * as "overhead" since this data is typically short-lived and will
+ * be evicted from the arc when it becomes unreferenced unless the
+ * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+ * values have been set (see comment in dbuf.c for more information).
+ */
+ kstat_named_t arcstat_overhead_size;
+ /*
* Number of bytes consumed by internal ARC structures necessary
* for tracking purposes; these structures are not actually
* backed by ARC buffers. This includes arc_buf_hdr_t structures
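Roughly, the three new counters relate as sketched below (illustrative only, written in terms of the existing ARCSTAT_INCR() helper; the real accounting sits in the hdr and buffer allocation paths later in this diff):

	/* when a hdr caches its on-disk copy in b_pdata (psize, or lsize if stored uncompressed) */
	ARCSTAT_INCR(arcstat_compressed_size, HDR_GET_PSIZE(hdr));
	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));

	/* when a consumer-visible, uncompressed arc_buf_t is allocated for it */
	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));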
@@ -482,20 +624,13 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_evict_reading;
kstat_named_t arcstat_l2_evict_l1cached;
kstat_named_t arcstat_l2_free_on_write;
- kstat_named_t arcstat_l2_cdata_free_on_write;
kstat_named_t arcstat_l2_abort_lowmem;
kstat_named_t arcstat_l2_cksum_bad;
kstat_named_t arcstat_l2_io_error;
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
- kstat_named_t arcstat_l2_compress_successes;
- kstat_named_t arcstat_l2_compress_zeros;
- kstat_named_t arcstat_l2_compress_failures;
kstat_named_t arcstat_memory_throttle_count;
- kstat_named_t arcstat_duplicate_buffers;
- kstat_named_t arcstat_duplicate_buffers_size;
- kstat_named_t arcstat_duplicate_reads;
kstat_named_t arcstat_meta_used;
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
@@ -537,6 +672,9 @@ static arc_stats_t arc_stats = {
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 },
+ { "compressed_size", KSTAT_DATA_UINT64 },
+ { "uncompressed_size", KSTAT_DATA_UINT64 },
+ { "overhead_size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
{ "metadata_size", KSTAT_DATA_UINT64 },
@@ -570,20 +708,13 @@ static arc_stats_t arc_stats = {
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
- { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "l2_compress_successes", KSTAT_DATA_UINT64 },
- { "l2_compress_zeros", KSTAT_DATA_UINT64 },
- { "l2_compress_failures", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
- { "duplicate_buffers", KSTAT_DATA_UINT64 },
- { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
- { "duplicate_reads", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
@@ -656,8 +787,12 @@ static arc_state_t *arc_l2c_only;
#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-#define L2ARC_IS_VALID_COMPRESS(_c_) \
- ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+/* compressed size of entire arc */
+#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
@@ -717,6 +852,7 @@ struct arc_write_callback {
*/
typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
#ifdef ZFS_DEBUG
/*
* used for debugging with kmem_flags - by allocating and freeing
@@ -727,9 +863,10 @@ typedef struct l1arc_buf_hdr {
#endif
arc_buf_t *b_buf;
- uint32_t b_datacnt;
+ uint32_t b_bufcnt;
/* for waiting on writes to complete */
kcondvar_t b_cv;
+ uint8_t b_byteswap;
/* protected by arc state mutex */
arc_state_t *b_state;
@@ -742,8 +879,7 @@ typedef struct l1arc_buf_hdr {
refcount_t b_refcnt;
arc_callback_t *b_acb;
- /* temporary buffer holder for in-flight compressed data */
- void *b_tmp_cdata;
+ void *b_pdata;
} l1arc_buf_hdr_t;
typedef struct l2arc_dev l2arc_dev_t;
@@ -752,9 +888,6 @@ typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
- /* real alloc'd buffer size depending on b_compress applied */
- int32_t b_asize;
- uint8_t b_compress;
list_node_t b_l2node;
} l2arc_buf_hdr_t;
@@ -763,20 +896,37 @@ struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
- /*
- * Even though this checksum is only set/verified when a buffer is in
- * the L1 cache, it needs to be in the set of common fields because it
- * must be preserved from the time before a buffer is written out to
- * L2ARC until after it is read back in.
- */
- zio_cksum_t *b_freeze_cksum;
+ arc_buf_contents_t b_type;
arc_buf_hdr_t *b_hash_next;
arc_flags_t b_flags;
- /* immutable */
- int32_t b_size;
- uint64_t b_spa;
+ /*
+ * This field stores the size of the data buffer after
+ * compression, and is set in the arc's zio completion handlers.
+ * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+ *
+ * While the block pointers can store up to 32MB in their psize
+ * field, we can only store up to 32MB minus 512B. This is due
+ * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+ * a field of zeros represents 512B in the bp). We can't use a
+ * bias of 1 since we need to reserve a psize of zero, here, to
+ * represent holes and embedded blocks.
+ *
+ * This isn't a problem in practice, since the maximum size of a
+ * buffer is limited to 16MB, so we never need to store 32MB in
+ * this field. Even in the upstream illumos code base, the
+ * maximum size of a buffer is limited to 16MB.
+ */
+ uint16_t b_psize;
+
+ /*
+ * This field stores the size of the data buffer before
+ * compression, and cannot change once set. It is in units
+ * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+ */
+ uint16_t b_lsize; /* immutable */
+ uint64_t b_spa; /* immutable */
/* L2ARC fields. Undefined when not in L2ARC. */
l2arc_buf_hdr_t b_l2hdr;
@@ -784,9 +934,6 @@ struct arc_buf_hdr {
l1arc_buf_hdr_t b_l1hdr;
};
-static arc_buf_t *arc_eviction_list;
-static arc_buf_hdr_t arc_eviction_hdr;
-
#define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
(state) == arc_l2c_only)
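A quick worked example of the SPA_MINBLOCKSIZE encoding used by b_psize and b_lsize above (illustrative, assuming the usual SPA_MINBLOCKSHIFT of 9):

/*
 * Illustrative example, not part of the patch: a 128K logical block that
 * compresses to 33K on disk is recorded as
 *
 *	b_lsize = 131072 >> SPA_MINBLOCKSHIFT = 256	(256 * 512 = 128K)
 *	b_psize =  33792 >> SPA_MINBLOCKSHIFT =  66	( 66 * 512 =  33K)
 *
 * while a hole or embedded block keeps b_psize == 0, which is why this
 * field uses a bias of 0 rather than the blkptr's bias of 1.
 */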
@@ -795,25 +942,35 @@ static arc_buf_hdr_t arc_eviction_hdr;
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
-#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
-#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
+#define HDR_COMPRESSION_ENABLED(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
-#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define HDR_L2_READING(hdr) \
- (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
- ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
#define HDR_ISTYPE_METADATA(hdr) \
- ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
+
+#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+
/*
* Other sizes
*/
@@ -866,16 +1023,6 @@ uint64_t zfs_crc64_table[256];
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
-/*
- * Used to distinguish headers that are being process by
- * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
- * address. This can happen when the header is added to the l2arc's list
- * of buffers to write in the first stage of l2arc_write_buffers(), but
- * has not yet been written out which happens in the second stage of
- * l2arc_write_buffers().
- */
-#define L2ARC_ADDR_UNSET ((uint64_t)(-1))
-
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
@@ -917,12 +1064,10 @@ static kmutex_t l2arc_free_on_write_mtx;
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_t *l2rcb_buf; /* read buffer */
- spa_t *l2rcb_spa; /* spa */
+ arc_buf_hdr_t *l2rcb_hdr; /* read buffer */
blkptr_t l2rcb_bp; /* original blkptr */
zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
- enum zio_compress l2rcb_compress; /* applied compress */
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
@@ -934,7 +1079,7 @@ typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
void *l2df_data;
size_t l2df_size;
- void (*l2df_func)(void *, size_t);
+ arc_buf_contents_t l2df_type;
list_node_t l2df_list_node;
} l2arc_data_free_t;
@@ -942,21 +1087,22 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
-static void arc_get_data_buf(arc_buf_t *);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
+static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
-static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
-static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
-
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
@@ -974,14 +1120,14 @@ buf_hash(uint64_t spa, const dva_t *dva,
return (crc);
}
-#define BUF_EMPTY(buf) \
- ((buf)->b_dva.dva_word[0] == 0 && \
- (buf)->b_dva.dva_word[1] == 0)
-
-#define BUF_EQUAL(spa, dva, birth, buf) \
- ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
- ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
- ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+#define HDR_EMPTY(hdr) \
+ ((hdr)->b_dva.dva_word[0] == 0 && \
+ (hdr)->b_dva.dva_word[1] == 0)
+
+#define HDR_EQUAL(spa, dva, birth, hdr) \
+ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
static void
buf_discard_identity(arc_buf_hdr_t *hdr)
@@ -1003,7 +1149,7 @@ buf_hash_find(uint64_t spa, const blkptr
mutex_enter(hash_lock);
for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
hdr = hdr->b_hash_next) {
- if (BUF_EQUAL(spa, dva, birth, hdr)) {
+ if (HDR_EQUAL(spa, dva, birth, hdr)) {
*lockp = hash_lock;
return (hdr);
}
@@ -1041,13 +1187,13 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmut
for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
fhdr = fhdr->b_hash_next, i++) {
- if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+ if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
return (fhdr);
}
hdr->b_hash_next = buf_hash_table.ht_table[idx];
buf_hash_table.ht_table[idx] = hdr;
- hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
/* collect some hash table performance data */
if (i > 0) {
@@ -1075,12 +1221,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
hdrp = &buf_hash_table.ht_table[idx];
while ((fhdr = *hdrp) != hdr) {
- ASSERT(fhdr != NULL);
+ ASSERT3P(fhdr, !=, NULL);
hdrp = &fhdr->b_hash_next;
}
*hdrp = hdr->b_hash_next;
hdr->b_hash_next = NULL;
- hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
/* collect some hash table performance data */
ARCSTAT_BUMPDOWN(arcstat_hash_elements);
@@ -1166,7 +1312,7 @@ hdr_full_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *hdr = vbuf;
- ASSERT(BUF_EMPTY(hdr));
+ ASSERT(HDR_EMPTY(hdr));
cv_destroy(&hdr->b_l1hdr.b_cv);
refcount_destroy(&hdr->b_l1hdr.b_refcnt);
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
@@ -1180,7 +1326,7 @@ hdr_l2only_dest(void *vbuf, void *unused
{
arc_buf_hdr_t *hdr = vbuf;
- ASSERT(BUF_EMPTY(hdr));
+ ASSERT(HDR_EMPTY(hdr));
arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}
@@ -1253,166 +1399,138 @@ retry:
}
}
-/*
- * Transition between the two allocation states for the arc_buf_hdr struct.
- * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
- * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
- * version is used when a cache buffer is only in the L2ARC in order to reduce
- * memory usage.
- */
-static arc_buf_hdr_t *
-arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
-{
- ASSERT(HDR_HAS_L2HDR(hdr));
-
- arc_buf_hdr_t *nhdr;
- l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
-
- ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
- (old == hdr_l2only_cache && new == hdr_full_cache));
-
- nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
-
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
- buf_hash_remove(hdr);
-
- bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
-
- if (new == hdr_full_cache) {
- nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
- /*
- * arc_access and arc_change_state need to be aware that a
- * header has just come out of L2ARC, so we set its state to
- * l2c_only even though it's about to change.
- */
- nhdr->b_l1hdr.b_state = arc_l2c_only;
-
- /* Verify previous threads set to NULL before freeing */
- ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
- } else {
- ASSERT(hdr->b_l1hdr.b_buf == NULL);
- ASSERT0(hdr->b_l1hdr.b_datacnt);
-
- /*
- * If we've reached here, We must have been called from
- * arc_evict_hdr(), as such we should have already been
- * removed from any ghost list we were previously on
- * (which protects us from racing with arc_evict_state),
- * thus no locking is needed during this check.
- */
- ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-
- /*
- * A buffer must not be moved into the arc_l2c_only
- * state if it's not finished being written out to the
- * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
- * might try to be accessed, even though it was removed.
- */
- VERIFY(!HDR_L2_WRITING(hdr));
- VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+#define ARC_MINTIME (hz>>4) /* 62 ms */
-#ifdef ZFS_DEBUG
- if (hdr->b_l1hdr.b_thawed != NULL) {
- kmem_free(hdr->b_l1hdr.b_thawed, 1);
- hdr->b_l1hdr.b_thawed = NULL;
- }
-#endif
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+ boolean_t shared = (buf->b_data != NULL &&
+ buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+ IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ return (shared);
+}
- nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_l1hdr.b_freeze_cksum = NULL;
}
- /*
- * The header has been reallocated so we need to re-insert it into any
- * lists it was on.
- */
- (void) buf_hash_insert(nhdr, NULL);
-
- ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
-
- mutex_enter(&dev->l2ad_mtx);
-
- /*
- * We must place the realloc'ed header back into the list at
- * the same spot. Otherwise, if it's placed earlier in the list,
- * l2arc_write_buffers() could find it during the function's
- * write phase, and try to write it out to the l2arc.
- */
- list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
- list_remove(&dev->l2ad_buflist, hdr);
-
- mutex_exit(&dev->l2ad_mtx);
-
- /*
- * Since we're using the pointer address as the tag when
- * incrementing and decrementing the l2ad_alloc refcount, we
- * must remove the old pointer (that we're about to destroy) and
- * add the new pointer to the refcount. Otherwise we'd remove
- * the wrong pointer address when calling arc_hdr_destroy() later.
- */
-
- (void) refcount_remove_many(&dev->l2ad_alloc,
- hdr->b_l2hdr.b_asize, hdr);
-
- (void) refcount_add_many(&dev->l2ad_alloc,
- nhdr->b_l2hdr.b_asize, nhdr);
-
- buf_discard_identity(hdr);
- hdr->b_freeze_cksum = NULL;
- kmem_cache_free(old, hdr);
-
- return (nhdr);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
-
-#define ARC_MINTIME (hz>>4) /* 62 ms */
-
static void
arc_cksum_verify(arc_buf_t *buf)
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
zio_cksum_t zc;
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
- mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
- if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+ fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
- mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
-static int
-arc_cksum_equal(arc_buf_t *buf)
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
{
- zio_cksum_t zc;
- int equal;
+ enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
+ boolean_t valid_cksum;
- mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
- equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
- mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+ VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
+
+ /*
+ * We rely on the blkptr's checksum to determine if the block
+ * is valid or not. When compressed arc is enabled, the l2arc
+ * writes the block to the l2arc just as it appears in the pool.
+ * This allows us to use the blkptr's checksum to validate the
+ * data that we just read off of the l2arc without having to store
+ * a separate checksum in the arc_buf_hdr_t. However, if compressed
+ * arc is disabled, then the data written to the l2arc is always
+ * uncompressed and won't match the block as it exists in the main
+ * pool. When this is the case, we must first compress it if it is
+ * compressed on the main pool before we can validate the checksum.
+ */
+ if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t csize;
+
+ void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
+ csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
+ if (csize < HDR_GET_PSIZE(hdr)) {
+ /*
+ * Compressed blocks are always a multiple of the
+ * smallest ashift in the pool. Ideally, we would
+ * like to round up the csize to the next
+ * spa_min_ashift but that value may have changed
+ * since the block was last written. Instead,
+ * we rely on the fact that the hdr's psize
+ * was set to the psize of the block when it was
+ * last written. We set the csize to that value
+ * and zero out any part that should not contain
+ * data.
+ */
+ bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
+ csize = HDR_GET_PSIZE(hdr);
+ }
+ zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
+ }
- return (equal);
+ /*
+ * Block pointers always store the checksum for the logical data.
+ * If the block pointer has the gang bit set, then the checksum
+ * it represents is for the reconstituted data and not for an
+ * individual gang member. The zio pipeline, however, must be able to
+ * determine the checksum of each of the gang constituents so it
+ * treats the checksum comparison differently than what we need
+ * for l2arc blocks. This prevents us from using the
+ * zio_checksum_error() interface directly. Instead we must call the
+ * zio_checksum_error_impl() so that we can ensure the checksum is
+ * generated using the correct checksum algorithm and accounts for the
+ * logical I/O size and not just a gang fragment.
+ */
+ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+ BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+ zio->io_offset, NULL) == 0);
+ zio_pop_transforms(zio);
+ return (valid_cksum);
}
static void
-arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+arc_cksum_compute(arc_buf_t *buf)
{
- if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum != NULL) {
- mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
- NULL, buf->b_hdr->b_freeze_cksum);
- mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_SLEEP);
+ fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+ hdr->b_l1hdr.b_freeze_cksum);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
arc_buf_watch(buf);
}
@@ -1451,7 +1569,7 @@ arc_buf_watch(arc_buf_t *buf)
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = buf->b_hdr->b_size;
+ ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
ctl.prwatch.pr_wflags = WA_WRITE;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
@@ -1462,11 +1580,14 @@ arc_buf_watch(arc_buf_t *buf)
static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t *hdr)
{
+ arc_buf_contents_t type;
if (HDR_ISTYPE_METADATA(hdr)) {
- return (ARC_BUFC_METADATA);
+ type = ARC_BUFC_METADATA;
} else {
- return (ARC_BUFC_DATA);
+ type = ARC_BUFC_DATA;
}
+ VERIFY3U(hdr->b_type, ==, type);
+ return (type);
}
static uint32_t
@@ -1488,29 +1609,29 @@ arc_bufc_to_flags(arc_buf_contents_t typ
void
arc_buf_thaw(arc_buf_t *buf)
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
+ if (hdr->b_l1hdr.b_state != arc_anon)
panic("modifying non-anon buffer!");
- if (HDR_IO_IN_PROGRESS(buf->b_hdr))
+ if (HDR_IO_IN_PROGRESS(hdr))
panic("modifying buffer while i/o in progress!");
arc_cksum_verify(buf);
}
- mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum != NULL) {
- kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
- buf->b_hdr->b_freeze_cksum = NULL;
- }
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ arc_cksum_free(hdr);
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
#ifdef ZFS_DEBUG
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
- kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
- buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
+ if (hdr->b_l1hdr.b_thawed != NULL)
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
}
#endif
- mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
arc_buf_unwatch(buf);
}
@@ -1518,53 +1639,246 @@ arc_buf_thaw(arc_buf_t *buf)
void
arc_buf_freeze(arc_buf_t *buf)
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
kmutex_t *hash_lock;
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- hash_lock = HDR_LOCK(buf->b_hdr);
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
- ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
- buf->b_hdr->b_l1hdr.b_state == arc_anon);
- arc_cksum_compute(buf, B_FALSE);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
+ hdr->b_l1hdr.b_state == arc_anon);
+ arc_cksum_compute(buf);
mutex_exit(hash_lock);
}
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags &= ~flags;
+}
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in
+ * thread-safe manner.
+ */
+static void
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * Holes and embedded blocks will always have a psize = 0 so
+ * we ignore the compression of the blkptr and set the
+ * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
+ * Holes and embedded blocks remain anonymous so we don't
+ * want to uncompress them. Mark them as uncompressed.
+ */
+ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, cmp);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+ ASSERT(HDR_COMPRESSION_ENABLED(hdr));
+ }
+}
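As a usage note (the call site below is hypothetical; the real ones are in the hdr allocation and read paths later in the file), a freshly allocated hdr typically takes its compression setting straight from the block pointer it was read with:

	/* illustrative only */
	arc_hdr_set_compress(hdr, BP_GET_COMPRESS(bp));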
+
+static int
+arc_decompress(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+ int error;
+
+ if (arc_buf_is_shared(buf)) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+ /*
+ * The arc_buf_hdr_t is either not compressed or is
+ * associated with an embedded block or a hole in which
+ * case they remain anonymous.
+ */
+ IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
+ HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+ } else {
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+ error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr));
+ if (error != 0) {
+ zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
+ hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr));
+ return (SET_ERROR(EIO));
+ }
+ }
+ if (bswap != DMU_BSWAP_NUMFUNCS) {
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***